123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455 |
- package handler
- import (
- "encoding/json"
- "errors"
- "fmt"
- amodels "go-admin/app/admin/models"
- omodels "go-admin/app/observe/models"
- "log"
- "sync"
- "time"
- "github.com/prometheus/client_golang/prometheus"
- "github.com/go-admin-team/go-admin-core/logger"
- "gorm.io/gorm"
- )
- type EventHandler struct {
- RP *omodels.OtRulesPolicy `json:"result_policy"`
- AC *omodels.AlertCondition `json:"alert_condition"`
- JR *omodels.JudgeResult `json:"judge_result"`
- Emo *omodels.Events
- promMap *sync.Map
- SmsInfo *SmsConfig
- CheckStartTime time.Time `json:"check_starttime"`
- Errs map[string]error `json:"_"`
- }
// SmsConfig groups the settings used for SMS notification.
type SmsConfig struct {
	// Appkey is presumably the application key for the SMS service; supply it
	// through configuration rather than source code.
	Appkey string
	// Appsecret is presumably the secret paired with Appkey; supply it
	// through configuration rather than source code.
	Appsecret string
	// Url is the SMS service endpoint.
	Url string `json:"url"`
	// AppsGroup maps an application key to a list of strings, presumably
	// recipient phone numbers, e.g.
	// {"app_1": ["<number>", "<number>"], "app_2": ["<number>"]}.
	AppsGroup map[string][]string
	// SmsTpl is the message template.
	SmsTpl string
	// SQLRecord is a boolean switch; presumably it enables recording sent
	// messages via SQL — TODO confirm against the sender code.
	SQLRecord bool
}
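/*
row:  a single value. Its presence alone does not indicate an anomaly; it must
      be compared with the condition's point. When the comparison matches, an
      exception event is written immediately, but an alert is not necessarily
      raised: that also depends on the trigger frequency and the interval.
--{this check has been cancelled} rows: a list value. Its presence indicates an
      anomaly, so an exception event is written immediately, but an alert is
      not necessarily raised: that also depends on the trigger frequency and
      the interval.
*/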
- func InitEventHandler(pMap *sync.Map, smsInfo *SmsConfig) *EventHandler {
- e := new(EventHandler)
- e.RP = new(omodels.OtRulesPolicy)
- e.AC = new(omodels.AlertCondition)
- e.JR = new(omodels.JudgeResult)
- e.SmsInfo = smsInfo
- e.JR.AlertStatus = INACTIVE
- //eg:@@UNSET_apdex::health:5m_{"condition":"<","point":0.7,"point_type":"float","tigger_hz":3,"interval":5}
- e.Errs = make(map[string]error)
- e.promMap = pMap
- e.CheckStartTime = time.Now()
- if smsInfo == nil {
- e.Errs["smsinf error"] = errors.New("smsinfo 解析错误")
- }
- return e
- }
- func (e *EventHandler) JudgeRow() *EventHandler {
- switch e.RP.RuleValueType {
- case "rows":
- e.JR.CompareV = len(e.JR.RowsResult)
- //rows 类型出现则表明符合异常
- if len(e.JR.RowsResult) > 0 {
- e.JR.IsException = true
- e.JR.AlertStatus = PENDING
- }
- /*
- 不处理rows类型
- 取消该类型的解析,在prometheus中进行数量判断及相关收敛规则
- */
- case "row":
- e.JR.CompareV = e.JR.RowResult
- default:
- e.JR.CompareV = e.JR.RowGaugeResult
- }
- logger.Debug("compareV is: ", e.JR.CompareV)
- return e
- }
- func (e *EventHandler) PointCompare() *EventHandler {
- if e.JR.IsException {
- return e
- }
- logger.Debugf("%v %s %v", e.JR.CompareV, e.AC.Condition, e.AC.Point)
- switch e.AC.Condition {
- case ">":
- e.JR.IsException = e.g(e.JR.CompareV, e.AC.Point, e.AC.PointType)
- case ">=":
- e.JR.IsException = e.ge(e.JR.CompareV, e.AC.Point, e.AC.PointType)
- case "<":
- e.JR.IsException = e.l(e.JR.CompareV, e.AC.Point, e.AC.PointType)
- case "<=":
- e.JR.IsException = e.le(e.JR.CompareV, e.AC.Point, e.AC.PointType)
- case "=", "==":
- e.JR.IsException = e.e(e.JR.CompareV, e.AC.Point, e.AC.PointType)
- case "!=", "<>", "><":
- e.JR.IsException = e.ne(e.JR.CompareV, e.AC.Point, e.AC.PointType)
- }
- return e
- }
- func (e *EventHandler) JudgeTriggerHz() *EventHandler {
- //仅做第一次标注,此处并非直接控制告警
- if e.JR.IsException {
- if e.AC.TiggerHz == 0 {
- //@@对应事件:x分钟内出现1次,立即告警
- e.JR.AlertStatus = FIRING //立即告警, 记录监控事件,event
- } else {
- e.JR.AlertStatus = PENDING //出现异常,记录监控事件 event,clickhouse TODO:
- }
- if e.RP.RuleValueType == "rows" && len(e.JR.RowsResult) >= e.AC.TiggerHz {
- e.JR.AlertStatus = FIRING
- }
- }
- return e
- }
- /*
- SELECT AppendTime
- FROM otel_events
- WHERE AlertStatus = 4 AND UID = 'UNSET_2'
- ORDER BY AppendTime DESC
- LIMIT 1
- */
- func (e *EventHandler) JudgeInterval(chdb *gorm.DB) *EventHandler {
- if e.JR.AlertStatus == FIRING {
- return e
- }
- switch e.RP.RuleValueType {
- // case "row":
- default:
- if !e.JR.IsException {
- return e
- }
- //周期内数值异常检测结果为pending的总次数 (AlertStatus = 2)
- ft := e.RP.CreateTime.Add(-time.Duration(e.AC.Interval) * time.Minute)
- var err error
- emo := omodels.Events{}
- var timer time.Time
- var count int64
- err = chdb.Table(emo.TableName()).
- Raw(`SELECT COUNT(*)
- FROM otel_events
- WHERE AlertStatus = ? AND UID = ? AND Timestamp >= ?`, FIRING, e.JR.UID, ft).
- Row().Scan(&count)
- if err != nil {
- logger.Error("select fire count err: ", err)
- e.Errs["select fire count err"] = err
- return e
- }
- // if count == 0 {
- // // ft = timer
- // timer = ft
- // } else
- if count != 0 {
- err = chdb.Table(emo.TableName()).
- Raw(`SELECT AppendTime
- FROM otel_events
- WHERE AlertStatus = ? AND UID = ?
- ORDER BY AppendTime DESC
- LIMIT 1`, FIRING, e.JR.UID).
- Row().Scan(&timer)
- if err != nil {
- logger.Error("alert status 4 row query err: ", err)
- e.Errs["alert status 4 row"] = err
- return e
- }
- if timer.After(ft) || timer.Equal(ft) {
- ft = timer
- }
- }
- rows, err := chdb.Table(emo.TableName()).
- Raw(`SELECT *
- FROM otel_events
- WHERE AlertStatus = ? AND UID = ? AND AppendTime > ?
- ORDER BY AppendTime DESC`,
- PENDING, e.JR.UID, ft).
- Rows()
- if err != nil {
- logger.Error("rows query err: ", err)
- e.Errs["Get events rows err"] = err
- }
- elist := make([]omodels.Events, 0)
- defer rows.Close()
- er := omodels.Events{}
- for rows.Next() {
- if err := chdb.ScanRows(rows, &er); err != nil {
- logger.Errorf("range alert rows scan err: %s", err)
- e.Errs["range alert rows scan err"] = err
- }
- elist = append(elist, er)
- }
- if len(elist) >= e.AC.TiggerHz {
- e.JR.IsException = true
- e.JR.AlertStatus = FIRING
- } else {
- e.JR.IsException = true
- e.JR.AlertStatus = PENDING
- return e
- }
- case "rows":
- //rows 立即告警
- // if e.JR.IsException {
- // e.JR.AlertStatus = FIRING
- // }
- return e
- }
- return e
- }
- func (e *EventHandler) SetUID() *EventHandler {
- // logger.Debug("UUUUUUid: ", fmt.Sprintf("%s_%s_%s", e.RP.AppAlias, e.RP.RuleMonitorAlias, e.RP.Policy))
- // e.JR.UID = fmt.Sprintf("%s_%d", e.RP.AppAlias, e.RP.Id)
- e.JR.UID = fmt.Sprintf("%s", e.RP.AppAlias)
- return e
- }
- func (e *EventHandler) RegisterPrometheusGauge() *EventHandler {
- gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
- Name: e.RP.RuleMonitorAlias,
- Help: e.RP.RuleName,
- }, []string{"app_alias", "kind", "uid"})
- if err := prometheus.Register(gauge); err != nil {
- if err.Error() == "duplicate metrics collector registration attempted" {
- logger.Errorf("Failed to register gauge: %v", err)
- logger.Infof("promMap: %v", e.promMap)
- return e
- } else {
- log.Fatalf("Failed to register gauge: %v", err)
- }
- }
- e.promMap.Store(e.RP.RuleMonitorAlias, gauge)
- return e
- }
- func (e *EventHandler) SetPromKV() *EventHandler {
- e.GetRegisteredGauge(e.RP.RuleMonitorAlias).WithLabelValues(
- e.RP.AppAlias, e.RP.RuleKind, e.JR.UID,
- ).Set(float64(e.JR.RowGaugeResult))
- logger.Debug("e.JR.RowGaugeResult: ", e.JR.RowGaugeResult)
- return e
- }
- // checkCollectorExists 检查注册表中是否存在具有给定名称的collector
- func CheckCollectorExists(reg *prometheus.Registry, name string) error {
- merticF, err := reg.Gather()
- if err != nil {
- logger.Errorf("checkCollector, error: %s", err.Error())
- }
- for _, m := range merticF {
- fmt.Println("metricFamily: ", m)
- }
- return nil
- }
- func (e *EventHandler) GetRegisteredGauge(gaugeName string) *prometheus.GaugeVec {
- logger.Debugf("guageName: %s", gaugeName)
- logger.Debug(e.promMap)
- promk, ok := e.promMap.Load(gaugeName)
- if ok {
- logger.Debug("has guage name")
- return promk.(*prometheus.GaugeVec)
- } else {
- e.RegisterPrometheusGauge()
- logger.Debug("recreate prometh guagename")
- return e.GetRegisteredGauge(gaugeName)
- }
- }
- func (e *EventHandler) CreateEventRecord(chdb *gorm.DB) *EventHandler {
- if e.JR.IsException {
- row := "UNSET"
- rows := make([]map[string]string, 0)
- switch e.RP.RuleValueType {
- case "row":
- row = fmt.Sprintf("%v", e.JR.RowResult)
- case "rows":
- for _, d := range e.JR.RowsResult {
- tmp := make(map[string]string)
- for k, v := range d {
- tmp[k] = fmt.Sprintf("%v", v)
- }
- rows = append(rows, tmp)
- }
- default:
- row = fmt.Sprintf("%v", e.JR.RowGaugeResult)
- }
- var exceptionName string
- if e.RP.RuleValueType == "rows" {
- if e.JR.AlertStatus == PENDING {
- //业务状态码异常数(近30分钟), x分钟内发生x次
- exceptionName = fmt.Sprintf("%s, %d分钟内发生: %v次",
- e.RP.RuleName, e.AC.Interval, e.JR.CompareV)
- } else if e.JR.AlertStatus == FIRING || e.JR.AlertStatus == SENDSUCCESS {
- exceptionName = fmt.Sprintf("%s, %d分钟内发生%s%v次, 当前值为: %v",
- e.RP.RuleName, e.AC.Interval, e.AC.Condition, e.JR.CompareV, e.JR.CompareV)
- }
- } else {
- exceptionName = fmt.Sprintf("%s%s %v, 当前值为: %v",
- e.RP.RuleName, e.AC.Condition, e.AC.Point, e.JR.CompareV)
- }
- emo := omodels.Events{
- Timestamp: time.Now(),
- AppendTime: e.RP.CreateTime,
- UID: e.JR.UID,
- AppID: int64(e.RP.AppId),
- RuleID: int64(e.RP.RuleId),
- AppName: e.RP.AppName,
- AppAlias: e.RP.AppAlias,
- ExceptionName: exceptionName,
- RuleInfo: map[string]string{
- "policy": e.RP.Policy,
- "rule_kind": e.RP.RuleKind,
- "rule_group": e.RP.RuleGroup,
- "rule_data_source": e.RP.RuleDataSource,
- "rule_table": e.RP.RuleTable,
- "rule_value_type": e.RP.RuleValueType,
- },
- CompareV: fmt.Sprintf("%v", e.JR.CompareV),
- RowResult: row,
- RowsResult: rows,
- AlertStatus: int64(e.JR.AlertStatus),
- }
- // logger.Debug("emo: ", emo)
- if err := chdb.Table(emo.TableName()).Create(emo).Error; err != nil {
- e.Errs["CreateEventRecord_func(Create)"] = err
- logger.Error("create emos err:", err.Error())
- }
- //发生告警,生成告警数据
- if e.JR.AlertStatus >= FIRING {
- e.Emo = &emo
- }
- }
- return e
- }
- func (e *EventHandler) CreateAlert(db *gorm.DB) *EventHandler {
- if e.JR.AlertStatus >= FIRING {
- //TODO: 发送至告警队列
- fe := amodels.OtFireEvents{
- AppId: e.Emo.AppID,
- RuleId: e.Emo.RuleID,
- ExceptionName: e.Emo.ExceptionName,
- RecordTime: e.Emo.Timestamp,
- AppendTime: e.Emo.AppendTime,
- Uid: e.Emo.UID,
- AppName: e.Emo.AppName,
- AppAlias: e.Emo.AppAlias,
- RowResult: fmt.Sprintf("%v", e.Emo.RowResult),
- AlertStatus: e.Emo.AlertStatus,
- IsKnow: 0,
- IsResolve: 0,
- IsIgnore: 0,
- }
- var err error
- rif, err := json.Marshal(e.Emo.RuleInfo)
- if err != nil {
- logger.Error("Marshal rule_info err: ", err)
- e.Errs["Marshal rule_info err"] = err
- }
- fe.RuleInfo = string(rif)
- rowst, err := json.Marshal(e.Emo.RowsResult)
- if err != nil {
- logger.Error("Marshal rows_result err: ", err)
- e.Errs["Marshal rows_result err"] = err
- }
- fe.RowsResult = string(rowst)
- if err = db.Table(fe.TableName()).Create(&fe).Error; err != nil {
- logger.Error("Create ot_fire_event err: ", err)
- e.Errs["Create ot_fire_event err"] = err
- }
- }
- return e
- }
- func (e *EventHandler) assertType(v, point interface{}, t string) {}
- // >
- func (e *EventHandler) g(v, point interface{}, t string) bool {
- switch t {
- case "float":
- return v.(float64) > point.(float64)
- case "int":
- return v.(int) > int(point.(float64))
- }
- return false
- }
- // >=
- func (e *EventHandler) ge(v, point interface{}, t string) bool {
- switch t {
- case "float":
- return v.(float64) >= point.(float64)
- case "int":
- return v.(int) >= int(point.(float64))
- }
- return false
- }
- // <
- func (e *EventHandler) l(v, point interface{}, t string) bool {
- switch t {
- case "float":
- return v.(float64) < point.(float64)
- case "int":
- return v.(int) < int(point.(float64))
- }
- return false
- }
- // <=
- func (e *EventHandler) le(v, point interface{}, t string) bool {
- switch t {
- case "float":
- return v.(float64) <= point.(float64)
- case "int":
- return v.(int) <= int(point.(float64))
- }
- return false
- }
- // ==
- func (e *EventHandler) e(v, point interface{}, t string) bool {
- switch t {
- case "float":
- return v.(float64) == point.(float64)
- case "int":
- return v.(int) == int(point.(float64))
- case "string":
- return v.(string) == point.(string)
- }
- return false
- }
- // !=
- func (e *EventHandler) ne(v, point interface{}, t string) bool {
- switch t {
- case "float":
- return v.(float64) != point.(float64)
- case "int":
- return v.(int) != int(point.(float64))
- case "string":
- return v.(string) != point.(string)
- }
- return false
- }
|