events.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461
  1. package handler
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. amodels "go-admin/app/admin/models"
  7. omodels "go-admin/app/observe/models"
  8. "log"
  9. "sync"
  10. "time"
  11. "github.com/prometheus/client_golang/prometheus"
  12. "github.com/go-admin-team/go-admin-core/logger"
  13. "gorm.io/gorm"
  14. )
// EventHandler carries the state of a single rule evaluation through the
// chained methods defined on it (JudgeRow, PointCompare, JudgeTriggerHz,
// JudgeInterval, CreateEventRecord, CreateAlert, ...). Each step reads and
// updates the fields below and returns the same handler.
type EventHandler struct {
	// RP is the rule/policy being evaluated (value type, aliases, create time, ...).
	RP *omodels.OtRulesPolicy `json:"result_policy"`
	// AC holds the alert condition: comparison operator, point, point type,
	// trigger frequency and interval.
	AC *omodels.AlertCondition `json:"alert_condition"`
	// JR accumulates the evaluation outcome (compare value, exception flag,
	// alert status, UID).
	JR *omodels.JudgeResult `json:"judge_result"`
	// Emo is set by CreateEventRecord when the alert status is at least FIRING
	// and is later read by CreateAlert.
	Emo *omodels.Events
	// promMap caches registered Prometheus gauge vectors keyed by metric name.
	promMap *sync.Map
	// SmsInfo is the SMS configuration handed to InitEventHandler.
	SmsInfo *SmsConfig
	// CheckStartTime is the time at which InitEventHandler created the handler.
	CheckStartTime time.Time `json:"check_starttime"`
	// Errs collects errors raised by the individual steps, keyed by a short label.
	// NOTE(review): the tag "_" serialises this field under the JSON key "_";
	// "-" was probably intended to omit it — confirm before changing.
	Errs map[string]error `json:"_"`
}
// SmsConfig groups the settings stored on EventHandler.SmsInfo. Nothing in
// this part of the file reads the individual fields, so the descriptions
// below are inferred from the field names — TODO confirm against the sender.
type SmsConfig struct {
	Appkey    string // presumably the application key for the SMS gateway; keep real values out of source
	Appsecret string // presumably the application secret for the SMS gateway; keep real values out of source
	Url       string `json:"url"`
	AppsGroup map[string][]string // presumably app alias -> list of phone numbers, e.g. {"app_1": ["<phone>", "<phone>"]}
	SmsTpl    string
	SQLRecord bool
}
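/*
row:  a single value. Its presence alone does not mean an anomaly; it has to be
      compared with the configured point. When the comparison matches, an
      exception event is written immediately, but that does not necessarily
      raise an alert — frequency and interval are evaluated as well.
rows: (detection for this type has been cancelled) a list value. Its presence
      means an anomaly and is written to exception-event monitoring
      immediately, but that does not necessarily raise an alert — frequency and
      interval are evaluated as well.
*/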
  37. func InitEventHandler(pMap *sync.Map, smsInfo *SmsConfig) *EventHandler {
  38. e := new(EventHandler)
  39. e.RP = new(omodels.OtRulesPolicy)
  40. e.AC = new(omodels.AlertCondition)
  41. e.JR = new(omodels.JudgeResult)
  42. e.SmsInfo = smsInfo
  43. e.JR.AlertStatus = INACTIVE
  44. //eg:@@UNSET_apdex::health:5m_{"condition":"<","point":0.7,"point_type":"float","tigger_hz":3,"interval":5}
  45. e.Errs = make(map[string]error)
  46. e.promMap = pMap
  47. e.CheckStartTime = time.Now()
  48. if smsInfo == nil {
  49. e.Errs["smsinf error"] = errors.New("smsinfo 解析错误")
  50. }
  51. return e
  52. }
  53. func (e *EventHandler) JudgeRow() *EventHandler {
  54. switch e.RP.RuleValueType {
  55. case "rows":
  56. e.JR.CompareV = len(e.JR.RowsResult)
  57. //rows 类型出现则表明符合异常
  58. if len(e.JR.RowsResult) > 0 {
  59. e.JR.IsException = true
  60. e.JR.AlertStatus = PENDING
  61. }
  62. /*
  63. 不处理rows类型
  64. 取消该类型的解析,在prometheus中进行数量判断及相关收敛规则
  65. */
  66. case "row":
  67. e.JR.CompareV = e.JR.RowResult
  68. default:
  69. e.JR.CompareV = e.JR.RowGaugeResult
  70. }
  71. logger.Debug("compareV is: ", e.JR.CompareV)
  72. return e
  73. }
  74. func (e *EventHandler) PointCompare() *EventHandler {
  75. if e.JR.IsException {
  76. return e
  77. }
  78. logger.Debugf("%v %s %v", e.JR.CompareV, e.AC.Condition, e.AC.Point)
  79. switch e.AC.Condition {
  80. case ">":
  81. e.JR.IsException = e.g(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  82. case ">=":
  83. e.JR.IsException = e.ge(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  84. case "<":
  85. e.JR.IsException = e.l(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  86. case "<=":
  87. e.JR.IsException = e.le(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  88. case "=", "==":
  89. e.JR.IsException = e.e(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  90. case "!=", "<>", "><":
  91. e.JR.IsException = e.ne(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  92. }
  93. return e
  94. }
  95. func (e *EventHandler) JudgeTriggerHz() *EventHandler {
  96. //仅做第一次标注,此处并非直接控制告警
  97. if e.JR.IsException {
  98. if e.AC.TiggerHz == 0 {
  99. //@@对应事件:x分钟内出现1次,立即告警
  100. e.JR.AlertStatus = FIRING //立即告警, 记录监控事件,event
  101. } else {
  102. e.JR.AlertStatus = PENDING //出现异常,记录监控事件 event,clickhouse TODO:
  103. }
  104. if e.RP.RuleValueType == "rows" && len(e.JR.RowsResult) >= e.AC.TiggerHz {
  105. e.JR.AlertStatus = FIRING
  106. }
  107. }
  108. return e
  109. }
  110. /*
  111. SELECT AppendTime
  112. FROM otel_events
  113. WHERE AlertStatus = 4 AND UID = 'UNSET_2'
  114. ORDER BY AppendTime DESC
  115. LIMIT 1
  116. */
  117. func (e *EventHandler) JudgeInterval(chdb *gorm.DB) *EventHandler {
  118. if e.JR.AlertStatus == FIRING {
  119. return e
  120. }
  121. switch e.RP.RuleValueType {
  122. // case "row":
  123. default:
  124. if !e.JR.IsException {
  125. return e
  126. }
  127. var err error
  128. //周期内数值异常检测结果为pending的总次数 (AlertStatus = 2)
  129. location, err := time.LoadLocation("Asia/Shanghai")
  130. if err != nil {
  131. logger.Error("Error loading location: ", err)
  132. e.Errs["Error loading location"] = err
  133. return e
  134. }
  135. ft := e.RP.CreateTime.In(location).Add(-time.Duration(e.AC.Interval) * time.Minute)
  136. emo := omodels.Events{}
  137. var timer time.Time
  138. var count int64
  139. err = chdb.Table(emo.TableName()).
  140. Raw(`SELECT COUNT(*)
  141. FROM otel_events
  142. WHERE AlertStatus >= ? AND UID = ? AND Timestamp >= ?`, FIRING, e.JR.UID, ft).
  143. Row().Scan(&count)
  144. if err != nil {
  145. logger.Error("select fire count err: ", err)
  146. e.Errs["select fire count err"] = err
  147. return e
  148. }
  149. // if count == 0 {
  150. // // ft = timer
  151. // timer = ft
  152. // } else
  153. if count != 0 {
  154. err = chdb.Table(emo.TableName()).
  155. Raw(`SELECT AppendTime
  156. FROM otel_events
  157. WHERE AlertStatus >= ? AND UID = ?
  158. ORDER BY AppendTime DESC
  159. LIMIT 1`, FIRING, e.JR.UID).
  160. Row().Scan(&timer)
  161. if err != nil {
  162. logger.Error("alert status 4 row query err: ", err)
  163. e.Errs["alert status 4 row"] = err
  164. return e
  165. }
  166. if timer.After(ft) || timer.Equal(ft) {
  167. ft = timer
  168. }
  169. }
  170. rows, err := chdb.Table(emo.TableName()).
  171. Raw(`SELECT *
  172. FROM otel_events
  173. WHERE AlertStatus = ? AND UID = ? AND AppendTime > ?
  174. ORDER BY AppendTime DESC`,
  175. PENDING, e.JR.UID, ft).
  176. Rows()
  177. if err != nil {
  178. logger.Error("rows query err: ", err)
  179. e.Errs["Get events rows err"] = err
  180. }
  181. elist := make([]omodels.Events, 0)
  182. defer rows.Close()
  183. er := omodels.Events{}
  184. for rows.Next() {
  185. if err := chdb.ScanRows(rows, &er); err != nil {
  186. logger.Errorf("range alert rows scan err: %s", err)
  187. e.Errs["range alert rows scan err"] = err
  188. }
  189. elist = append(elist, er)
  190. }
  191. if len(elist) >= e.AC.TiggerHz {
  192. e.JR.IsException = true
  193. e.JR.AlertStatus = FIRING
  194. } else {
  195. e.JR.IsException = true
  196. e.JR.AlertStatus = PENDING
  197. return e
  198. }
  199. case "rows":
  200. //rows 立即告警
  201. // if e.JR.IsException {
  202. // e.JR.AlertStatus = FIRING
  203. // }
  204. return e
  205. }
  206. return e
  207. }
  208. func (e *EventHandler) SetUID() *EventHandler {
  209. // logger.Debug("UUUUUUid: ", fmt.Sprintf("%s_%s_%s", e.RP.AppAlias, e.RP.RuleMonitorAlias, e.RP.Policy))
  210. // e.JR.UID = fmt.Sprintf("%s_%d", e.RP.AppAlias, e.RP.Id)
  211. e.JR.UID = fmt.Sprintf("%s", e.RP.AppAlias)
  212. return e
  213. }
  214. func (e *EventHandler) RegisterPrometheusGauge() *EventHandler {
  215. gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  216. Name: e.RP.RuleMonitorAlias,
  217. Help: e.RP.RuleName,
  218. }, []string{"app_alias", "kind", "uid"})
  219. if err := prometheus.Register(gauge); err != nil {
  220. if err.Error() == "duplicate metrics collector registration attempted" {
  221. logger.Errorf("Failed to register gauge: %v", err)
  222. logger.Infof("promMap: %v", e.promMap)
  223. return e
  224. } else {
  225. log.Fatalf("Failed to register gauge: %v", err)
  226. }
  227. }
  228. e.promMap.Store(e.RP.RuleMonitorAlias, gauge)
  229. return e
  230. }
  231. func (e *EventHandler) SetPromKV() *EventHandler {
  232. e.GetRegisteredGauge(e.RP.RuleMonitorAlias).WithLabelValues(
  233. e.RP.AppAlias, e.RP.RuleKind, e.JR.UID,
  234. ).Set(float64(e.JR.RowGaugeResult))
  235. logger.Debug("e.JR.RowGaugeResult: ", e.JR.RowGaugeResult)
  236. return e
  237. }
  238. // checkCollectorExists 检查注册表中是否存在具有给定名称的collector
  239. func CheckCollectorExists(reg *prometheus.Registry, name string) error {
  240. merticF, err := reg.Gather()
  241. if err != nil {
  242. logger.Errorf("checkCollector, error: %s", err.Error())
  243. }
  244. for _, m := range merticF {
  245. fmt.Println("metricFamily: ", m)
  246. }
  247. return nil
  248. }
  249. func (e *EventHandler) GetRegisteredGauge(gaugeName string) *prometheus.GaugeVec {
  250. logger.Debugf("guageName: %s", gaugeName)
  251. logger.Debug(e.promMap)
  252. promk, ok := e.promMap.Load(gaugeName)
  253. if ok {
  254. logger.Debug("has guage name")
  255. return promk.(*prometheus.GaugeVec)
  256. } else {
  257. e.RegisterPrometheusGauge()
  258. logger.Debug("recreate prometh guagename")
  259. return e.GetRegisteredGauge(gaugeName)
  260. }
  261. }
  262. func (e *EventHandler) CreateEventRecord(chdb *gorm.DB) *EventHandler {
  263. if e.JR.IsException {
  264. row := "UNSET"
  265. rows := make([]map[string]string, 0)
  266. switch e.RP.RuleValueType {
  267. case "row":
  268. row = fmt.Sprintf("%v", e.JR.RowResult)
  269. case "rows":
  270. for _, d := range e.JR.RowsResult {
  271. tmp := make(map[string]string)
  272. for k, v := range d {
  273. tmp[k] = fmt.Sprintf("%v", v)
  274. }
  275. rows = append(rows, tmp)
  276. }
  277. default:
  278. row = fmt.Sprintf("%v", e.JR.RowGaugeResult)
  279. }
  280. var exceptionName string
  281. if e.RP.RuleValueType == "rows" {
  282. if e.JR.AlertStatus == PENDING {
  283. //业务状态码异常数(近30分钟), x分钟内发生x次
  284. exceptionName = fmt.Sprintf("%s, %d分钟内发生: %v次",
  285. e.RP.RuleName, e.AC.Interval, e.JR.CompareV)
  286. } else if e.JR.AlertStatus == FIRING || e.JR.AlertStatus == SENDSUCCESS {
  287. exceptionName = fmt.Sprintf("%s, %d分钟内发生%s%v次, 当前值为: %v",
  288. e.RP.RuleName, e.AC.Interval, e.AC.Condition, e.JR.CompareV, e.JR.CompareV)
  289. }
  290. } else {
  291. exceptionName = fmt.Sprintf("%s%s %v, 当前值为: %v",
  292. e.RP.RuleName, e.AC.Condition, e.AC.Point, e.JR.CompareV)
  293. }
  294. emo := omodels.Events{
  295. Timestamp: time.Now(),
  296. AppendTime: e.RP.CreateTime,
  297. UID: e.JR.UID,
  298. AppID: int64(e.RP.AppId),
  299. RuleID: int64(e.RP.RuleId),
  300. AppName: e.RP.AppName,
  301. AppAlias: e.RP.AppAlias,
  302. ExceptionName: exceptionName,
  303. RuleInfo: map[string]string{
  304. "policy": e.RP.Policy,
  305. "rule_kind": e.RP.RuleKind,
  306. "rule_group": e.RP.RuleGroup,
  307. "rule_data_source": e.RP.RuleDataSource,
  308. "rule_table": e.RP.RuleTable,
  309. "rule_value_type": e.RP.RuleValueType,
  310. },
  311. CompareV: fmt.Sprintf("%v", e.JR.CompareV),
  312. RowResult: row,
  313. RowsResult: rows,
  314. AlertStatus: int64(e.JR.AlertStatus),
  315. }
  316. // logger.Debug("emo: ", emo)
  317. if err := chdb.Table(emo.TableName()).Create(emo).Error; err != nil {
  318. e.Errs["CreateEventRecord_func(Create)"] = err
  319. logger.Error("create emos err:", err.Error())
  320. }
  321. //发生告警,生成告警数据
  322. if e.JR.AlertStatus >= FIRING {
  323. e.Emo = &emo
  324. }
  325. }
  326. return e
  327. }
  328. func (e *EventHandler) CreateAlert(db *gorm.DB) *EventHandler {
  329. if e.JR.AlertStatus >= FIRING {
  330. //TODO: 发送至告警队列
  331. fe := amodels.OtFireEvents{
  332. AppId: e.Emo.AppID,
  333. RuleId: e.Emo.RuleID,
  334. ExceptionName: e.Emo.ExceptionName,
  335. RecordTime: e.Emo.Timestamp,
  336. AppendTime: e.Emo.AppendTime,
  337. Uid: e.Emo.UID,
  338. AppName: e.Emo.AppName,
  339. AppAlias: e.Emo.AppAlias,
  340. RowResult: fmt.Sprintf("%v", e.Emo.RowResult),
  341. AlertStatus: e.Emo.AlertStatus,
  342. IsKnow: 0,
  343. IsResolve: 0,
  344. IsIgnore: 0,
  345. }
  346. var err error
  347. rif, err := json.Marshal(e.Emo.RuleInfo)
  348. if err != nil {
  349. logger.Error("Marshal rule_info err: ", err)
  350. e.Errs["Marshal rule_info err"] = err
  351. }
  352. fe.RuleInfo = string(rif)
  353. rowst, err := json.Marshal(e.Emo.RowsResult)
  354. if err != nil {
  355. logger.Error("Marshal rows_result err: ", err)
  356. e.Errs["Marshal rows_result err"] = err
  357. }
  358. fe.RowsResult = string(rowst)
  359. if err = db.Table(fe.TableName()).Create(&fe).Error; err != nil {
  360. logger.Error("Create ot_fire_event err: ", err)
  361. e.Errs["Create ot_fire_event err"] = err
  362. }
  363. }
  364. return e
  365. }
// assertType is an empty placeholder: it accepts a value, a point and a type
// name but performs no work. Nothing in the visible part of this file calls it.
func (e *EventHandler) assertType(v, point interface{}, t string) {}
  367. // >
  368. func (e *EventHandler) g(v, point interface{}, t string) bool {
  369. switch t {
  370. case "float":
  371. return v.(float64) > point.(float64)
  372. case "int":
  373. return v.(int) > int(point.(float64))
  374. }
  375. return false
  376. }
  377. // >=
  378. func (e *EventHandler) ge(v, point interface{}, t string) bool {
  379. switch t {
  380. case "float":
  381. return v.(float64) >= point.(float64)
  382. case "int":
  383. return v.(int) >= int(point.(float64))
  384. }
  385. return false
  386. }
  387. // <
  388. func (e *EventHandler) l(v, point interface{}, t string) bool {
  389. switch t {
  390. case "float":
  391. return v.(float64) < point.(float64)
  392. case "int":
  393. return v.(int) < int(point.(float64))
  394. }
  395. return false
  396. }
  397. // <=
  398. func (e *EventHandler) le(v, point interface{}, t string) bool {
  399. switch t {
  400. case "float":
  401. return v.(float64) <= point.(float64)
  402. case "int":
  403. return v.(int) <= int(point.(float64))
  404. }
  405. return false
  406. }
  407. // ==
  408. func (e *EventHandler) e(v, point interface{}, t string) bool {
  409. switch t {
  410. case "float":
  411. return v.(float64) == point.(float64)
  412. case "int":
  413. return v.(int) == int(point.(float64))
  414. case "string":
  415. return v.(string) == point.(string)
  416. }
  417. return false
  418. }
  419. // !=
  420. func (e *EventHandler) ne(v, point interface{}, t string) bool {
  421. switch t {
  422. case "float":
  423. return v.(float64) != point.(float64)
  424. case "int":
  425. return v.(int) != int(point.(float64))
  426. case "string":
  427. return v.(string) != point.(string)
  428. }
  429. return false
  430. }