events.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423
  1. package handler
  2. import (
  3. "encoding/json"
  4. "fmt"
  5. amodels "go-admin/app/admin/models"
  6. omodels "go-admin/app/observe/models"
  7. "log"
  8. "sync"
  9. "time"
  10. "github.com/prometheus/client_golang/prometheus"
  11. "github.com/go-admin-team/go-admin-core/logger"
  12. "gorm.io/gorm"
  13. )
  14. type EventHandler struct {
  15. RP *omodels.OtRulesPolicy `json:"result_policy"`
  16. AC *omodels.AlertCondition `json:"alert_condition"`
  17. JR *omodels.JudgeResult `json:"judge_result"`
  18. Emo *omodels.Events
  19. promMap *sync.Map
  20. Errs map[string]error `json:"_"`
  21. }
  22. /*
  23. row: 单个数值,出现不代表异常,需要与point比较,复合则立即写入异常事件,单不一定告警,需要结合频率、周期进行判断
  24. rows: list值,出现代表有异常,立即写入异常事件监控,但不一定告警,需要结合频率,周期进行判断
  25. */
  26. func InitEventHandler(pMap *sync.Map) *EventHandler {
  27. e := new(EventHandler)
  28. e.RP = new(omodels.OtRulesPolicy)
  29. e.AC = new(omodels.AlertCondition)
  30. e.JR = new(omodels.JudgeResult)
  31. e.JR.AlertStatus = INACTIVE
  32. //eg:@@UNSET_apdex::health:5m_{"condition":"<","point":0.7,"point_type":"float","tigger_hz":3,"interval":5}
  33. e.Errs = make(map[string]error)
  34. e.promMap = pMap
  35. return e
  36. }
  37. func (e *EventHandler) JudgeRow() *EventHandler {
  38. switch e.RP.RuleValueType {
  39. case "rows":
  40. e.JR.CompareV = len(e.JR.RowsResult)
  41. //rows 类型出现则表明符合异常
  42. if len(e.JR.RowsResult) > 0 {
  43. e.JR.IsException = true
  44. e.JR.AlertStatus = PENDING
  45. }
  46. /*
  47. 不处理rows类型
  48. 取消该类型的解析,在prometheus中进行数量判断及相关收敛规则
  49. */
  50. case "row":
  51. e.JR.CompareV = e.JR.RowResult
  52. default:
  53. e.JR.CompareV = e.JR.RowResult
  54. }
  55. return e
  56. }
  57. func (e *EventHandler) PointCompare() *EventHandler {
  58. if e.JR.IsException {
  59. return e
  60. }
  61. logger.Debugf("%v %s %v", e.JR.CompareV, e.AC.Condition, e.AC.Point)
  62. switch e.AC.Condition {
  63. case ">":
  64. e.JR.IsException = e.g(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  65. case ">=":
  66. e.JR.IsException = e.ge(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  67. case "<":
  68. e.JR.IsException = e.l(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  69. case "<=":
  70. e.JR.IsException = e.le(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  71. case "=", "==":
  72. e.JR.IsException = e.e(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  73. case "!=", "<>", "><":
  74. e.JR.IsException = e.ne(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  75. }
  76. return e
  77. }
  78. func (e *EventHandler) JudgeTriggerHz() *EventHandler {
  79. //仅做第一次标注,此处并非直接控制告警
  80. if e.JR.IsException {
  81. if e.AC.TiggerHz == 0 {
  82. //@@对应事件:x分钟内出现1次,立即告警
  83. e.JR.AlertStatus = FIRING //立即告警, 记录监控事件,event
  84. } else {
  85. e.JR.AlertStatus = PENDING //出现异常,记录监控事件 event,clickhouse TODO:
  86. }
  87. if e.RP.RuleValueType == "rows" && len(e.JR.RowsResult) >= e.AC.TiggerHz {
  88. e.JR.AlertStatus = FIRING
  89. }
  90. }
  91. return e
  92. }
  93. /*
  94. SELECT AppendTime
  95. FROM otel_events
  96. WHERE AlertStatus = 4 AND UID = 'UNSET_2'
  97. ORDER BY AppendTime DESC
  98. LIMIT 1
  99. */
  100. func (e *EventHandler) JudgeInterval(chdb *gorm.DB) *EventHandler {
  101. if e.JR.AlertStatus == FIRING {
  102. return e
  103. }
  104. switch e.RP.RuleValueType {
  105. case "row":
  106. if !e.JR.IsException {
  107. return e
  108. }
  109. //周期内数值异常检测结果为pending的总次数 (AlertStatus = 2)
  110. ft := e.RP.CreateTime.Add(-time.Duration(e.AC.Interval) * time.Minute)
  111. var err error
  112. emo := omodels.Events{}
  113. var timer time.Time
  114. var count int64
  115. err = chdb.Table(emo.TableName()).
  116. Raw(`SELECT COUNT(*)
  117. FROM otel_events
  118. WHERE AlertStatus = ? AND UID = ?`, FIRING, e.JR.UID).
  119. Row().Scan(&count)
  120. if count == 0 {
  121. ft = timer
  122. } else {
  123. err = chdb.Table(emo.TableName()).
  124. Raw(`SELECT AppendTime
  125. FROM otel_events
  126. WHERE AlertStatus = ? AND UID = ?
  127. ORDER BY AppendTime DESC
  128. LIMIT 1`, FIRING, e.JR.UID).
  129. Row().Scan(&timer)
  130. if err != nil {
  131. logger.Error("alert status 4 row query err: ", err)
  132. e.Errs["alert status 4 row"] = err
  133. return e
  134. }
  135. }
  136. if timer.After(ft) || timer.Equal(ft) {
  137. ft = timer
  138. }
  139. rows, err := chdb.Table(emo.TableName()).
  140. Raw(`SELECT *
  141. FROM otel_events
  142. WHERE AlertStatus = ? AND UID = ? AND AppendTime > ?
  143. ORDER BY AppendTime DESC`,
  144. PENDING, e.JR.UID, ft).
  145. Rows()
  146. if err != nil {
  147. logger.Error("rows query err: ", err)
  148. e.Errs["Get events rows err"] = err
  149. }
  150. elist := make([]omodels.Events, 0)
  151. defer rows.Close()
  152. er := omodels.Events{}
  153. for rows.Next() {
  154. if err := chdb.ScanRows(rows, &er); err != nil {
  155. logger.Errorf("range alert rows scan err: %s", err)
  156. e.Errs["range alert rows scan err"] = err
  157. }
  158. elist = append(elist, er)
  159. }
  160. if len(elist) >= e.AC.TiggerHz {
  161. e.JR.IsException = true
  162. e.JR.AlertStatus = FIRING
  163. } else {
  164. e.JR.IsException = true
  165. e.JR.AlertStatus = PENDING
  166. return e
  167. }
  168. case "rows":
  169. //rows 立即告警
  170. // if e.JR.IsException {
  171. // e.JR.AlertStatus = FIRING
  172. // }
  173. return e
  174. }
  175. return e
  176. }
  177. func (e *EventHandler) SetUID() *EventHandler {
  178. // logger.Debug("UUUUUUid: ", fmt.Sprintf("%s_%s_%s", e.RP.AppAlias, e.RP.RuleMonitorAlias, e.RP.Policy))
  179. e.JR.UID = fmt.Sprintf("%s_%d", e.RP.AppAlias, e.RP.Id)
  180. return e
  181. }
  182. func (e *EventHandler) RegisterPrometheusGauge() *EventHandler {
  183. gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  184. Name: e.RP.RuleMonitorAlias,
  185. Help: e.RP.RuleName,
  186. }, []string{"app_alias", "kind", "uid"})
  187. if err := prometheus.Register(gauge); err != nil {
  188. if err.Error() == "duplicate metrics collector registration attempted" {
  189. logger.Errorf("Failed to register gauge: %v", err)
  190. return e
  191. } else {
  192. log.Fatalf("Failed to register gauge: %v", err)
  193. }
  194. }
  195. e.promMap.Store(e.RP.RuleMonitorAlias, gauge)
  196. return e
  197. }
  198. func (e *EventHandler) SetPromKV() *EventHandler {
  199. e.GetRegisteredGauge(e.RP.RuleMonitorAlias).WithLabelValues(
  200. e.RP.AppAlias, e.RP.RuleKind, e.JR.UID,
  201. ).Set(float64(e.JR.RowGaugeResult))
  202. logger.Debugf("e.JR.RowGaugeResult", e.JR.RowGaugeResult)
  203. return e
  204. }
  205. // checkCollectorExists 检查注册表中是否存在具有给定名称的collector
  206. func CheckCollectorExists(reg *prometheus.Registry, name string) error {
  207. merticF, err := reg.Gather()
  208. if err != nil {
  209. logger.Errorf("checkCollector, error: %s", err.Error())
  210. }
  211. for _, m := range merticF {
  212. fmt.Println("metricFamily: ", m)
  213. }
  214. return nil
  215. }
  216. func (e *EventHandler) GetRegisteredGauge(gaugeName string) *prometheus.GaugeVec {
  217. logger.Debugf("guageName: %s", gaugeName)
  218. logger.Debug(e.promMap)
  219. promk, ok := e.promMap.Load(gaugeName)
  220. if ok {
  221. logger.Debug("has guage name")
  222. return promk.(*prometheus.GaugeVec)
  223. } else {
  224. e.RegisterPrometheusGauge()
  225. logger.Debug("recreate prometh guagename")
  226. return e.GetRegisteredGauge(gaugeName)
  227. }
  228. }
  229. func (e *EventHandler) CreateEventRecord(chdb *gorm.DB) *EventHandler {
  230. if e.JR.IsException {
  231. row := "UNSET"
  232. rows := make([]map[string]string, 0)
  233. if e.RP.RuleValueType == "row" {
  234. row = fmt.Sprintf("%v", e.JR.RowResult)
  235. } else {
  236. for _, d := range e.JR.RowsResult {
  237. tmp := make(map[string]string)
  238. for k, v := range d {
  239. tmp[k] = fmt.Sprintf("%v", v)
  240. }
  241. rows = append(rows, tmp)
  242. }
  243. }
  244. var exceptionName string
  245. if e.RP.RuleValueType == "rows" {
  246. if e.JR.AlertStatus == PENDING {
  247. //业务状态码异常数(近30分钟), x分钟内发生x次
  248. exceptionName = fmt.Sprintf("%s, %d分钟内发生: %v次",
  249. e.RP.RuleName, e.AC.Interval, e.JR.CompareV)
  250. } else if e.JR.AlertStatus == FIRING {
  251. exceptionName = fmt.Sprintf("%s, %d分钟内发生%s%v次, 当前值为: %v",
  252. e.RP.RuleName, e.AC.Interval, e.AC.Condition, e.JR.CompareV, e.JR.CompareV)
  253. }
  254. } else {
  255. exceptionName = fmt.Sprintf("%s%s %v, 当前值为: %v",
  256. e.RP.RuleName, e.AC.Condition, e.AC.Point, e.JR.CompareV)
  257. }
  258. emo := omodels.Events{
  259. Timestamp: time.Now(),
  260. AppendTime: e.RP.CreateTime,
  261. UID: e.JR.UID,
  262. AppID: int64(e.RP.AppId),
  263. RuleID: int64(e.RP.RuleId),
  264. AppName: e.RP.AppName,
  265. AppAlias: e.RP.AppAlias,
  266. ExceptionName: exceptionName,
  267. RuleInfo: map[string]string{
  268. "policy": e.RP.Policy,
  269. "rule_kind": e.RP.RuleKind,
  270. "rule_group": e.RP.RuleGroup,
  271. "rule_data_source": e.RP.RuleDataSource,
  272. "rule_table": e.RP.RuleTable,
  273. "rule_value_type": e.RP.RuleValueType,
  274. },
  275. CompareV: fmt.Sprintf("%v", e.JR.CompareV),
  276. RowResult: row,
  277. RowsResult: rows,
  278. AlertStatus: int64(e.JR.AlertStatus),
  279. }
  280. // logger.Debug("emo: ", emo)
  281. if err := chdb.Table(emo.TableName()).Create(emo).Error; err != nil {
  282. e.Errs["CreateEventRecord_func(Create)"] = err
  283. logger.Error("create emos err:", err.Error())
  284. }
  285. if e.JR.AlertStatus == FIRING {
  286. e.Emo = &emo
  287. }
  288. }
  289. return e
  290. }
  291. func (e *EventHandler) CreateAlert(db *gorm.DB) *EventHandler {
  292. if e.JR.AlertStatus == FIRING {
  293. //TODO: 发送至告警队列
  294. fe := amodels.OtFireEvents{
  295. AppId: e.Emo.AppID,
  296. RuleId: e.Emo.RuleID,
  297. ExceptionName: e.Emo.ExceptionName,
  298. RecordTime: e.Emo.Timestamp,
  299. AppendTime: e.Emo.AppendTime,
  300. Uid: e.Emo.UID,
  301. AppName: e.Emo.AppName,
  302. AppAlias: e.Emo.AppAlias,
  303. RowResult: fmt.Sprintf("%v", e.Emo.RowResult),
  304. AlertStatus: e.Emo.AlertStatus,
  305. IsKnow: 0,
  306. IsResolve: 0,
  307. IsIgnore: 0,
  308. }
  309. var err error
  310. rif, err := json.Marshal(e.Emo.RuleInfo)
  311. if err != nil {
  312. logger.Error("Marshal rule_info err: ", err)
  313. e.Errs["Marshal rule_info err"] = err
  314. }
  315. fe.RuleInfo = string(rif)
  316. rowst, err := json.Marshal(e.Emo.RowsResult)
  317. if err != nil {
  318. logger.Error("Marshal rows_result err: ", err)
  319. e.Errs["Marshal rows_result err"] = err
  320. }
  321. fe.RowsResult = string(rowst)
  322. if err = db.Table(fe.TableName()).Create(&fe).Error; err != nil {
  323. logger.Error("Create ot_fire_event err: ", err)
  324. e.Errs["Create ot_fire_event err"] = err
  325. }
  326. }
  327. return e
  328. }
  329. func (e *EventHandler) assertType(v, point interface{}, t string) {}
  330. // >
  331. func (e *EventHandler) g(v, point interface{}, t string) bool {
  332. switch t {
  333. case "float":
  334. return v.(float64) > point.(float64)
  335. case "int":
  336. return v.(int) > int(point.(float64))
  337. }
  338. return false
  339. }
  340. // >=
  341. func (e *EventHandler) ge(v, point interface{}, t string) bool {
  342. switch t {
  343. case "float":
  344. return v.(float64) >= point.(float64)
  345. case "int":
  346. return v.(int) >= int(point.(float64))
  347. }
  348. return false
  349. }
  350. // <
  351. func (e *EventHandler) l(v, point interface{}, t string) bool {
  352. switch t {
  353. case "float":
  354. return v.(float64) < point.(float64)
  355. case "int":
  356. return v.(int) < int(point.(float64))
  357. }
  358. return false
  359. }
  360. // <=
  361. func (e *EventHandler) le(v, point interface{}, t string) bool {
  362. switch t {
  363. case "float":
  364. return v.(float64) <= point.(float64)
  365. case "int":
  366. return v.(int) <= int(point.(float64))
  367. }
  368. return false
  369. }
  370. // ==
  371. func (e *EventHandler) e(v, point interface{}, t string) bool {
  372. switch t {
  373. case "float":
  374. return v.(float64) == point.(float64)
  375. case "int":
  376. return v.(int) == int(point.(float64))
  377. case "string":
  378. return v.(string) == point.(string)
  379. }
  380. return false
  381. }
  382. // !=
  383. func (e *EventHandler) ne(v, point interface{}, t string) bool {
  384. switch t {
  385. case "float":
  386. return v.(float64) != point.(float64)
  387. case "int":
  388. return v.(int) != int(point.(float64))
  389. case "string":
  390. return v.(string) != point.(string)
  391. }
  392. return false
  393. }