events.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455
  1. package handler
  2. import (
  3. "encoding/json"
  4. "errors"
  5. "fmt"
  6. amodels "go-admin/app/admin/models"
  7. omodels "go-admin/app/observe/models"
  8. "log"
  9. "sync"
  10. "time"
  11. "github.com/prometheus/client_golang/prometheus"
  12. "github.com/go-admin-team/go-admin-core/logger"
  13. "gorm.io/gorm"
  14. )
  15. type EventHandler struct {
  16. RP *omodels.OtRulesPolicy `json:"result_policy"`
  17. AC *omodels.AlertCondition `json:"alert_condition"`
  18. JR *omodels.JudgeResult `json:"judge_result"`
  19. Emo *omodels.Events
  20. promMap *sync.Map
  21. SmsInfo *SmsConfig
  22. CheckStartTime time.Time `json:"check_starttime"`
  23. Errs map[string]error `json:"_"`
  24. }
  // SmsConfig describes how alert text messages are sent.
  //
  // NOTE(review): credentials must be supplied through configuration; sample
  // values should never be kept in source comments.
  type SmsConfig struct {
  	// Appkey is the application key for the SMS gateway.
  	Appkey string
  	// Appsecret is the application secret for the SMS gateway.
  	Appsecret string
  	// Url is the SMS gateway endpoint.
  	Url string `json:"url"`
  	// AppsGroup maps an application alias to the phone numbers to notify,
  	// e.g. {"app_1": ["<phone>", "<phone>"], "app_2": ["<phone>"]}.
  	AppsGroup map[string][]string
  	// SmsTpl is the message template. NOTE(review): placeholder syntax is not
  	// visible in this file.
  	SmsTpl string
  	// SQLRecord is a boolean switch; presumably it controls whether sent
  	// messages are recorded in SQL storage — confirm against the sender code.
  	SQLRecord bool
  }
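  /*
  row: a single value. Its presence alone does not mean something is wrong; it
  must be compared with the configured point. When the condition matches, an
  exception event is written immediately, but an alert is not necessarily
  raised: that also depends on the trigger frequency and the interval.
  -- {this check has been cancelled} rows: a list value. Its presence means
  something is wrong and an exception event is written immediately, but an
  alert is not necessarily raised: that also depends on the trigger frequency
  and the interval.
  */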
  37. func InitEventHandler(pMap *sync.Map, smsInfo *SmsConfig) *EventHandler {
  38. e := new(EventHandler)
  39. e.RP = new(omodels.OtRulesPolicy)
  40. e.AC = new(omodels.AlertCondition)
  41. e.JR = new(omodels.JudgeResult)
  42. e.SmsInfo = smsInfo
  43. e.JR.AlertStatus = INACTIVE
  44. //eg:@@UNSET_apdex::health:5m_{"condition":"<","point":0.7,"point_type":"float","tigger_hz":3,"interval":5}
  45. e.Errs = make(map[string]error)
  46. e.promMap = pMap
  47. e.CheckStartTime = time.Now()
  48. if smsInfo == nil {
  49. e.Errs["smsinf error"] = errors.New("smsinfo 解析错误")
  50. }
  51. return e
  52. }
  53. func (e *EventHandler) JudgeRow() *EventHandler {
  54. switch e.RP.RuleValueType {
  55. case "rows":
  56. e.JR.CompareV = len(e.JR.RowsResult)
  57. //rows 类型出现则表明符合异常
  58. if len(e.JR.RowsResult) > 0 {
  59. e.JR.IsException = true
  60. e.JR.AlertStatus = PENDING
  61. }
  62. /*
  63. 不处理rows类型
  64. 取消该类型的解析,在prometheus中进行数量判断及相关收敛规则
  65. */
  66. case "row":
  67. e.JR.CompareV = e.JR.RowResult
  68. default:
  69. e.JR.CompareV = e.JR.RowGaugeResult
  70. }
  71. logger.Debug("compareV is: ", e.JR.CompareV)
  72. return e
  73. }
  74. func (e *EventHandler) PointCompare() *EventHandler {
  75. if e.JR.IsException {
  76. return e
  77. }
  78. logger.Debugf("%v %s %v", e.JR.CompareV, e.AC.Condition, e.AC.Point)
  79. switch e.AC.Condition {
  80. case ">":
  81. e.JR.IsException = e.g(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  82. case ">=":
  83. e.JR.IsException = e.ge(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  84. case "<":
  85. e.JR.IsException = e.l(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  86. case "<=":
  87. e.JR.IsException = e.le(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  88. case "=", "==":
  89. e.JR.IsException = e.e(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  90. case "!=", "<>", "><":
  91. e.JR.IsException = e.ne(e.JR.CompareV, e.AC.Point, e.AC.PointType)
  92. }
  93. return e
  94. }
  95. func (e *EventHandler) JudgeTriggerHz() *EventHandler {
  96. //仅做第一次标注,此处并非直接控制告警
  97. if e.JR.IsException {
  98. if e.AC.TiggerHz == 0 {
  99. //@@对应事件:x分钟内出现1次,立即告警
  100. e.JR.AlertStatus = FIRING //立即告警, 记录监控事件,event
  101. } else {
  102. e.JR.AlertStatus = PENDING //出现异常,记录监控事件 event,clickhouse TODO:
  103. }
  104. if e.RP.RuleValueType == "rows" && len(e.JR.RowsResult) >= e.AC.TiggerHz {
  105. e.JR.AlertStatus = FIRING
  106. }
  107. }
  108. return e
  109. }
  110. /*
  111. SELECT AppendTime
  112. FROM otel_events
  113. WHERE AlertStatus = 4 AND UID = 'UNSET_2'
  114. ORDER BY AppendTime DESC
  115. LIMIT 1
  116. */
  117. func (e *EventHandler) JudgeInterval(chdb *gorm.DB) *EventHandler {
  118. if e.JR.AlertStatus == FIRING {
  119. return e
  120. }
  121. switch e.RP.RuleValueType {
  122. // case "row":
  123. default:
  124. if !e.JR.IsException {
  125. return e
  126. }
  127. //周期内数值异常检测结果为pending的总次数 (AlertStatus = 2)
  128. ft := e.RP.CreateTime.Add(-time.Duration(e.AC.Interval) * time.Minute)
  129. var err error
  130. emo := omodels.Events{}
  131. var timer time.Time
  132. var count int64
  133. err = chdb.Table(emo.TableName()).
  134. Raw(`SELECT COUNT(*)
  135. FROM otel_events
  136. WHERE AlertStatus = ? AND UID = ? AND Timestamp >= ?`, FIRING, e.JR.UID, ft).
  137. Row().Scan(&count)
  138. if err != nil {
  139. logger.Error("select fire count err: ", err)
  140. e.Errs["select fire count err"] = err
  141. return e
  142. }
  143. // if count == 0 {
  144. // // ft = timer
  145. // timer = ft
  146. // } else
  147. if count != 0 {
  148. err = chdb.Table(emo.TableName()).
  149. Raw(`SELECT AppendTime
  150. FROM otel_events
  151. WHERE AlertStatus = ? AND UID = ?
  152. ORDER BY AppendTime DESC
  153. LIMIT 1`, FIRING, e.JR.UID).
  154. Row().Scan(&timer)
  155. if err != nil {
  156. logger.Error("alert status 4 row query err: ", err)
  157. e.Errs["alert status 4 row"] = err
  158. return e
  159. }
  160. if timer.After(ft) || timer.Equal(ft) {
  161. ft = timer
  162. }
  163. }
  164. rows, err := chdb.Table(emo.TableName()).
  165. Raw(`SELECT *
  166. FROM otel_events
  167. WHERE AlertStatus = ? AND UID = ? AND AppendTime > ?
  168. ORDER BY AppendTime DESC`,
  169. PENDING, e.JR.UID, ft).
  170. Rows()
  171. if err != nil {
  172. logger.Error("rows query err: ", err)
  173. e.Errs["Get events rows err"] = err
  174. }
  175. elist := make([]omodels.Events, 0)
  176. defer rows.Close()
  177. er := omodels.Events{}
  178. for rows.Next() {
  179. if err := chdb.ScanRows(rows, &er); err != nil {
  180. logger.Errorf("range alert rows scan err: %s", err)
  181. e.Errs["range alert rows scan err"] = err
  182. }
  183. elist = append(elist, er)
  184. }
  185. if len(elist) >= e.AC.TiggerHz {
  186. e.JR.IsException = true
  187. e.JR.AlertStatus = FIRING
  188. } else {
  189. e.JR.IsException = true
  190. e.JR.AlertStatus = PENDING
  191. return e
  192. }
  193. case "rows":
  194. //rows 立即告警
  195. // if e.JR.IsException {
  196. // e.JR.AlertStatus = FIRING
  197. // }
  198. return e
  199. }
  200. return e
  201. }
  202. func (e *EventHandler) SetUID() *EventHandler {
  203. // logger.Debug("UUUUUUid: ", fmt.Sprintf("%s_%s_%s", e.RP.AppAlias, e.RP.RuleMonitorAlias, e.RP.Policy))
  204. // e.JR.UID = fmt.Sprintf("%s_%d", e.RP.AppAlias, e.RP.Id)
  205. e.JR.UID = fmt.Sprintf("%s", e.RP.AppAlias)
  206. return e
  207. }
  208. func (e *EventHandler) RegisterPrometheusGauge() *EventHandler {
  209. gauge := prometheus.NewGaugeVec(prometheus.GaugeOpts{
  210. Name: e.RP.RuleMonitorAlias,
  211. Help: e.RP.RuleName,
  212. }, []string{"app_alias", "kind", "uid"})
  213. if err := prometheus.Register(gauge); err != nil {
  214. if err.Error() == "duplicate metrics collector registration attempted" {
  215. logger.Errorf("Failed to register gauge: %v", err)
  216. logger.Infof("promMap: %v", e.promMap)
  217. return e
  218. } else {
  219. log.Fatalf("Failed to register gauge: %v", err)
  220. }
  221. }
  222. e.promMap.Store(e.RP.RuleMonitorAlias, gauge)
  223. return e
  224. }
  225. func (e *EventHandler) SetPromKV() *EventHandler {
  226. e.GetRegisteredGauge(e.RP.RuleMonitorAlias).WithLabelValues(
  227. e.RP.AppAlias, e.RP.RuleKind, e.JR.UID,
  228. ).Set(float64(e.JR.RowGaugeResult))
  229. logger.Debug("e.JR.RowGaugeResult: ", e.JR.RowGaugeResult)
  230. return e
  231. }
  232. // checkCollectorExists 检查注册表中是否存在具有给定名称的collector
  233. func CheckCollectorExists(reg *prometheus.Registry, name string) error {
  234. merticF, err := reg.Gather()
  235. if err != nil {
  236. logger.Errorf("checkCollector, error: %s", err.Error())
  237. }
  238. for _, m := range merticF {
  239. fmt.Println("metricFamily: ", m)
  240. }
  241. return nil
  242. }
  243. func (e *EventHandler) GetRegisteredGauge(gaugeName string) *prometheus.GaugeVec {
  244. logger.Debugf("guageName: %s", gaugeName)
  245. logger.Debug(e.promMap)
  246. promk, ok := e.promMap.Load(gaugeName)
  247. if ok {
  248. logger.Debug("has guage name")
  249. return promk.(*prometheus.GaugeVec)
  250. } else {
  251. e.RegisterPrometheusGauge()
  252. logger.Debug("recreate prometh guagename")
  253. return e.GetRegisteredGauge(gaugeName)
  254. }
  255. }
  256. func (e *EventHandler) CreateEventRecord(chdb *gorm.DB) *EventHandler {
  257. if e.JR.IsException {
  258. row := "UNSET"
  259. rows := make([]map[string]string, 0)
  260. switch e.RP.RuleValueType {
  261. case "row":
  262. row = fmt.Sprintf("%v", e.JR.RowResult)
  263. case "rows":
  264. for _, d := range e.JR.RowsResult {
  265. tmp := make(map[string]string)
  266. for k, v := range d {
  267. tmp[k] = fmt.Sprintf("%v", v)
  268. }
  269. rows = append(rows, tmp)
  270. }
  271. default:
  272. row = fmt.Sprintf("%v", e.JR.RowGaugeResult)
  273. }
  274. var exceptionName string
  275. if e.RP.RuleValueType == "rows" {
  276. if e.JR.AlertStatus == PENDING {
  277. //业务状态码异常数(近30分钟), x分钟内发生x次
  278. exceptionName = fmt.Sprintf("%s, %d分钟内发生: %v次",
  279. e.RP.RuleName, e.AC.Interval, e.JR.CompareV)
  280. } else if e.JR.AlertStatus == FIRING || e.JR.AlertStatus == SENDSUCCESS {
  281. exceptionName = fmt.Sprintf("%s, %d分钟内发生%s%v次, 当前值为: %v",
  282. e.RP.RuleName, e.AC.Interval, e.AC.Condition, e.JR.CompareV, e.JR.CompareV)
  283. }
  284. } else {
  285. exceptionName = fmt.Sprintf("%s%s %v, 当前值为: %v",
  286. e.RP.RuleName, e.AC.Condition, e.AC.Point, e.JR.CompareV)
  287. }
  288. emo := omodels.Events{
  289. Timestamp: time.Now(),
  290. AppendTime: e.RP.CreateTime,
  291. UID: e.JR.UID,
  292. AppID: int64(e.RP.AppId),
  293. RuleID: int64(e.RP.RuleId),
  294. AppName: e.RP.AppName,
  295. AppAlias: e.RP.AppAlias,
  296. ExceptionName: exceptionName,
  297. RuleInfo: map[string]string{
  298. "policy": e.RP.Policy,
  299. "rule_kind": e.RP.RuleKind,
  300. "rule_group": e.RP.RuleGroup,
  301. "rule_data_source": e.RP.RuleDataSource,
  302. "rule_table": e.RP.RuleTable,
  303. "rule_value_type": e.RP.RuleValueType,
  304. },
  305. CompareV: fmt.Sprintf("%v", e.JR.CompareV),
  306. RowResult: row,
  307. RowsResult: rows,
  308. AlertStatus: int64(e.JR.AlertStatus),
  309. }
  310. // logger.Debug("emo: ", emo)
  311. if err := chdb.Table(emo.TableName()).Create(emo).Error; err != nil {
  312. e.Errs["CreateEventRecord_func(Create)"] = err
  313. logger.Error("create emos err:", err.Error())
  314. }
  315. //发生告警,生成告警数据
  316. if e.JR.AlertStatus >= FIRING {
  317. e.Emo = &emo
  318. }
  319. }
  320. return e
  321. }
  322. func (e *EventHandler) CreateAlert(db *gorm.DB) *EventHandler {
  323. if e.JR.AlertStatus >= FIRING {
  324. //TODO: 发送至告警队列
  325. fe := amodels.OtFireEvents{
  326. AppId: e.Emo.AppID,
  327. RuleId: e.Emo.RuleID,
  328. ExceptionName: e.Emo.ExceptionName,
  329. RecordTime: e.Emo.Timestamp,
  330. AppendTime: e.Emo.AppendTime,
  331. Uid: e.Emo.UID,
  332. AppName: e.Emo.AppName,
  333. AppAlias: e.Emo.AppAlias,
  334. RowResult: fmt.Sprintf("%v", e.Emo.RowResult),
  335. AlertStatus: e.Emo.AlertStatus,
  336. IsKnow: 0,
  337. IsResolve: 0,
  338. IsIgnore: 0,
  339. }
  340. var err error
  341. rif, err := json.Marshal(e.Emo.RuleInfo)
  342. if err != nil {
  343. logger.Error("Marshal rule_info err: ", err)
  344. e.Errs["Marshal rule_info err"] = err
  345. }
  346. fe.RuleInfo = string(rif)
  347. rowst, err := json.Marshal(e.Emo.RowsResult)
  348. if err != nil {
  349. logger.Error("Marshal rows_result err: ", err)
  350. e.Errs["Marshal rows_result err"] = err
  351. }
  352. fe.RowsResult = string(rowst)
  353. if err = db.Table(fe.TableName()).Create(&fe).Error; err != nil {
  354. logger.Error("Create ot_fire_event err: ", err)
  355. e.Errs["Create ot_fire_event err"] = err
  356. }
  357. }
  358. return e
  359. }
  // assertType is an empty placeholder: it accepts a value, a threshold and a
  // type name and does nothing with them.
  // NOTE(review): presumably meant to validate operand types before comparison —
  // implement it or remove it.
  func (e *EventHandler) assertType(v, point interface{}, t string) {}
  361. // >
  362. func (e *EventHandler) g(v, point interface{}, t string) bool {
  363. switch t {
  364. case "float":
  365. return v.(float64) > point.(float64)
  366. case "int":
  367. return v.(int) > int(point.(float64))
  368. }
  369. return false
  370. }
  371. // >=
  372. func (e *EventHandler) ge(v, point interface{}, t string) bool {
  373. switch t {
  374. case "float":
  375. return v.(float64) >= point.(float64)
  376. case "int":
  377. return v.(int) >= int(point.(float64))
  378. }
  379. return false
  380. }
  381. // <
  382. func (e *EventHandler) l(v, point interface{}, t string) bool {
  383. switch t {
  384. case "float":
  385. return v.(float64) < point.(float64)
  386. case "int":
  387. return v.(int) < int(point.(float64))
  388. }
  389. return false
  390. }
  391. // <=
  392. func (e *EventHandler) le(v, point interface{}, t string) bool {
  393. switch t {
  394. case "float":
  395. return v.(float64) <= point.(float64)
  396. case "int":
  397. return v.(int) <= int(point.(float64))
  398. }
  399. return false
  400. }
  401. // ==
  402. func (e *EventHandler) e(v, point interface{}, t string) bool {
  403. switch t {
  404. case "float":
  405. return v.(float64) == point.(float64)
  406. case "int":
  407. return v.(int) == int(point.(float64))
  408. case "string":
  409. return v.(string) == point.(string)
  410. }
  411. return false
  412. }
  413. // !=
  414. func (e *EventHandler) ne(v, point interface{}, t string) bool {
  415. switch t {
  416. case "float":
  417. return v.(float64) != point.(float64)
  418. case "int":
  419. return v.(int) != int(point.(float64))
  420. case "string":
  421. return v.(string) != point.(string)
  422. }
  423. return false
  424. }