From a5d7a5bdfaddb9805080ef39d8f480d21e4d2cbd Mon Sep 17 00:00:00 2001 From: tomsun28 Date: Thu, 4 Jan 2024 13:52:06 +0800 Subject: [PATCH] bugfix alarm trigger-times not work when alarm and recovered trigger cyclically (#1468) Signed-off-by: tomsun28 --- .../alert/calculate/CalculateAlarm.java | 59 ++++++++++--------- web-app/src/assets/i18n/zh-CN.json | 2 +- web-app/src/assets/i18n/zh-TW.json | 2 +- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java b/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java index 96ce9cb6705..5ea6b55ed7c 100644 --- a/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java +++ b/alerter/src/main/java/org/dromara/hertzbeat/alert/calculate/CalculateAlarm.java @@ -69,7 +69,7 @@ public class CalculateAlarm { * key - monitorId+alertDefineId 为普通阈值告警 | The alarm is a common threshold alarm * key - monitorId 为任务状态可用性可达性告警 | Indicates the monitoring status availability reachability alarm */ - private final Map triggeredAlertMap; + private final Map triggeredAlertMap; /** * The not recover alert * key - monitorId + alertDefineId + (instance) @@ -91,8 +91,8 @@ public CalculateAlarm(AlerterWorkerPool workerPool, CommonDataQueue dataQueue, this.alertDefineService = alertDefineService; this.alertService = alertService; this.bundle = ResourceBundleUtil.getBundle("alerter"); - this.triggeredAlertMap = new ConcurrentHashMap<>(128); - this.notRecoveredAlertMap = new ConcurrentHashMap<>(128); + this.triggeredAlertMap = new ConcurrentHashMap<>(16); + this.notRecoveredAlertMap = new ConcurrentHashMap<>(16); // Initialize stateAlertMap List monitors = monitorDao.findMonitorsByStatus(CommonConstants.UN_AVAILABLE_CODE); if (monitors != null) { @@ -102,7 +102,7 @@ public CalculateAlarm(AlerterWorkerPool workerPool, CommonDataQueue dataQueue, tags.put(TAG_MONITOR_NAME, monitor.getName()); tags.put(TAG_MONITOR_APP, monitor.getApp()); this.notRecoveredAlertMap.put(monitor.getId() + CommonConstants.AVAILABILITY, - Alert.builder().tags(tags).target(AVAILABILITY).status(UN_AVAILABLE_CODE).build()); + Alert.builder().tags(tags).target(AVAILABILITY).status(ALERT_STATUS_CODE_PENDING).build()); } } startCalculate(); @@ -145,17 +145,15 @@ private void calculate(CollectRep.MetricsData metricsData) { return; } List fields = metricsData.getFieldsList(); - Map fieldValueMap = new HashMap<>(16); + Map fieldValueMap = new HashMap<>(8); int valueRowCount = metricsData.getValuesCount(); for (Map.Entry> entry : defineMap.entrySet()) { List defines = entry.getValue(); for (AlertDefine define : defines) { final String expr = define.getExpr(); - if (StringUtils.isBlank(expr)) { continue; } - if (expr.contains(SYSTEM_VALUE_ROW_COUNT)) { fieldValueMap.put(SYSTEM_VALUE_ROW_COUNT, valueRowCount); try { @@ -166,9 +164,13 @@ private void calculate(CollectRep.MetricsData metricsData) { afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, fieldValueMap, define); // 若此阈值已被触发,则其它数据行的触发忽略 continue; - } else if (define.isRecoverNotice()) { - String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + null; - handleRecoveredAlert(currentTimeMilli, monitorId, app, define, expr, notResolvedAlertKey); + } else { + String monitorAlertKey = String.valueOf(monitorId) + define.getId(); + triggeredAlertMap.remove(monitorAlertKey); + if (define.isRecoverNotice()) { + String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + null; + handleRecoveredAlert(currentTimeMilli, define, expr, notResolvedAlertKey); + } } } catch (Exception e) { log.warn(e.getMessage(), e); @@ -219,9 +221,13 @@ private void calculate(CollectRep.MetricsData metricsData) { afterThresholdRuleMatch(currentTimeMilli, monitorId, app, metrics, fieldValueMap, define); // 若此阈值已被触发,则其它数据行的触发忽略 break; - } else if (define.isRecoverNotice()) { - String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + (instanceBuilder.length() == 0 ? null : instanceBuilder.toString()); - handleRecoveredAlert(currentTimeMilli, monitorId, app, define, expr, notResolvedAlertKey); + } else { + String monitorAlertKey = String.valueOf(monitorId) + define.getId(); + triggeredAlertMap.remove(monitorAlertKey); + if (define.isRecoverNotice()) { + String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + (instanceBuilder.length() == 0 ? null : instanceBuilder.toString()); + handleRecoveredAlert(currentTimeMilli, define, expr, notResolvedAlertKey); + } } } catch (Exception e) { log.warn(e.getMessage(), e); @@ -231,7 +237,7 @@ private void calculate(CollectRep.MetricsData metricsData) { } } - private void handleRecoveredAlert(long currentTimeMilli, long monitorId, String app, AlertDefine define, String expr, String notResolvedAlertKey) { + private void handleRecoveredAlert(long currentTimeMilli, AlertDefine define, String expr, String notResolvedAlertKey) { Alert notResolvedAlert = notRecoveredAlertMap.remove(notResolvedAlertKey); if (notResolvedAlert != null) { // Sending an alarm Restore @@ -261,6 +267,7 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri triggeredAlert.setLastAlarmTime(currentTimeMilli); int defineTimes = define.getTimes() == null ? 1 : define.getTimes(); if (times >= defineTimes) { + triggeredAlert.setStatus(ALERT_STATUS_CODE_PENDING); String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + fieldValueMap.get("instance"); triggeredAlertMap.remove(monitorAlertKey); notRecoveredAlertMap.put(notResolvedAlertKey, triggeredAlert); @@ -270,7 +277,7 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri fieldValueMap.put("app", app); fieldValueMap.put("metrics", metrics); fieldValueMap.put("metric", define.getField()); - Map tags = new HashMap<>(6); + Map tags = new HashMap<>(8); tags.put(CommonConstants.TAG_MONITOR_ID, String.valueOf(monitorId)); tags.put(CommonConstants.TAG_MONITOR_APP, app); tags.put(CommonConstants.TAG_THRESHOLD_ID, String.valueOf(define.getId())); @@ -283,7 +290,7 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri Alert alert = Alert.builder() .tags(tags) .priority(define.getPriority()) - .status(ALERT_STATUS_CODE_PENDING) + .status(ALERT_STATUS_CODE_NOT_REACH) .target(app + "." + metrics + "." + define.getField()) .triggerTimes(1) .firstAlarmTime(currentTimeMilli) @@ -293,9 +300,10 @@ private void afterThresholdRuleMatch(long currentTimeMilli, long monitorId, Stri .build(); int defineTimes = define.getTimes() == null ? 1 : define.getTimes(); if (1 >= defineTimes) { + alert.setStatus(ALERT_STATUS_CODE_PENDING); String notResolvedAlertKey = String.valueOf(monitorId) + define.getId() + fieldValueMap.get("instance"); notRecoveredAlertMap.put(notResolvedAlertKey, alert); - alarmCommonReduce.reduceAndSendAlarm(alert); + alarmCommonReduce.reduceAndSendAlarm(alert.clone()); } else { triggeredAlertMap.put(monitorAlertKey, alert); } @@ -346,7 +354,7 @@ private void handlerAvailableMetrics(long monitorId, String app, CollectRep.Metr Alert.AlertBuilder alertBuilder = Alert.builder() .tags(tags) .priority(avaAlertDefine.getPriority()) - .status(ALERT_STATUS_CODE_PENDING) + .status(ALERT_STATUS_CODE_NOT_REACH) .target(CommonConstants.AVAILABILITY) .content(AlertTemplateUtil.render(avaAlertDefine.getTemplate(), valueMap)) .firstAlarmTime(currentTimeMill) @@ -354,19 +362,14 @@ private void handlerAvailableMetrics(long monitorId, String app, CollectRep.Metr .triggerTimes(1); if (avaAlertDefine.getTimes() == null || avaAlertDefine.getTimes() <= 1) { String notResolvedAlertKey = monitorId + CommonConstants.AVAILABILITY; + alertBuilder.status(ALERT_STATUS_CODE_PENDING); notRecoveredAlertMap.put(notResolvedAlertKey, alertBuilder.build()); - alarmCommonReduce.reduceAndSendAlarm(alertBuilder.build().clone()); + alarmCommonReduce.reduceAndSendAlarm(alertBuilder.build()); } else { - alertBuilder.status(CommonConstants.ALERT_STATUS_CODE_NOT_REACH); + triggeredAlertMap.put(String.valueOf(monitorId), alertBuilder.build()); } - triggeredAlertMap.put(String.valueOf(monitorId), alertBuilder.build()); } else { int times = preAlert.getTriggerTimes() + 1; - if (preAlert.getStatus() == ALERT_STATUS_CODE_PENDING) { - times = 1; - preAlert.setContent(AlertTemplateUtil.render(avaAlertDefine.getTemplate(), valueMap)); - preAlert.setTags(tags); - } preAlert.setTriggerTimes(times); preAlert.setFirstAlarmTime(currentTimeMill); preAlert.setLastAlarmTime(currentTimeMill); @@ -376,14 +379,14 @@ private void handlerAvailableMetrics(long monitorId, String app, CollectRep.Metr String notResolvedAlertKey = monitorId + CommonConstants.AVAILABILITY; notRecoveredAlertMap.put(notResolvedAlertKey, preAlert.clone()); alarmCommonReduce.reduceAndSendAlarm(preAlert.clone()); - } else { - preAlert.setStatus(CommonConstants.ALERT_STATUS_CODE_NOT_REACH); + triggeredAlertMap.remove(String.valueOf(monitorId)); } } } else { // Check whether an availability or unreachable alarm is generated before the association monitoring // and send a clear alarm to clear the monitoring status // 判断关联监控之前是否有可用性或者不可达告警,发送恢复告警进行任务状态恢复 + triggeredAlertMap.remove(String.valueOf(monitorId)); String notResolvedAlertKey = monitorId + CommonConstants.AVAILABILITY; Alert notResolvedAlert = notRecoveredAlertMap.remove(notResolvedAlertKey); if (notResolvedAlert != null) { diff --git a/web-app/src/assets/i18n/zh-CN.json b/web-app/src/assets/i18n/zh-CN.json index fa97878d399..3dcf5f0144c 100644 --- a/web-app/src/assets/i18n/zh-CN.json +++ b/web-app/src/assets/i18n/zh-CN.json @@ -214,7 +214,7 @@ "alert.center.tags": "标签", "alert.center.status": "状态", "alert.center.time": "告警时间", - "alert.center.time.tip": "此告警期间统计触发 {{times}} 次告警", + "alert.center.time.tip": "此告警期间累计触发 {{times}} 次告警", "alert.center.first-time": "开始", "alert.center.last-time": "最新", "alert.center.confirm.delete": "请确认是否删除!", diff --git a/web-app/src/assets/i18n/zh-TW.json b/web-app/src/assets/i18n/zh-TW.json index 395e31e1b69..13ee7117f12 100644 --- a/web-app/src/assets/i18n/zh-TW.json +++ b/web-app/src/assets/i18n/zh-TW.json @@ -213,7 +213,7 @@ "alert.center.tags": "標簽", "alert.center.status": "狀態", "alert.center.time": "告警時間", - "alert.center.time.tip": "此告警期間統計觸發 {{times}} 次告警", + "alert.center.time.tip": "此告警期間累計觸發 {{times}} 次告警", "alert.center.first-time": "開始", "alert.center.last-time": "最新", "alert.center.confirm.delete": "請確認是否刪除!",