Commit: Alert Severity (#5055)

* define alert severity

* show severity in email

* introduce alert severity in doc and examples

* show severity in web-portal
suiguoxin authored Nov 9, 2020
1 parent acbec7b commit 6cb7f8d
Showing 14 changed files with 72 additions and 5 deletions.
@@ -206,6 +206,8 @@ authentication:
# - alert: PAIJobGpuPercentLowerThan0_3For1h
# expr: avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3
# for: 1h
# labels:
# severity: warn
# annotations:
# summary: "{{$labels.job_name}} has a job gpu percent lower than 30% for 1 hour"
# description: Monitor job level gpu utilization in certain virtual clusters.
2 changes: 2 additions & 0 deletions deployment/quick-start/services-configuration.yaml.template
@@ -110,6 +110,8 @@ rest-server:
# - alert: PAIJobGpuPercentLowerThan0_3For1h
# expr: avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3
# for: 1h
# labels:
# severity: warn
# annotations:
# summary: "{{$labels.job_name}} has a job gpu percent lower than 30% for 1 hour"
# description: Monitor job level gpu utilization in certain virtual clusters.
7 changes: 6 additions & 1 deletion docs/manual/cluster-admin/how-to-use-alert-system.md
@@ -33,7 +33,8 @@ To view existing alert rules based on the metrics, you can go to `http(s)://<you

### How to Add Customized Alerts

You can define customized alerts in the `prometheus` field in [`services-configuration.yml`](./basic-management-operations.md#pai-service-management-and-paictl). For example, we can add a customized alert `PAIJobGpuPercentLowerThan0_3For1h` by adding:
You can define customized alerts in the `prometheus` field in [`services-configuration.yml`](./basic-management-operations.md#pai-service-management-and-paictl).
For example, we can add a customized alert `PAIJobGpuPercentLowerThan0_3For1h` by adding:

``` yaml
prometheus:
@@ -44,12 +45,16 @@
- alert: PAIJobGpuPercentLowerThan0_3For1h
expr: avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3
for: 1h
labels:
severity: warn
annotations:
summary: "{{$labels.job_name}} has a job gpu percent lower than 30% for 1 hour"
description: Monitor job level gpu utilization in certain virtual clusters.
```

The `PAIJobGpuPercentLowerThan0_3For1h` alert fires when a job in the `default` virtual cluster has a task-level average GPU utilization below `30%` for more than `1 hour`.
The alert severity can be set to `info`, `warn`, `error`, or `fatal` by adding a `severity` label; here we use `warn`.
The expression uses the metric `task_gpu_percent`, which describes GPU utilization at the task level.
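
Once an alert carries a `severity` label, downstream tooling can branch on it. As a minimal sketch (the receiver names below are hypothetical, not part of the default OpenPAI configuration), an `alertmanager` route could escalate only `fatal` alerts:

``` yaml
route:
  group_by: [alertname, alertstate, severity]
  receiver: pai-email-admin        # assumed default receiver
  routes:
    - match:
        severity: fatal            # escalate only the most severe alerts
      receiver: pai-admin-pager    # hypothetical receiver for urgent notifications
```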

Remember to push service config to the cluster and restart the `prometheus` service after your modification with the following commands [in the dev-box container](./basic-management-operations.md#pai-service-management-and-paictl):
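
A typical sequence, assuming the standard `paictl` workflow (replace `<config-folder>` with your local configuration directory), looks like:

``` bash
./paictl.py service stop -n prometheus                 # stop the running prometheus service
./paictl.py config push -p <config-folder> -m service  # push the updated service configuration
./paictl.py service start -n prometheus                # restart prometheus with the new rules
```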
2 changes: 2 additions & 0 deletions docs_zh_CN/manual/cluster-admin/how-to-use-alert-system.md
@@ -44,6 +44,8 @@ prometheus:
- alert: PAIJobGpuPercentLowerThan0_3For1h
expr: avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3
for: 1h
labels:
severity: warn
annotations:
summary: "{{$labels.job_name}} has a job gpu percent lower than 30% for 1 hour"
description: Monitor job level gpu utilization in certain virtual clusters.
2 changes: 2 additions & 0 deletions examples/cluster-configuration/services-configuration.yaml
@@ -144,6 +144,8 @@ rest-server:
# - alert: PAIJobGpuPercentLowerThan0_3For1h
# expr: avg(task_gpu_percent{virtual_cluster=~"default"}) by (job_name) < 0.3
# for: 1h
# labels:
# severity: warn
# annotations:
# summary: "{{$labels.job_name}} has a job gpu percent lower than 30% for 1 hour"
# description: Monitor job level gpu utilization in certain virtual clusters.
@@ -30,7 +30,7 @@ data:
group_wait: 30s
group_interval: 5m
repeat_interval: {{ cluster_cfg["alert-manager"]["repeat-interval"] }}
group_by: [alertname, alertstate]
group_by: [alertname, alertstate, severity]

routes:
- receiver: pai-cordon-nodes
@@ -1,4 +1,4 @@
<%= cluster_id %>:
<%= cluster_id %> <%= groupLabels.severity %>:
<% if (alerts.filter( element => element.status =="firing").length > 0) { %>
[FIRING: <%= alerts.filter( element => element.status =="firing").length %> ]
<% } %>
2 changes: 2 additions & 0 deletions src/prometheus/deploy/alerting/customized.rules.template
@@ -17,4 +17,6 @@

# Rule Syntax Reference: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

# select alert severity from `info`, `warn`, `error` or `fatal`

{{ cluster_cfg['prometheus']['customized-alerts'] }}
16 changes: 16 additions & 0 deletions src/prometheus/deploy/alerting/gpu.rules
@@ -17,47 +17,63 @@

# Rule Syntax Reference: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

# select alert severity from `info`, `warn`, `error` or `fatal`

groups:
- name: gpu_related
rules:
- alert: NvidiaSmiLatencyTooLarge
expr: histogram_quantile(0.95, sum(rate(cmd_nvidia_smi_latency_seconds_bucket[5m])) by (le, instance)) > 40
for: 5m
labels:
severity: warn
annotations:
summary: "95th nvidia-smi call latency is larger than 40s in {{$labels.instance}}, should check the gpu status manually"

- alert: NvidiaSmiDoubleEccError
expr: nvidiasmi_ecc_error_count{type="double"} > 0
for: 5m
labels:
severity: fatal
annotations:
summary: "nvidia card from {{$labels.instance}} minor number {{$labels.minor_number}} has {{$labels.type}} ecc error, count {{$value}}"

- alert: NvidiaMemoryLeak
expr: nvidiasmi_memory_leak_count > 0
for: 5m
labels:
severity: error
annotations:
summary: "found nvidia memory leak from {{$labels.instance}} minor number {{$labels.minor_number}}"

- alert: NvidiaZombieProcess
expr: zombie_process_count{command="nvidia-smi"} > 0
for: 5m
labels:
severity: warn
annotations:
summary: "found nvidia zombie process in {{$labels.instance}}"

- alert: GpuUsedByExternalProcess
expr: gpu_used_by_external_process_count > 0
for: 5m
labels:
severity: warn
annotations:
summary: "found nvidia used by external process in {{$labels.instance}}"

- alert: GpuUsedByZombieContainer
expr: gpu_used_by_zombie_container_count > 0
for: 5m
labels:
severity: warn
annotations:
summary: "found nvidia used by zombie container in {{$labels.instance}}"

- alert: NodeGpuCountChanged
expr: changes(node:gpu_utilization:count[5m]) > 0
labels:
severity: fatal
annotations:
summary: "found gpu count changes in {{$labels.instance}}"

8 changes: 7 additions & 1 deletion src/prometheus/deploy/alerting/jobs.rules
@@ -17,16 +17,22 @@

# Rule Syntax Reference: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

# select alert severity from `info`, `warn`, `error` or `fatal`

groups:
- name: pai-jobs
rules:
- alert: PaiJobsZombie
expr: zombie_container_count > 0
for: 1h # only when it exceeds 1 hour
labels:
severity: info
annotations:
summary: "zombie container in {{$labels.instance}}detected"
summary: "zombie container in {{$labels.instance}} detected"
- alert: PaiJobPending
expr: pai_job_pod_count{pod_bound="true", phase="pending"} > 0
for: 30m
labels:
severity: warn
annotations:
summary: "Job {{$labels.job_name}}in pending status detected"
6 changes: 6 additions & 0 deletions src/prometheus/deploy/alerting/k8s.rules
@@ -17,17 +17,23 @@

# Rule Syntax Reference: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

# select alert severity from `info`, `warn`, `error` or `fatal`

groups:
- name: k8s_component
rules:
- alert: k8sApiServerNotOk
expr: k8s_api_server_count{error!="ok"} > 0
for: 10m
labels:
severity: fatal
annotations:
summary: "api server in {{$labels.host_ip}} is {{$labels.error}}"

- alert: k8sDockerDaemonNotOk
expr: docker_daemon_count{error!="ok"} > 0
for: 5m
labels:
severity: fatal
annotations:
summary: "docker daemon in {{$labels.ip}} is {{$labels.error}}"
16 changes: 16 additions & 0 deletions src/prometheus/deploy/alerting/node.rules
@@ -17,47 +17,63 @@

# Rule Syntax Reference: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

# select alert severity from `info`, `warn`, `error` or `fatal`

groups:
- name: node-rules
rules:
- alert: NodeFilesystemUsage
expr: node_filesystem_avail_bytes{mountpoint=~"/host-root.*", device=~"/dev.*"} / node_filesystem_size_bytes * 100 <= 20
for: 5m
labels:
severity: warn
annotations:
summary: "Free space in {{$labels.device}} from {{$labels.instance}} is less than 20% (current value is: {{ $value }})"

- alert: NodeMemoryUsage
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 95
for: 5m
labels:
severity: warn
annotations:
summary: "Memory usage in {{$labels.instance}} is above 95% (current value is: {{ $value }})"

- alert: NodeCPUUsage
expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 98
for: 5m
labels:
severity: warn
annotations:
summary: "CPU usage in {{$labels.instance}} is above 98% (current value is: {{ $value }})"

- alert: NodeDiskPressure
expr: pai_node_count{disk_pressure="true"} > 0
for: 10m
labels:
severity: error
annotations:
summary: "{{$labels.name}} is under disk pressure"

- alert: NodeOutOfDisk
expr: pai_node_count{out_of_disk="true"} > 0
for: 10m
labels:
severity: error
annotations:
summary: "{{$labels.name}} is out of disk"

- alert: NodeNotReady
expr: pai_node_count{ready!="true"} > 0
for: 10m
labels:
severity: error
annotations:
summary: "{{$labels.name}} is not ready"

- alert: AzureAgentConsumeTooMuchMem
expr: process_mem_usage_byte{cmd=~".*om[is]agent.*"} > 1073741824 # 1G
for: 5m
labels:
severity: warn
annotations:
summary: "{{$labels.cmd}} with pid {{$labels.pid}} in {{$labels.instance}} consume more than 1G of memory"
8 changes: 8 additions & 0 deletions src/prometheus/deploy/alerting/pai-services.rules
@@ -17,12 +17,17 @@

# Rule Syntax Reference: https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/

# select alert severity from `info`, `warn`, `error` or `fatal`

groups:
- name: pai-services
rules:
- alert: PaiServicePodNotRunning
expr: pai_pod_count{phase!="running"} > 0
for: 10m
labels:
type: pai_service
severity: error
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not running detected"

@@ -31,6 +36,7 @@ groups:
for: 10m
labels:
type: pai_service
severity: error
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected"

@@ -39,6 +45,7 @@
for: 5m
labels:
type: pai_service
severity: error
annotations:
summary: "{{$labels.pai_service_name}} in {{$labels.instance}} not up detected"

@@ -47,5 +54,6 @@
for: 5m
labels:
type: pai_service
severity: warn
annotations:
summary: "{{$labels.name}} in {{$labels.instance}} hangs detected"
2 changes: 1 addition & 1 deletion src/webportal/src/app/layout/components/alerts.jsx
@@ -140,7 +140,7 @@ export const NotificationButton = () => {
<div className={classNames.itemCell} data-is-focusable={true}>
{'Issue time: ' + new Date(item.startsAt).toLocaleString()}
<br />
{'Summary: ' + item.annotations.summary}
{item.labels.severity + ':' + item.annotations.summary}
</div>
);
}}