From 8d4ab7035fa85d5069b0839f39edbb99d85ff7c7 Mon Sep 17 00:00:00 2001 From: Guoxin Date: Tue, 10 Nov 2020 09:32:27 +0800 Subject: [PATCH] Remove redundant alerts (#5052) --- docs/manual/cluster-admin/troubleshooting.md | 2 +- .../deploy/alert-manager-configmap.yaml.template | 8 ++++++++ src/prometheus/deploy/alerting/node.rules | 6 +++--- src/prometheus/deploy/alerting/pai-services.rules | 10 +++++----- src/prometheus/deploy/service.yaml | 1 - .../deploy/{start.sh.template => start.sh} | 0 .../src/pkg/watchdog/prom_metric_collector.go | 12 ++++++++---- .../src/pkg/watchdog/prom_metric_collector_test.go | 14 +++++++------- src/watchdog/src/testdata/framework_list.json | 2 +- src/watchdog/src/testdata/no_condition_pod.json | 2 +- src/watchdog/src/testdata/node_list.json | 2 +- src/watchdog/src/testdata/pod_list.json | 4 ++-- 12 files changed, 37 insertions(+), 26 deletions(-) rename src/prometheus/deploy/{start.sh.template => start.sh} (100%) diff --git a/docs/manual/cluster-admin/troubleshooting.md b/docs/manual/cluster-admin/troubleshooting.md index c3f3fc3ebe..072eb58ef0 100644 --- a/docs/manual/cluster-admin/troubleshooting.md +++ b/docs/manual/cluster-admin/troubleshooting.md @@ -23,7 +23,7 @@ Solutions: This is a kind of alert from alert manager, and is reported by watchdog service. Watchdog gets such metrics from Kubernetes API. Example metrics is like: ``` -pai_node_count{disk_pressure="false",instance="10.0.0.1:9101",job="pai_serivce_exporter",memory_pressure="false",name="10.0.0.2",out_of_disk="false",pai_service_name="watchdog",ready="true",scraped_from="watchdog-5ddd945975-kwhpr"} +pai_node_count{disk_pressure="false",instance="10.0.0.1:9101",job="pai_serivce_exporter",memory_pressure="false",host_ip="10.0.0.2",out_of_disk="false",pai_service_name="watchdog",ready="true",scraped_from="watchdog-5ddd945975-kwhpr"} ``` The name label indicate what node this metric represents. diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template index 38927906f1..434bc084c9 100644 --- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template +++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template @@ -111,3 +111,11 @@ data: {% endif %} {% endfor %} + + inhibit_rules: + - source_match: + alertname: 'NodeNotReady' + target_match_re: + alertname: ^PaiServicePodNotRunning|PaiServicePodNotReady$ + # Apply inhibition if `node_name` is the same. + equal: ['node_name'] diff --git a/src/prometheus/deploy/alerting/node.rules b/src/prometheus/deploy/alerting/node.rules index fac023efb0..a652f6605a 100644 --- a/src/prometheus/deploy/alerting/node.rules +++ b/src/prometheus/deploy/alerting/node.rules @@ -52,7 +52,7 @@ groups: labels: severity: error annotations: - summary: "{{$labels.name}} is under disk pressure" + summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is under disk pressure" - alert: NodeOutOfDisk expr: pai_node_count{out_of_disk="true"} > 0 @@ -60,7 +60,7 @@ groups: labels: severity: error annotations: - summary: "{{$labels.name}} is out of disk" + summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is out of disk" - alert: NodeNotReady expr: pai_node_count{ready!="true"} > 0 @@ -68,7 +68,7 @@ groups: labels: severity: error annotations: - summary: "{{$labels.name}} is not ready" + summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is not ready" - alert: AzureAgentConsumeTooMuchMem expr: process_mem_usage_byte{cmd=~".*om[is]agent.*"} > 1073741824 # 1G diff --git a/src/prometheus/deploy/alerting/pai-services.rules b/src/prometheus/deploy/alerting/pai-services.rules index 497e0dc430..50746bcd96 100644 --- a/src/prometheus/deploy/alerting/pai-services.rules +++ b/src/prometheus/deploy/alerting/pai-services.rules @@ -29,7 +29,7 @@ groups: type: pai_service severity: error annotations: - summary: "{{$labels.name}} in {{$labels.host_ip}} not running detected" + summary: "{{$labels.name}} in {{$labels.node_name}} ({{$labels.host_ip}}) not running detected" - alert: PaiServicePodNotReady expr: pai_pod_count{phase="running", ready="false"} > 0 @@ -38,16 +38,16 @@ groups: type: pai_service severity: error annotations: - summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected" + summary: "{{$labels.name}} in {{$labels.node_name}} ({{$labels.host_ip}}) not ready detected" - - alert: PaiServiceNotUp - expr: up != 1 + - alert: WatchdogNotUp + expr: up{pai_service_name="watchdog"} != 1 for: 5m labels: type: pai_service severity: error annotations: - summary: "{{$labels.pai_service_name}} in {{$labels.instance}} not up detected" + summary: "OpenPAI service watchdog in {{$labels.instance}} not up detected" - alert: JobExporterHangs expr: rate(collector_iteration_count_total[10m]) == 0 diff --git a/src/prometheus/deploy/service.yaml b/src/prometheus/deploy/service.yaml index e887bdc018..ef1641ce3c 100644 --- a/src/prometheus/deploy/service.yaml +++ b/src/prometheus/deploy/service.yaml @@ -25,7 +25,6 @@ prerequisite: template-list: - prometheus-configmap.yaml - prometheus-deployment.yaml - - start.sh - delete.yaml - alerting/customized.rules diff --git a/src/prometheus/deploy/start.sh.template b/src/prometheus/deploy/start.sh similarity index 100% rename from src/prometheus/deploy/start.sh.template rename to src/prometheus/deploy/start.sh diff --git a/src/watchdog/src/pkg/watchdog/prom_metric_collector.go b/src/watchdog/src/pkg/watchdog/prom_metric_collector.go index a5729fa086..5526535b90 100644 --- a/src/watchdog/src/pkg/watchdog/prom_metric_collector.go +++ b/src/watchdog/src/pkg/watchdog/prom_metric_collector.go @@ -46,28 +46,28 @@ var ( "pai_pod_count", "count of pai pod", []string{ - "service_name", "name", "namespace", "phase", "host_ip", "initialized", "pod_scheduled", "ready", + "service_name", "name", "namespace", "phase", "host_ip", "node_name", "initialized", "pod_scheduled", "ready", }, ), "jobPodCount": createMetric( "pai_job_pod_count", "count of pai job pod", []string{ - "job_name", "name", "phase", "host_ip", "initialized", "pod_bound", "pod_scheduled", "ready", + "job_name", "name", "phase", "host_ip", "node_name", "initialized", "pod_bound", "pod_scheduled", "ready", }, ), "paiContainerCount": createMetric( "pai_container_count", "count of container pod", []string{ - "service_name", "pod_name", "name", "namespace", "state", "host_ip", "ready", + "service_name", "pod_name", "name", "namespace", "state", "host_ip", "node_name", "ready", }, ), "paiNodeCount": createMetric( "pai_node_count", "count of pai node", []string{ - "name", "disk_pressure", "memory_pressure", "ready", "unschedulable", + "host_ip", "node_name", "disk_pressure", "memory_pressure", "ready", "unschedulable", }, ), } @@ -273,6 +273,7 @@ func (p *PromMetricCollector) getPaiNodeMetrics(nodeMetric nodeMetric) []prometh prometheus.GaugeValue, 1, nodeMetric.ip, + nodeMetric.name, nodeMetric.diskPressure, nodeMetric.memoryPressure, nodeMetric.ready, @@ -342,6 +343,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me podMetric.namespace, podMetric.phase, podMetric.hostIP, + podMetric.nodeName, podMetric.initialized, podMetric.scheduled, podMetric.ready, @@ -360,6 +362,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me podMetric.namespace, c.status, podMetric.hostIP, + podMetric.nodeName, strconv.FormatBool(c.ready), )) } @@ -372,6 +375,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me podMetric.name, podMetric.phase, podMetric.hostIP, + podMetric.nodeName, podMetric.initialized, strconv.FormatBool(podMetric.bound), podMetric.scheduled, diff --git a/src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go b/src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go index 9635bf989d..8eb3e56650 100644 --- a/src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go +++ b/src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go @@ -75,21 +75,21 @@ func TestGeneratePodsMetrics(t *testing.T) { expectLables := [][]map[string]string{ { { - "host_ip": "10.151.41.8", "initialized": "true", "name": "log-manager-ds-nxm2k", "namespace": "default", + "host_ip": "10.151.41.8", "node_name": "test_node_0", "initialized": "true", "name": "log-manager-ds-nxm2k", "namespace": "default", "phase": "running", "pod_scheduled": "true", "ready": "true", "service_name": "log-manager", }, { - "host_ip": "10.151.41.8", "name": "log-manager-logrotate", "namespace": "default", + "host_ip": "10.151.41.8", "node_name": "test_node_0", "name": "log-manager-logrotate", "namespace": "default", "pod_name": "log-manager-ds-nxm2k", "state": "running", "ready": "true", "service_name": "log-manager", }, { - "host_ip": "10.151.41.8", "name": "log-manager-nginx", "namespace": "default", + "host_ip": "10.151.41.8", "node_name": "test_node_0", "name": "log-manager-nginx", "namespace": "default", "pod_name": "log-manager-ds-nxm2k", "state": "running", "ready": "true", "service_name": "log-manager", }, }, {}, { { - "host_ip": "10.1.3.29", "initialized": "true", "job_name": "it_it_batch052_infer_80-159_bs2_V1", + "host_ip": "10.1.3.29", "node_name": "test_node_1", "initialized": "true", "job_name": "it_it_batch052_infer_80-159_bs2_V1", "name": "f1up4zk9ehfpjx2zc9gq8rv860uk4qv9dtk6awjz70r2uc9n75fp4wtjbxb32-taskrole-28", "namespace": "default", "phase": "pending", "pod_bound": "true", "pod_scheduled": "true", "ready": "false", }, @@ -118,7 +118,7 @@ func TestGenerateNodesMetrics(t *testing.T) { metrics := mo.pc.getPaiNodeMetrics(nodeMetrics[0]) expectLables := []map[string]string{ { - "name": "10.151.41.8", "disk_pressure": "false", "memory_pressure": "false", + "host_ip": "10.151.41.8", "node_name": "test_node_0", "disk_pressure": "false", "memory_pressure": "false", "ready": "true", "unschedulable": "false", }, } @@ -176,7 +176,7 @@ func TestParseNoConditionPods(t *testing.T) { promMetrics := mo.pc.getPodMetrics(podMetrics[0]) expectLables := []map[string]string{ { - "host_ip": "unscheduled", "initialized": "unknown", "name": "yarn-frameworklauncher-ds-2684q", "namespace": "default", + "host_ip": "unscheduled", "node_name": "test_node_0", "initialized": "unknown", "name": "yarn-frameworklauncher-ds-2684q", "namespace": "default", "phase": "failed", "pod_scheduled": "unknown", "ready": "unknown", "service_name": "frameworklauncher", }, } @@ -197,7 +197,7 @@ func TestParseDLWSUnschedulableNodes(t *testing.T) { promMetrics := mo.pc.getPaiNodeMetrics(nodeMetrics[0]) expectLables := []map[string]string{ { - "name": "192.168.255.1", "disk_pressure": "false", "memory_pressure": "false", + "host_ip": "192.168.255.1", "node_name": "dltsp40-infra01", "disk_pressure": "false", "memory_pressure": "false", "ready": "true", "unschedulable": "true", }, } diff --git a/src/watchdog/src/testdata/framework_list.json b/src/watchdog/src/testdata/framework_list.json index d162462817..ce3f015601 100644 --- a/src/watchdog/src/testdata/framework_list.json +++ b/src/watchdog/src/testdata/framework_list.json @@ -454,7 +454,7 @@ "podHostIP": "10.151.41.8", "podIP": "10.151.41.8", "podName": "059cf3d85cb5f6280e9606d47551554d-taskrole-0", - "podNodeName": "10.151.41.8", + "podNodeName": "test_node_0", "podUID": "72c6d156-3b64-11ea-9b7f-000d3ab25bb6", "runTime": null, "startTime": "2020-01-20T09:08:39Z" diff --git a/src/watchdog/src/testdata/no_condition_pod.json b/src/watchdog/src/testdata/no_condition_pod.json index 026ec90cff..7295732ff8 100644 --- a/src/watchdog/src/testdata/no_condition_pod.json +++ b/src/watchdog/src/testdata/no_condition_pod.json @@ -139,7 +139,7 @@ "nodeSelector": { "launcher": "true" }, - "nodeName": "10.151.40.4", + "nodeName": "test_node_0", "hostNetwork": true, "hostPID": true, "securityContext": { diff --git a/src/watchdog/src/testdata/node_list.json b/src/watchdog/src/testdata/node_list.json index cf2e93d0c5..c1e490f6fa 100644 --- a/src/watchdog/src/testdata/node_list.json +++ b/src/watchdog/src/testdata/node_list.json @@ -23,7 +23,7 @@ "kubernetes.io/os": "linux", "pai-worker": "true" }, - "name": "10.151.41.8", + "name": "test_node_0", "resourceVersion": "28455741", "selfLink": "/api/v1/nodes/10.151.41.8", "uid": "62340679-d056-11e9-97f6-000d3ab25bb6" diff --git a/src/watchdog/src/testdata/pod_list.json b/src/watchdog/src/testdata/pod_list.json index 5c13dcebbf..1cfc055177 100644 --- a/src/watchdog/src/testdata/pod_list.json +++ b/src/watchdog/src/testdata/pod_list.json @@ -117,7 +117,7 @@ "restartPolicy": "Always", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", - "nodeName": "10.151.41.8", + "nodeName": "test_node_0", "securityContext": {}, "imagePullSecrets": [ { @@ -685,7 +685,7 @@ "dnsPolicy": "ClusterFirst", "serviceAccountName": "frameworkbarrier", "serviceAccount": "frameworkbarrier", - "nodeName": "10.1.3.29", + "nodeName": "test_node_1", "hostNetwork": true, "securityContext": {}, "imagePullSecrets": [