diff --git a/src/alert-manager/deploy/alert-manager-configmap.yaml.template b/src/alert-manager/deploy/alert-manager-configmap.yaml.template index c47256cb1f2..b9cd6c1469f 100644 --- a/src/alert-manager/deploy/alert-manager-configmap.yaml.template +++ b/src/alert-manager/deploy/alert-manager-configmap.yaml.template @@ -117,5 +117,5 @@ data: alertname: 'NodeNotReady' target_match_re: alertname: ^PaiServicePodNotRunning|PaiServicePodNotReady$ - # Apply inhibition if `host_ip` is the same. - equal: ['host_ip'] + # Apply inhibition if `node_name` is the same. + equal: ['node_name'] diff --git a/src/prometheus/deploy/alerting/node.rules b/src/prometheus/deploy/alerting/node.rules index 0ff58d99fde..84245d988df 100644 --- a/src/prometheus/deploy/alerting/node.rules +++ b/src/prometheus/deploy/alerting/node.rules @@ -42,19 +42,19 @@ groups: expr: pai_node_count{disk_pressure="true"} > 0 for: 10m annotations: - summary: "{{$labels.host_ip}} is under disk pressure" + summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is under disk pressure" - alert: NodeOutOfDisk expr: pai_node_count{out_of_disk="true"} > 0 for: 10m annotations: - summary: "{{$labels.host_ip}} is out of disk" + summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is out of disk" - alert: NodeNotReady expr: pai_node_count{ready!="true"} > 0 for: 10m annotations: - summary: "{{$labels.host_ip}} is not ready" + summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is not ready" - alert: AzureAgentConsumeTooMuchMem expr: process_mem_usage_byte{cmd=~".*om[is]agent.*"} > 1073741824 # 1G diff --git a/src/prometheus/deploy/alerting/pai-services.rules b/src/prometheus/deploy/alerting/pai-services.rules index b4553a9c485..0ae8a35e899 100644 --- a/src/prometheus/deploy/alerting/pai-services.rules +++ b/src/prometheus/deploy/alerting/pai-services.rules @@ -24,7 +24,7 @@ groups: expr: pai_pod_count{phase!="running"} > 0 for: 10m annotations: - summary: "{{$labels.name}} in {{$labels.host_ip}} not running detected" + summary: "{{$labels.name}} in {{$labels.node_name}} ({{$labels.host_ip}}) not running detected" - alert: PaiServicePodNotReady expr: pai_pod_count{phase="running", ready="false"} > 0 @@ -32,15 +32,15 @@ groups: labels: type: pai_service annotations: - summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected" + summary: "{{$labels.name}} in {{$labels.node_name}} ({{$labels.host_ip}}) not ready detected" - - alert: PaiServiceWatchDogNotUp + - alert: WatchdogNotUp expr: up{pai_service_name="watchdog"} != 1 for: 5m labels: type: pai_service annotations: - summary: "{{$labels.pai_service_name}} in {{$labels.instance}} not up detected" + summary: "OpenPAI service watchdog in {{$labels.instance}} not up detected" - alert: JobExporterHangs expr: rate(collector_iteration_count_total[10m]) == 0 diff --git a/src/watchdog/src/pkg/watchdog/prom_metric_collector.go b/src/watchdog/src/pkg/watchdog/prom_metric_collector.go index 4dca7b64cdc..5526535b90d 100644 --- a/src/watchdog/src/pkg/watchdog/prom_metric_collector.go +++ b/src/watchdog/src/pkg/watchdog/prom_metric_collector.go @@ -46,28 +46,28 @@ var ( "pai_pod_count", "count of pai pod", []string{ - "service_name", "name", "namespace", "phase", "host_ip", "initialized", "pod_scheduled", "ready", + "service_name", "name", "namespace", "phase", "host_ip", "node_name", "initialized", "pod_scheduled", "ready", }, ), "jobPodCount": createMetric( "pai_job_pod_count", "count of pai job pod", []string{ - "job_name", "name", "phase", "host_ip", "initialized", "pod_bound", "pod_scheduled", "ready", + "job_name", "name", "phase", "host_ip", "node_name", "initialized", "pod_bound", "pod_scheduled", "ready", }, ), "paiContainerCount": createMetric( "pai_container_count", "count of container pod", []string{ - "service_name", "pod_name", "name", "namespace", "state", "host_ip", "ready", + "service_name", "pod_name", "name", "namespace", "state", "host_ip", "node_name", "ready", }, ), "paiNodeCount": createMetric( "pai_node_count", "count of pai node", []string{ - "host_ip", "disk_pressure", "memory_pressure", "ready", "unschedulable", + "host_ip", "node_name", "disk_pressure", "memory_pressure", "ready", "unschedulable", }, ), } @@ -273,6 +273,7 @@ func (p *PromMetricCollector) getPaiNodeMetrics(nodeMetric nodeMetric) []prometh prometheus.GaugeValue, 1, nodeMetric.ip, + nodeMetric.name, nodeMetric.diskPressure, nodeMetric.memoryPressure, nodeMetric.ready, @@ -342,6 +343,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me podMetric.namespace, podMetric.phase, podMetric.hostIP, + podMetric.nodeName, podMetric.initialized, podMetric.scheduled, podMetric.ready, @@ -360,6 +362,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me podMetric.namespace, c.status, podMetric.hostIP, + podMetric.nodeName, strconv.FormatBool(c.ready), )) } @@ -372,6 +375,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me podMetric.name, podMetric.phase, podMetric.hostIP, + podMetric.nodeName, podMetric.initialized, strconv.FormatBool(podMetric.bound), podMetric.scheduled, diff --git a/src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go b/src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go index 1216e09850f..e166d366679 100644 --- a/src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go +++ b/src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go @@ -75,21 +75,21 @@ func TestGeneratePodsMetrics(t *testing.T) { expectLables := [][]map[string]string{ { { - "host_ip": "10.151.41.8", "initialized": "true", "name": "log-manager-ds-nxm2k", "namespace": "default", + "host_ip": "10.151.41.8", "node_name": "test_node_0", "initialized": "true", "name": "log-manager-ds-nxm2k", "namespace": "default", "phase": "running", "pod_scheduled": "true", "ready": "true", "service_name": "log-manager", }, { - "host_ip": "10.151.41.8", "name": "log-manager-logrotate", "namespace": "default", + "host_ip": "10.151.41.8", "node_name": "test_node_0", "name": "log-manager-logrotate", "namespace": "default", "pod_name": "log-manager-ds-nxm2k", "state": "running", "ready": "true", "service_name": "log-manager", }, { - "host_ip": "10.151.41.8", "name": "log-manager-nginx", "namespace": "default", + "host_ip": "10.151.41.8", "node_name": "test_node_0", "name": "log-manager-nginx", "namespace": "default", "pod_name": "log-manager-ds-nxm2k", "state": "running", "ready": "true", "service_name": "log-manager", }, }, {}, { { - "host_ip": "10.1.3.29", "initialized": "true", "job_name": "it_it_batch052_infer_80-159_bs2_V1", + "host_ip": "10.1.3.29", "node_name": "test_node_1", "initialized": "true", "job_name": "it_it_batch052_infer_80-159_bs2_V1", "name": "f1up4zk9ehfpjx2zc9gq8rv860uk4qv9dtk6awjz70r2uc9n75fp4wtjbxb32-taskrole-28", "namespace": "default", "phase": "pending", "pod_bound": "true", "pod_scheduled": "true", "ready": "false", }, @@ -118,7 +118,7 @@ func TestGenerateNodesMetrics(t *testing.T) { metrics := mo.pc.getPaiNodeMetrics(nodeMetrics[0]) expectLables := []map[string]string{ { - "host_ip": "10.151.41.8", "disk_pressure": "false", "memory_pressure": "false", + "host_ip": "10.151.41.8", "node_name": "test_node_0", "disk_pressure": "false", "memory_pressure": "false", "ready": "true", "unschedulable": "false", }, } @@ -176,7 +176,7 @@ func TestParseNoConditionPods(t *testing.T) { promMetrics := mo.pc.getPodMetrics(podMetrics[0]) expectLables := []map[string]string{ { - "host_ip": "unscheduled", "initialized": "unknown", "name": "yarn-frameworklauncher-ds-2684q", "namespace": "default", + "host_ip": "unscheduled", "node_name": "test_node_0", "initialized": "unknown", "name": "yarn-frameworklauncher-ds-2684q", "namespace": "default", "phase": "failed", "pod_scheduled": "unknown", "ready": "unknown", "service_name": "frameworklauncher", }, } @@ -197,7 +197,7 @@ func TestParseDLWSUnschedulableNodes(t *testing.T) { promMetrics := mo.pc.getPaiNodeMetrics(nodeMetrics[0]) expectLables := []map[string]string{ { - "host_ip": "192.168.255.1", "disk_pressure": "false", "memory_pressure": "false", + "host_ip": "192.168.255.1", "node_name": "test_node_0", "disk_pressure": "false", "memory_pressure": "false", "ready": "true", "unschedulable": "true", }, } diff --git a/src/watchdog/src/testdata/framework_list.json b/src/watchdog/src/testdata/framework_list.json index d162462817b..ce3f0156019 100644 --- a/src/watchdog/src/testdata/framework_list.json +++ b/src/watchdog/src/testdata/framework_list.json @@ -454,7 +454,7 @@ "podHostIP": "10.151.41.8", "podIP": "10.151.41.8", "podName": "059cf3d85cb5f6280e9606d47551554d-taskrole-0", - "podNodeName": "10.151.41.8", + "podNodeName": "test_node_0", "podUID": "72c6d156-3b64-11ea-9b7f-000d3ab25bb6", "runTime": null, "startTime": "2020-01-20T09:08:39Z" diff --git a/src/watchdog/src/testdata/no_condition_pod.json b/src/watchdog/src/testdata/no_condition_pod.json index 026ec90cff4..7295732ff80 100644 --- a/src/watchdog/src/testdata/no_condition_pod.json +++ b/src/watchdog/src/testdata/no_condition_pod.json @@ -139,7 +139,7 @@ "nodeSelector": { "launcher": "true" }, - "nodeName": "10.151.40.4", + "nodeName": "test_node_0", "hostNetwork": true, "hostPID": true, "securityContext": { diff --git a/src/watchdog/src/testdata/node_list.json b/src/watchdog/src/testdata/node_list.json index cf2e93d0c5b..c1e490f6faf 100644 --- a/src/watchdog/src/testdata/node_list.json +++ b/src/watchdog/src/testdata/node_list.json @@ -23,7 +23,7 @@ "kubernetes.io/os": "linux", "pai-worker": "true" }, - "name": "10.151.41.8", + "name": "test_node_0", "resourceVersion": "28455741", "selfLink": "/api/v1/nodes/10.151.41.8", "uid": "62340679-d056-11e9-97f6-000d3ab25bb6" diff --git a/src/watchdog/src/testdata/pod_list.json b/src/watchdog/src/testdata/pod_list.json index 5c13dcebbf7..1cfc055177f 100644 --- a/src/watchdog/src/testdata/pod_list.json +++ b/src/watchdog/src/testdata/pod_list.json @@ -117,7 +117,7 @@ "restartPolicy": "Always", "terminationGracePeriodSeconds": 30, "dnsPolicy": "ClusterFirst", - "nodeName": "10.151.41.8", + "nodeName": "test_node_0", "securityContext": {}, "imagePullSecrets": [ { @@ -685,7 +685,7 @@ "dnsPolicy": "ClusterFirst", "serviceAccountName": "frameworkbarrier", "serviceAccount": "frameworkbarrier", - "nodeName": "10.1.3.29", + "nodeName": "test_node_1", "hostNetwork": true, "securityContext": {}, "imagePullSecrets": [