Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
Remove redundant alerts (#5052)
Browse files Browse the repository at this point in the history
  • Loading branch information
suiguoxin authored Nov 10, 2020
1 parent 6cb7f8d commit 8d4ab70
Show file tree
Hide file tree
Showing 12 changed files with 37 additions and 26 deletions.
2 changes: 1 addition & 1 deletion docs/manual/cluster-admin/troubleshooting.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ Solutions:
This is an alert from Alertmanager, reported by the watchdog service. Watchdog gets these metrics from the Kubernetes API. An example metric looks like:

```
pai_node_count{disk_pressure="false",instance="10.0.0.1:9101",job="pai_serivce_exporter",memory_pressure="false",name="10.0.0.2",out_of_disk="false",pai_service_name="watchdog",ready="true",scraped_from="watchdog-5ddd945975-kwhpr"}
pai_node_count{disk_pressure="false",instance="10.0.0.1:9101",job="pai_serivce_exporter",memory_pressure="false",host_ip="10.0.0.2",out_of_disk="false",pai_service_name="watchdog",ready="true",scraped_from="watchdog-5ddd945975-kwhpr"}
```

The `host_ip` label (formerly `name`) indicates which node this metric represents.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,11 @@ data:
{% endif %}

{% endfor %}

# Mute PaiServicePodNotRunning / PaiServicePodNotReady alerts while a
# NodeNotReady alert is firing with the same `node_name` label value.
inhibit_rules:
- source_match:
alertname: 'NodeNotReady'
target_match_re:
# NOTE(review): as written, `^` binds only to the first alternative and `$`
# only to the second. Alertmanager is documented to fully anchor matcher
# regexes, so this presumably still matches exactly these two alert names;
# confirm, or group it as ^(PaiServicePodNotRunning|PaiServicePodNotReady)$.
alertname: ^PaiServicePodNotRunning|PaiServicePodNotReady$
# Apply inhibition if `node_name` is the same.
equal: ['node_name']
6 changes: 3 additions & 3 deletions src/prometheus/deploy/alerting/node.rules
Original file line number Diff line number Diff line change
Expand Up @@ -52,23 +52,23 @@ groups:
labels:
severity: error
annotations:
summary: "{{$labels.name}} is under disk pressure"
summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is under disk pressure"

- alert: NodeOutOfDisk
expr: pai_node_count{out_of_disk="true"} > 0
for: 10m
labels:
severity: error
annotations:
summary: "{{$labels.name}} is out of disk"
summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is out of disk"

- alert: NodeNotReady
expr: pai_node_count{ready!="true"} > 0
for: 10m
labels:
severity: error
annotations:
summary: "{{$labels.name}} is not ready"
summary: "{{$labels.node_name}} ({{$labels.host_ip}}) is not ready"

- alert: AzureAgentConsumeTooMuchMem
expr: process_mem_usage_byte{cmd=~".*om[is]agent.*"} > 1073741824 # 1G
Expand Down
10 changes: 5 additions & 5 deletions src/prometheus/deploy/alerting/pai-services.rules
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ groups:
type: pai_service
severity: error
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not running detected"
summary: "{{$labels.name}} in {{$labels.node_name}} ({{$labels.host_ip}}) not running detected"

- alert: PaiServicePodNotReady
expr: pai_pod_count{phase="running", ready="false"} > 0
Expand All @@ -38,16 +38,16 @@ groups:
type: pai_service
severity: error
annotations:
summary: "{{$labels.name}} in {{$labels.host_ip}} not ready detected"
summary: "{{$labels.name}} in {{$labels.node_name}} ({{$labels.host_ip}}) not ready detected"

- alert: PaiServiceNotUp
expr: up != 1
- alert: WatchdogNotUp
expr: up{pai_service_name="watchdog"} != 1
for: 5m
labels:
type: pai_service
severity: error
annotations:
summary: "{{$labels.pai_service_name}} in {{$labels.instance}} not up detected"
summary: "OpenPAI service watchdog in {{$labels.instance}} not up detected"

- alert: JobExporterHangs
expr: rate(collector_iteration_count_total[10m]) == 0
Expand Down
1 change: 0 additions & 1 deletion src/prometheus/deploy/service.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ prerequisite:
template-list:
- prometheus-configmap.yaml
- prometheus-deployment.yaml
- start.sh
- delete.yaml
- alerting/customized.rules

Expand Down
File renamed without changes.
12 changes: 8 additions & 4 deletions src/watchdog/src/pkg/watchdog/prom_metric_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,28 +46,28 @@ var (
"pai_pod_count",
"count of pai pod",
[]string{
"service_name", "name", "namespace", "phase", "host_ip", "initialized", "pod_scheduled", "ready",
"service_name", "name", "namespace", "phase", "host_ip", "node_name", "initialized", "pod_scheduled", "ready",
},
),
"jobPodCount": createMetric(
"pai_job_pod_count",
"count of pai job pod",
[]string{
"job_name", "name", "phase", "host_ip", "initialized", "pod_bound", "pod_scheduled", "ready",
"job_name", "name", "phase", "host_ip", "node_name", "initialized", "pod_bound", "pod_scheduled", "ready",
},
),
"paiContainerCount": createMetric(
"pai_container_count",
"count of container pod",
[]string{
"service_name", "pod_name", "name", "namespace", "state", "host_ip", "ready",
"service_name", "pod_name", "name", "namespace", "state", "host_ip", "node_name", "ready",
},
),
"paiNodeCount": createMetric(
"pai_node_count",
"count of pai node",
[]string{
"name", "disk_pressure", "memory_pressure", "ready", "unschedulable",
"host_ip", "node_name", "disk_pressure", "memory_pressure", "ready", "unschedulable",
},
),
}
Expand Down Expand Up @@ -273,6 +273,7 @@ func (p *PromMetricCollector) getPaiNodeMetrics(nodeMetric nodeMetric) []prometh
prometheus.GaugeValue,
1,
nodeMetric.ip,
nodeMetric.name,
nodeMetric.diskPressure,
nodeMetric.memoryPressure,
nodeMetric.ready,
Expand Down Expand Up @@ -342,6 +343,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me
podMetric.namespace,
podMetric.phase,
podMetric.hostIP,
podMetric.nodeName,
podMetric.initialized,
podMetric.scheduled,
podMetric.ready,
Expand All @@ -360,6 +362,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me
podMetric.namespace,
c.status,
podMetric.hostIP,
podMetric.nodeName,
strconv.FormatBool(c.ready),
))
}
Expand All @@ -372,6 +375,7 @@ func (p *PromMetricCollector) getPodMetrics(podMetric podMetric) []prometheus.Me
podMetric.name,
podMetric.phase,
podMetric.hostIP,
podMetric.nodeName,
podMetric.initialized,
strconv.FormatBool(podMetric.bound),
podMetric.scheduled,
Expand Down
14 changes: 7 additions & 7 deletions src/watchdog/src/pkg/watchdog/prom_metric_collector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,21 +75,21 @@ func TestGeneratePodsMetrics(t *testing.T) {
expectLables := [][]map[string]string{
{
{
"host_ip": "10.151.41.8", "initialized": "true", "name": "log-manager-ds-nxm2k", "namespace": "default",
"host_ip": "10.151.41.8", "node_name": "test_node_0", "initialized": "true", "name": "log-manager-ds-nxm2k", "namespace": "default",
"phase": "running", "pod_scheduled": "true", "ready": "true", "service_name": "log-manager",
}, {
"host_ip": "10.151.41.8", "name": "log-manager-logrotate", "namespace": "default",
"host_ip": "10.151.41.8", "node_name": "test_node_0", "name": "log-manager-logrotate", "namespace": "default",
"pod_name": "log-manager-ds-nxm2k", "state": "running", "ready": "true", "service_name": "log-manager",
},
{
"host_ip": "10.151.41.8", "name": "log-manager-nginx", "namespace": "default",
"host_ip": "10.151.41.8", "node_name": "test_node_0", "name": "log-manager-nginx", "namespace": "default",
"pod_name": "log-manager-ds-nxm2k", "state": "running", "ready": "true", "service_name": "log-manager",
},
},
{},
{
{
"host_ip": "10.1.3.29", "initialized": "true", "job_name": "it_it_batch052_infer_80-159_bs2_V1",
"host_ip": "10.1.3.29", "node_name": "test_node_1", "initialized": "true", "job_name": "it_it_batch052_infer_80-159_bs2_V1",
"name": "f1up4zk9ehfpjx2zc9gq8rv860uk4qv9dtk6awjz70r2uc9n75fp4wtjbxb32-taskrole-28", "namespace": "default",
"phase": "pending", "pod_bound": "true", "pod_scheduled": "true", "ready": "false",
},
Expand Down Expand Up @@ -118,7 +118,7 @@ func TestGenerateNodesMetrics(t *testing.T) {
metrics := mo.pc.getPaiNodeMetrics(nodeMetrics[0])
expectLables := []map[string]string{
{
"name": "10.151.41.8", "disk_pressure": "false", "memory_pressure": "false",
"host_ip": "10.151.41.8", "node_name": "test_node_0", "disk_pressure": "false", "memory_pressure": "false",
"ready": "true", "unschedulable": "false",
},
}
Expand Down Expand Up @@ -176,7 +176,7 @@ func TestParseNoConditionPods(t *testing.T) {
promMetrics := mo.pc.getPodMetrics(podMetrics[0])
expectLables := []map[string]string{
{
"host_ip": "unscheduled", "initialized": "unknown", "name": "yarn-frameworklauncher-ds-2684q", "namespace": "default",
"host_ip": "unscheduled", "node_name": "test_node_0", "initialized": "unknown", "name": "yarn-frameworklauncher-ds-2684q", "namespace": "default",
"phase": "failed", "pod_scheduled": "unknown", "ready": "unknown", "service_name": "frameworklauncher",
},
}
Expand All @@ -197,7 +197,7 @@ func TestParseDLWSUnschedulableNodes(t *testing.T) {
promMetrics := mo.pc.getPaiNodeMetrics(nodeMetrics[0])
expectLables := []map[string]string{
{
"name": "192.168.255.1", "disk_pressure": "false", "memory_pressure": "false",
"host_ip": "192.168.255.1", "node_name": "dltsp40-infra01", "disk_pressure": "false", "memory_pressure": "false",
"ready": "true", "unschedulable": "true",
},
}
Expand Down
2 changes: 1 addition & 1 deletion src/watchdog/src/testdata/framework_list.json
Original file line number Diff line number Diff line change
Expand Up @@ -454,7 +454,7 @@
"podHostIP": "10.151.41.8",
"podIP": "10.151.41.8",
"podName": "059cf3d85cb5f6280e9606d47551554d-taskrole-0",
"podNodeName": "10.151.41.8",
"podNodeName": "test_node_0",
"podUID": "72c6d156-3b64-11ea-9b7f-000d3ab25bb6",
"runTime": null,
"startTime": "2020-01-20T09:08:39Z"
Expand Down
2 changes: 1 addition & 1 deletion src/watchdog/src/testdata/no_condition_pod.json
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@
"nodeSelector": {
"launcher": "true"
},
"nodeName": "10.151.40.4",
"nodeName": "test_node_0",
"hostNetwork": true,
"hostPID": true,
"securityContext": {
Expand Down
2 changes: 1 addition & 1 deletion src/watchdog/src/testdata/node_list.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
"kubernetes.io/os": "linux",
"pai-worker": "true"
},
"name": "10.151.41.8",
"name": "test_node_0",
"resourceVersion": "28455741",
"selfLink": "/api/v1/nodes/10.151.41.8",
"uid": "62340679-d056-11e9-97f6-000d3ab25bb6"
Expand Down
4 changes: 2 additions & 2 deletions src/watchdog/src/testdata/pod_list.json
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@
"restartPolicy": "Always",
"terminationGracePeriodSeconds": 30,
"dnsPolicy": "ClusterFirst",
"nodeName": "10.151.41.8",
"nodeName": "test_node_0",
"securityContext": {},
"imagePullSecrets": [
{
Expand Down Expand Up @@ -685,7 +685,7 @@
"dnsPolicy": "ClusterFirst",
"serviceAccountName": "frameworkbarrier",
"serviceAccount": "frameworkbarrier",
"nodeName": "10.1.3.29",
"nodeName": "test_node_1",
"hostNetwork": true,
"securityContext": {},
"imagePullSecrets": [
Expand Down

0 comments on commit 8d4ab70

Please sign in to comment.