forked from onyekaugochukwu/sre-task-repo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprometheus.yml
52 lines (51 loc) · 2.55 KB
/
prometheus.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
# Files rendered for the Prometheus server. This section defines the alerting
# rules, organised into rule groups. Every rule below uses `for: 2m`, i.e. its
# expression must hold continuously for two minutes before the alert fires.
serverFiles:
  # Insert the correct values in all parameters marked with "#TODO"
  alerting_rules.yml:
    groups:
      - name: NodeDown
        rules:
          # Alert for any node scrape target that is unreachable for >2 minutes
          # (`up == 0` means the last scrape of the target failed).
          - alert: InstanceDown
            expr: up{job="kubernetes-nodes"} == 0
            for: 2m
            labels:
              severity: page
            annotations:
              host: "{{$labels.kubernetes_io_hostname}}"
              summary: "Instance down"
              description: "Node {{$labels.kubernetes_io_hostname}} has been down for more than 2 minutes."

      # NOTE(review): despite its name, this group also carries the
      # persistent-volume and pod alerts below.
      - name: low_memory_alert
        rules:
          # Percentage of memory still available on the node.
          # NOTE(review): this fires whenever less than 85% of memory is
          # available (i.e. more than 15% is in use) - confirm the threshold
          # is intended.
          - alert: LowMemory
            expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 < 85
            for: 2m
            labels:
              severity: warning
            annotations:
              host: "{{$labels.kubernetes_node}}"
              summary: "{{$labels.kubernetes_node}} Host is low on memory. Only {{$value}}% left"
              description: "{{$labels.kubernetes_node}} node is low on memory. Only {{$value}}% left"

          # A persistent volume is reported in the Failed or Pending phase.
          - alert: KubePersistentVolumeErrors
            expr: kube_persistentvolume_status_phase{job="kubernetes-service-endpoints", phase=~"Failed|Pending"} > 0
            for: 2m
            labels:
              severity: critical
            annotations:
              description: "The persistent volume {{$labels.persistentvolume}} has status {{$labels.phase}}."
              summary: "PersistentVolume is having issues with provisioning."

          # Per-second restart rate over the last 5 minutes, scaled by 60 * 5
          # to give restarts per 5 minutes; any value above zero matches.
          - alert: KubePodCrashLooping
            expr: rate(kube_pod_container_status_restarts_total{job="kubernetes-service-endpoints",namespace=~".*"}[5m]) * 60 * 5 > 0
            for: 2m
            labels:
              severity: warning
            annotations:
              # Single-quoted so the double quotes of the printf format string
              # need no escaping; the spaces around the printf arguments are
              # required by the template syntax.
              description: 'Pod {{$labels.namespace}}/{{$labels.pod}}({{$labels.container}}) is restarting {{ printf "%.2f" $value }} times/5minutes'
              summary: "Pod is crash looping."

          # Pods in the Pending or Unknown phase, joined with kube_pod_owner so
          # that pods owned by a Job are excluded.
          - alert: KubePodNotReady
            expr: sum by(namespace, pod) (max by(namespace, pod) (kube_pod_status_phase{job="kubernetes-service-endpoints",namespace=~".*",phase=~"Pending|Unknown"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))) > 0
            for: 2m
            labels:
              severity: warning
            annotations:
              description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 2 minutes."
              summary: "Pod has been in a non-ready state for more than 2 minutes."