Backport fixes to Alertmanager template and alerts annotations to v1.13.x #100

Merged · 3 commits · Oct 17, 2022
examples/prometheus-rules/add-alert.yml (4 changes: 2 additions & 2 deletions)
@@ -16,7 +16,7 @@ spec:
rules:
- alert: MyAppDown
annotations:
- message: 'MyApp instance {{ $labels.instance }} has disappered from
+ description: 'MyApp instance {{ $labels.instance }} has disappered from
Prometheus target discovery.'
doc: "This alert fires if Prometheus target discovery was not able to
reach myapp-metrics in the last 3 minutes."
@@ -27,7 +27,7 @@ spec:
severity: critical
- alert: MyAppFailureRate
annotations:
- message: 'MyApp failure rate is {{ prints "%.2f" $value }}%.'
+ description: 'MyApp failure rate is {{ prints "%.2f" $value }}%.'
doc: "This alert fires if the failure rate (the rate of 4xx and 5xx
responses) measured on a time window of 2 minutes was higher than 10%
in the last 10 minutes."
katalog/alertmanager-operated/alertmanager.tmpl (5 changes: 3 additions & 2 deletions)
@@ -4,7 +4,8 @@

{{ define "__text" }}{{ range .Alerts }}
*Alert:* {{ .Labels.alertname }} - `{{ .Labels.severity }}`
- *Description:* {{ .Annotations.message }}
+ *Description:* {{ .Annotations.description }}
+ *Runbook*: {{ .Annotations.runbook_url }}
*Graph:* <{{ .GeneratorURL }}|:chart_with_upwards_trend:>
*Details:*
{{ range .Labels.SortedPairs }} • *{{ .Name }}:* `{{ .Value }}`
@@ -20,4 +21,4 @@
{{ define "slack.default.text" }}{{ template "__text" . }}{{ end }}
{{ define "slack.default.footer" }}{{ end }}

{{ define "email.default.subject" }}{{ template "__subject" .}} - {{ template "slack.default.username" . }}{{ end }}
{{ define "email.default.subject" }}{{ template "__subject" .}} - {{ template "slack.default.username" . }}{{ end }}
katalog/configs/kubeadm/rules.yml (28 changes: 14 additions & 14 deletions)
@@ -56,7 +56,7 @@ spec:
rules:
- alert: EtcdInsufficientMembers
annotations:
- message: 'If one more etcd member goes down the cluster will be
+ description: 'If one more etcd member goes down the cluster will be
unavailable.'
doc: "This alert fires if less than half of Etcd cluster members were
online in the last 3 minutes."
@@ -67,7 +67,7 @@ spec:
severity: critical
- alert: EtcdNoLeader
annotations:
- message: 'Etcd member {{ $labels.instance }} has no leader.'
+ description: 'Etcd member {{ $labels.instance }} has no leader.'
doc: "This alert fires if the Etcd cluster had no leader in the last
minute."
expr: |
@@ -77,7 +77,7 @@ spec:
severity: critical
- alert: EtcdHighNumberOfLeaderChanges
annotations:
- message: 'Etcd instance {{ $labels.instance }} has seen {{ $value }}
+ description: 'Etcd instance {{ $labels.instance }} has seen {{ $value }}
leader changes within the last hour.'
doc: "This alert fires if the Etcd cluster changed leader more than 3
times in the last hour."
@@ -87,7 +87,7 @@ spec:
severity: warning
# - alert: EtcdHighNumberOfFailedGRPCRequests
# annotations:
- # message: '{{ $value | printf "%.2f" }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
+ # description: '{{ $value | printf "%.2f" }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
# expr: |
# 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd-metrics"}[5m])) by (grpc_service, grpc_method, instance)
# /
@@ -97,7 +97,7 @@ spec:
# severity: warning
# - alert: EtcdHighNumberOfFailedGRPCRequests
# annotations:
- # message: '{{ $value | printf "%.2f" }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
+ # description: '{{ $value | printf "%.2f" }}% of requests for {{ $labels.grpc_method }} failed on etcd instance {{ $labels.instance }}'
# expr: |
# 100 * (sum(rate(grpc_server_handled_total{grpc_code!="OK",job="etcd-metrics"}[5m])) by (grpc_service, grpc_method, instance)
# /
@@ -107,7 +107,7 @@ spec:
# severity: critical
# - alert: EtcdGRPCRequestsSlow
# annotations:
- # message: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
+ # description: on etcd instance {{ $labels.instance }} gRPC requests to {{ $labels.grpc_method
# }} are slow
# expr: |
# histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job="etcd-metrics",grpc_type="unary"}[5m])) by (grpc_service, grpc_method, le)) > 0.15
@@ -116,15 +116,15 @@ spec:
# severity: critical
# - alert: EtcdMemberCommunicationSlow
# annotations:
- # message: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
+ # description: etcd instance {{ $labels.instance }} member communication with {{ $labels.To }} is slow
# expr: |
# histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) > 0.15
# for: 10m
# labels:
# severity: warning
- alert: EtcdHighNumberOfFailedProposals
annotations:
- message: 'Etcd instance {{ $labels.instance }} has seen {{ $value }}
+ description: 'Etcd instance {{ $labels.instance }} has seen {{ $value }}
proposal failures within the last hour.'
doc: "This alert fires if there were more than 5 proposal failure in the
last hour."
@@ -134,7 +134,7 @@ spec:
severity: warning
- alert: EtcdHighFsyncDurations
annotations:
- message: 'Etcd instance {{ $labels.instance }} WAL fsync latency too
+ description: 'Etcd instance {{ $labels.instance }} WAL fsync latency too
high, current latency is {{ $value | printf "%.2f" }}.'
doc: "This alert fires if the WAL fsync 99th percentile latency was
higher than 0.5s in the last 10 minutes."
@@ -145,7 +145,7 @@ spec:
severity: warning
- alert: EtcdHighCommitDurations
annotations:
- message: 'Etcd instance {{ $labels.instance }} commit latency too high,
+ description: 'Etcd instance {{ $labels.instance }} commit latency too high,
current latency is {{ $value | printf "%.2f" }}.'
doc: "This alert fires if the backend commit 99th percentile latency was
higher than 0.25s in the last 10 minutes."
@@ -158,7 +158,7 @@ spec:
rules:
- alert: CoreDNSPanic
annotations:
- messages: 'CoreDNS instance {{ $labels.instance }} panic count
+ description: 'CoreDNS instance {{ $labels.instance }} panic count
increased by {{ $value }}.'
doc: "This alert fires if CoreDNS total panic count increased by at
least 1 in the last 10 minutes."
@@ -168,7 +168,7 @@ spec:
severity: critical
- alert: CoreDNSRequestsLatency
annotations:
- message: 'CoreDNS instance {{ $labels.instance }} requests latency too
+ description: 'CoreDNS instance {{ $labels.instance }} requests latency too
high, current latency is {{ $value | printf "%.2f" }}.'
doc: "This alert fires if CoreDNS 99th percentile requests latency was
higher than 100ms in the last 10 minutes."
@@ -179,7 +179,7 @@ spec:
severity: warning
- alert: CoreDNSHealthRequestsLatency
annotations:
- message: 'CoreDNS instance {{ $labels.instance }} health requests
+ description: 'CoreDNS instance {{ $labels.instance }} health requests
latency too high, current latency is {{ $value | printf "%.2f" }}.'
doc: "This alert fires if CoreDNS 99th percentile health requests
latency was higher than 10ms in the last 10 minutes."
@@ -190,7 +190,7 @@ spec:
severity: warning
- alert: CoreDNSProxyRequestsLatency
annotations:
- message: 'CoreDNS instance {{ $labels.instance }} proxy requests
+ description: 'CoreDNS instance {{ $labels.instance }} proxy requests
latency too high, current latency is {{ $value | printf "%.2f" }}.'
doc: "This alert fires if CoreDNS 99th percentile proxy requests
latency was higher than 500ms in the last 10 minutes."
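
A template file like katalog/alertmanager-operated/alertmanager.tmpl only takes effect once Alertmanager loads it. Below is a minimal, hypothetical Alertmanager configuration sketch (the webhook URL, channel, and mount path are placeholders): because the file redefines slack.default.text and slack.default.username, Alertmanager's built-in Slack defaults are overridden without any explicit template references in the receiver.

templates:
  - '/etc/alertmanager/config/*.tmpl'  # assumed mount path for alertmanager.tmpl
route:
  receiver: slack-notifications
receivers:
  - name: slack-notifications
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/XXX/YYY/ZZZ'  # placeholder webhook URL
        channel: '#alerts'  # placeholder channel
        send_resolved: true
        # title and text are not set here, so they fall back to the
        # slack.default.* templates, which the loaded .tmpl file redefines.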