From 89605361e98209dc9c472752d33cfbcb75eccec3 Mon Sep 17 00:00:00 2001 From: Markos Chandras Date: Thu, 23 Jul 2020 16:25:12 +0300 Subject: [PATCH] mixin: Fix alert about unhealthy sidecar The alert was giving the wrong information, as $value contained the number of pods that were failing to send a heartbeat instead of the actual number of seconds for which each sidecar had been unhealthy. Also, the 5-minute interval is probably too short, as on large deployments Prometheus could take much longer to come online and for the sidecar to become actually useful. As such, we can simply subtract the timestamp of the last heartbeat from the current time and fire if we are lagging by more than 10 minutes. Signed-off-by: Markos Chandras --- CHANGELOG.md | 1 + examples/alerts/alerts.md | 2 +- examples/alerts/alerts.yaml | 2 +- examples/alerts/tests.yaml | 61 +++++++++++++++------------------- mixin/alerts/sidecar.libsonnet | 2 +- 5 files changed, 30 insertions(+), 38 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e75ffcd69..c75cd0bc4b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#2936](https://github.com/thanos-io/thanos/pull/2936) Compact: Fix ReplicaLabelRemover panic when replicaLabels are not specified. - [#2956](https://github.com/thanos-io/thanos/pull/2956) Store: Fix fetching of chunks bigger than 16000 bytes. - [#2970](https://github.com/thanos-io/thanos/pull/2970) Store: Upgrade minio-go/v7 to fix slowness when running on EKS. +- [#2929](https://github.com/thanos-io/thanos/pull/2929) Mixin: Fix expression for 'unhealthy sidecar' alert and also increase the timeout for 10 minutes. 
### Added diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index cb40e2a9f71..386ec60d435 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -275,7 +275,7 @@ rules: message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. expr: | - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job) >= 600 labels: severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 72c3279e490..35586fabeb6 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -258,7 +258,7 @@ groups: message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. expr: | - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job) >= 600 labels: severity: critical - name: thanos-store.rules diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index 25df0414e46..47070457c3b 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -22,47 +22,35 @@ tests: exp_samples: - labels: '{}' value: 120 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) eval_time: 2m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar"}' value: 43 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 42 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 5m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) + eval_time: 10m 
exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar"}' value: 0 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 0 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 6m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) + eval_time: 11m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 0 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar"}' value: 0 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 5m + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) + eval_time: 10m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 300 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 300 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 6m + - labels: '{job="thanos-sidecar"}' + value: 600 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) + eval_time: 11m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 360 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 360 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) >= 300 + - labels: '{job="thanos-sidecar"}' + value: 660 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job) >= 600 eval_time: 12m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 720 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar"}' value: 720 alert_rule_test: - eval_time: 1m @@ -71,24 +59,27 @@ tests: alertname: ThanosSidecarUnhealthy - eval_time: 3m alertname: ThanosSidecarUnhealthy - - eval_time: 5m + - eval_time: 10m alertname: ThanosSidecarUnhealthy 
exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' - - eval_time: 6m + message: 'Thanos Sidecar thanos-sidecar is unhealthy for 600 seconds.' + - eval_time: 11m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' + message: 'Thanos Sidecar thanos-sidecar is unhealthy for 660 seconds.' - eval_time: 12m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' + message: 'Thanos Sidecar thanos-sidecar is unhealthy for 720 seconds.' diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index c81e2ba0a91..009d8648777 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -27,7 +27,7 @@ message: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.', }, expr: ||| - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job) >= 600 ||| % thanos.sidecar, labels: { severity: 'critical',