From e96710b4aec8973cd2c7a1f9bccdbd6506a16aba Mon Sep 17 00:00:00 2001 From: Max Inden Date: Fri, 2 Oct 2020 10:36:35 +0200 Subject: [PATCH 1/2] .maintain/monitoring: Add alert when continuous task ends Through the `polkadot_tasks_ended_total` Prometheus metric one can tell when a task ended. Use this metric to alert when specific known-to-be-continuous tasks end on a node. --- .../alerting-rules/alerting-rules.yaml | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/.maintain/monitoring/alerting-rules/alerting-rules.yaml b/.maintain/monitoring/alerting-rules/alerting-rules.yaml index 3dde038d88b4e..f510b1f3cf4ba 100644 --- a/.maintain/monitoring/alerting-rules/alerting-rules.yaml +++ b/.maintain/monitoring/alerting-rules/alerting-rules.yaml @@ -126,6 +126,30 @@ groups: # Others ############################################################################## + - alert: ContinuousTaskEnded + expr: 'polkadot_tasks_ended_total{task_name=~" + authority-discovery-worker + |babe + |basic-block-import-worker + |grandpa-voter + |informant + |network-worker + |offchain-notifications + |on-transaction-imported + |prometheus-endpoint + |telemetry-periodic-network-state + |telemetry-periodic-send + |telemetry-worker + |txpool-background + |txpool-notifications + "} > 0' + for: 5m + labels: + severity: warning + annotations: + message: 'Continuous task {{ $labels.task_name }} on node + {{ $labels.instance }} ended unexpectedly.' 
+ - alert: AuthorityDiscoveryDiscoveryFailureHigh expr: 'polkadot_authority_discovery_handle_value_found_event_failure / ignoring(name) From d7ff86b7f62238dd5927407f10f89ab39bf96ac4 Mon Sep 17 00:00:00 2001 From: Max Inden Date: Fri, 2 Oct 2020 16:58:18 +0200 Subject: [PATCH 2/2] .maintain/monitoring: Don't hard-code task names Instead of matching a fixed list of task names, alert on any task whose `polkadot_tasks_spawned_total` and `polkadot_tasks_ended_total` counters both equal 1 for the same instance and task name. --- .../alerting-rules/alerting-rules.yaml | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/.maintain/monitoring/alerting-rules/alerting-rules.yaml b/.maintain/monitoring/alerting-rules/alerting-rules.yaml index f510b1f3cf4ba..16a27c06d3e05 100644 --- a/.maintain/monitoring/alerting-rules/alerting-rules.yaml +++ b/.maintain/monitoring/alerting-rules/alerting-rules.yaml @@ -127,22 +127,8 @@ groups: ############################################################################## - alert: ContinuousTaskEnded - expr: 'polkadot_tasks_ended_total{task_name=~" - authority-discovery-worker - |babe - |basic-block-import-worker - |grandpa-voter - |informant - |network-worker - |offchain-notifications - |on-transaction-imported - |prometheus-endpoint - |telemetry-periodic-network-state - |telemetry-periodic-send - |telemetry-worker - |txpool-background - |txpool-notifications - "} > 0' + expr: '(polkadot_tasks_spawned_total == 1) - on(instance, task_name) + (polkadot_tasks_ended_total == 1)' for: 5m labels: severity: warning