From 4498eb11019e17c0fd6d82506fa535dbe6ddf533 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Tue, 25 May 2021 10:19:43 +0200 Subject: [PATCH 1/3] Extend Alertmanager dashboard with currently unused metrics. Metrics for general operation: - Added "Tenants" stat panel using: `cortex_alertmanager_tenants_discovered` - Added "Tenant Configuration Sync" row using: `cortex_alertmanager_sync_configs_failed_total` `cortex_alertmanager_sync_configs_total` `cortex_alertmanager_ring_check_errors_total` Metrics specific to sharding operation: - Added "Sharding Initial State Sync" row using: `cortex_alertmanager_state_initial_sync_total` `cortex_alertmanager_state_initial_sync_completed_total` `cortex_alertmanager_state_initial_sync_duration_seconds` - Added "Sharding State Operations" row using: `cortex_alertmanager_state_fetch_replica_state_total` `cortex_alertmanager_state_fetch_replica_state_failed_total` `cortex_alertmanager_state_replication_total` `cortex_alertmanager_state_replication_failed_total` `cortex_alertmanager_partial_state_merges_total` `cortex_alertmanager_partial_state_merges_failed_total` `cortex_alertmanager_state_persist_total` `cortex_alertmanager_state_persist_failed_total` --- .../dashboards/alertmanager.libsonnet | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index b329ce6ba19..6d7ee562f90 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -17,6 +17,10 @@ local utils = import 'mixin-utils/utils.libsonnet'; $.panel('Total Silences') + $.statPanel('sum(cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), format='short') ) + .addPanel( + $.panel('Tenants') + + $.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher('alertmanager'), format='short') + ) ) .addRow( $.row('Alerts Received') @@ -86,5 
+90,150 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) .addRows( $.getObjectStoreRows('Alertmanager Configuration Object Store (Alertmanager accesses)', 'alertmanager-storage') + ) + .addRow( + $.row('Replication') + .addPanel( + $.panel('Tenants (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_tenants_owned{%s})' % $.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + .addPanel( + $.panel('Alerts (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + .addPanel( + $.panel('Silences (By Instance)') + + $.queryPanel( + 'sum by(pod) (cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), + '{{pod}}' + ) + + $.stack + ) + ) + .addRow( + $.row('Tenant Configuration Sync') + .addPanel( + $.panel('Syncs/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_sync_configs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Syncs/sec (By Reason)') + + $.queryPanel( + 'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + '{{reason}}' + ) + ) + .addPanel( + $.panel('Ring Check Errors/sec') + + $.queryPanel( + 'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + 'errors' + ) + ) + ) + .addRow( + $.row('Sharding Initial State Sync') + .addPanel( + $.panel('Syncs/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_initial_sync_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval])) + ||| % 
[$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Syncs/sec (By Outcome)') + + $.queryPanel( + 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + '{{outcome}}' + ) + ) + .addPanel( + $.panel('Duration') + + utils.latencyRecordingRulePanel('cortex_alertmanager_state_initial_sync_duration_seconds', $.jobSelector('alertmanager')) + ) + ) + .addRow( + $.row('Sharding State Operations') + .addPanel( + $.panel('Replica Fetches/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_fetch_replica_state_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_fetch_replica_state_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Replica Updates/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_replication_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_replication_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Partial Merges/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_partial_state_merges_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 
'sum(rate(cortex_alertmanager_partial_state_merges_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) + .addPanel( + $.panel('Remote Storage Persists/sec') + + $.queryPanel( + [ + ||| + sum(rate(cortex_alertmanager_state_persist_total{%s}[$__rate_interval])) + - + sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval])) + ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], + 'sum(rate(cortex_alertmanager_state_persist_failed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), + ], + ['success', 'failed'] + ) + ) ), } From 00d7414577271824a8b1ae8dfba6f7f8d1986c80 Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 27 May 2021 12:55:33 +0200 Subject: [PATCH 2/3] Review comments + fix latency panel. --- .../dashboards/alertmanager.libsonnet | 52 +++++++------------ 1 file changed, 19 insertions(+), 33 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 6d7ee562f90..922b2861bc2 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -94,26 +94,26 @@ local utils = import 'mixin-utils/utils.libsonnet'; .addRow( $.row('Replication') .addPanel( - $.panel('Tenants (By Instance)') + + $.panel('Per %s Tenants' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) (cortex_alertmanager_tenants_owned{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) .addPanel( - $.panel('Alerts (By Instance)') + + $.panel('Per %s Alerts' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) (cortex_alertmanager_alerts{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'sum by(%s) (cortex_alertmanager_alerts{%s})' % 
[$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) .addPanel( - $.panel('Silences (By Instance)') + + $.panel('Per %s Silences' % $._config.per_instance_label) + $.queryPanel( - 'sum by(pod) (cortex_alertmanager_silences{%s})' % $.jobMatcher('alertmanager'), - '{{pod}}' + 'sum by(%s) (cortex_alertmanager_silences{%s})' % [$._config.per_instance_label, $.jobMatcher('alertmanager')], + '{{%s}}' % $._config.per_instance_label ) + $.stack ) @@ -150,37 +150,20 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding Initial State Sync') + $.row('Sharding Runtime State Sync') .addPanel( $.panel('Syncs/sec') + - $.queryPanel( - [ - ||| - sum(rate(cortex_alertmanager_state_initial_sync_total{%s}[$__rate_interval])) - - - sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval])) - ||| % [$.jobMatcher('alertmanager'), $.jobMatcher('alertmanager')], - 'sum(rate(cortex_alertmanager_state_initial_sync_completed_total{outcome="failed",%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), - ], - ['success', 'failed'] - ) - ) - .addPanel( - $.panel('Syncs/sec (By Outcome)') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), '{{outcome}}' ) ) .addPanel( - $.panel('Duration') + - utils.latencyRecordingRulePanel('cortex_alertmanager_state_initial_sync_duration_seconds', $.jobSelector('alertmanager')) + $.panel('Sync duration') + + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) ) - ) - .addRow( - $.row('Sharding State Operations') .addPanel( - $.panel('Replica Fetches/sec') + + $.panel('Fetch state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -193,8 +176,11 @@ local utils = import 'mixin-utils/utils.libsonnet'; ['success', 'failed'] ) ) + ) + .addRow( + 
$.row('Sharding State Operations') .addPanel( - $.panel('Replica Updates/sec') + + $.panel('Replicate state to other alertmanagers /sec') + $.queryPanel( [ ||| @@ -208,7 +194,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Partial Merges/sec') + + $.panel('Merge state from other alertmanagers /sec') + $.queryPanel( [ ||| @@ -222,7 +208,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addPanel( - $.panel('Remote Storage Persists/sec') + + $.panel('Persist state to remote storage /sec') + $.queryPanel( [ ||| From b0e76f9fcc3ee5ad5d12b99e79338f6d4ddf7a4f Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Fri, 30 Jul 2021 11:49:02 +0200 Subject: [PATCH 3/3] Review comments. --- jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet index 922b2861bc2..6f578b11357 100644 --- a/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet +++ b/jsonnet/mimir-mixin/dashboards/alertmanager.libsonnet @@ -150,16 +150,16 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding Runtime State Sync') + $.row('Sharding Initial State Sync') .addPanel( - $.panel('Syncs/sec') + + $.panel('Initial syncs/sec') + $.queryPanel( 'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher('alertmanager'), '{{outcome}}' ) ) .addPanel( - $.panel('Sync duration') + + $.panel('Initial sync duration') + $.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher('alertmanager')) ) .addPanel( @@ -178,7 +178,7 @@ local utils = import 'mixin-utils/utils.libsonnet'; ) ) .addRow( - $.row('Sharding State Operations') + $.row('Sharding Runtime State Sync') .addPanel( $.panel('Replicate state to other alertmanagers /sec') + $.queryPanel(