diff --git a/CHANGELOG.md b/CHANGELOG.md index d72fbd98..30a2a72d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,7 @@ * [CHANGE] Replace `CortexRulerFailedEvaluations` with two new alerts: `CortexRulerTooManyFailedPushes` and `CortexRulerTooManyFailedQueries`. #347 * [CHANGE] Removed `CortexCacheRequestErrors` alert. This alert was not working because the legacy Cortex cache client instrumentation doesn't track errors. #346 * [CHANGE] Removed `CortexQuerierCapacityFull` alert. #342 +* [CHANGE] Changes blocks storage alerts to group metrics by the configured `cluster_labels` (supporting the deprecated `alert_aggregation_labels`). #351 * [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317 * [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. #324 * [ENHANCEMENT] Dashboards: defined container functions for common resources panels: containerDiskWritesPanel, containerDiskReadsPanel, containerDiskSpaceUtilization. #331 diff --git a/cortex-mixin/alerts/alerts.libsonnet b/cortex-mixin/alerts/alerts.libsonnet index a0e286cc..9eefe7f8 100644 --- a/cortex-mixin/alerts/alerts.libsonnet +++ b/cortex-mixin/alerts/alerts.libsonnet @@ -13,7 +13,7 @@ severity: 'critical', }, annotations: { - message: 'There are {{ printf "%f" $value }} unhealthy ingester(s).', + message: 'Cortex cluster %(alert_aggregation_variables)s has {{ printf "%%f" $value }} unhealthy ingester(s).' % $._config, }, }, { @@ -35,8 +35,8 @@ }, annotations: { message: ||| - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - |||, + The route {{ $labels.route }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors. + ||| % $._config, }, }, { @@ -98,8 +98,8 @@ }, annotations: { message: ||| - Incorrect results for {{ printf "%.2f" $value }}% of queries. 
- |||, + The Cortex cluster %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% incorrect query results. + ||| % $._config, }, }, { @@ -113,8 +113,8 @@ }, annotations: { message: ||| - An inconsistent runtime config file is used across cluster {{ $labels.job }}. - |||, + An inconsistent runtime config file is used across cluster %(alert_aggregation_variables)s. + ||| % $._config, }, }, { @@ -145,8 +145,8 @@ }, annotations: { message: ||| - There are {{ $value }} queued up queries in query-frontend. - |||, + There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-frontend. + ||| % $._config, }, }, { @@ -160,8 +160,8 @@ }, annotations: { message: ||| - There are {{ $value }} queued up queries in query-scheduler. - |||, + There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-scheduler. + ||| % $._config, }, }, { @@ -178,8 +178,8 @@ }, annotations: { message: ||| - Memcached {{ $labels.name }} used by Cortex in {{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation. - |||, + Memcached {{ $labels.name }} used by Cortex %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors for {{ $labels.operation }} operation. + ||| % $._config, }, }, { @@ -430,8 +430,8 @@ }, annotations: { message: ||| - Chunk memcached cluster is too small, should be at least {{ printf "%.2f" $value }}GB. - |||, + Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB. + ||| % $._config, }, }, { @@ -448,8 +448,8 @@ }, annotations: { message: ||| - The number of in-memory series per ingester in {{ $labels.namespace }} is too high. - |||, + The number of in-memory series per ingester in %(alert_aggregation_variables)s is too high. 
+ ||| % $._config, }, }, { @@ -464,8 +464,8 @@ }, annotations: { message: ||| - Ingesters in {{ $labels.namespace }} ingest too many samples per second. - |||, + Ingesters in %(alert_aggregation_variables)s ingest too many samples per second. + ||| % $._config, }, }, { @@ -483,8 +483,8 @@ }, annotations: { message: ||| - Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory. - |||, + Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory. + ||| % $._config, }, }, { @@ -502,8 +502,8 @@ }, annotations: { message: ||| - Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory. - |||, + Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory. + ||| % $._config, }, }, ], @@ -526,8 +526,8 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write (push) errors. - |||, + Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% write (push) errors. + ||| % $._config, }, }, { @@ -545,8 +545,8 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules. - |||, + Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors while evaluating rules. + ||| % $._config, }, }, { @@ -563,8 +563,8 @@ }, annotations: { message: ||| - Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}. - |||, + Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% missed iterations for the rule group {{ $labels.rule_group }}. 
+ ||| % $._config, }, }, { @@ -579,8 +579,8 @@ }, annotations: { message: ||| - Cortex Rulers {{ $labels.job }} are experiencing errors when checking the ring for rule group ownership. - |||, + Cortex Rulers in %(alert_aggregation_variables)s are experiencing errors when checking the ring for rule group ownership. + ||| % $._config, }, }, ], @@ -600,7 +600,7 @@ severity: 'warning', }, annotations: { - message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.', + message: 'Cortex instance {{ $labels.instance }} in %(alert_aggregation_variables)s sees incorrect number of gossip members.' % $._config, }, }, ], diff --git a/cortex-mixin/alerts/blocks.libsonnet b/cortex-mixin/alerts/blocks.libsonnet index d1157f38..a60ac2da 100644 --- a/cortex-mixin/alerts/blocks.libsonnet +++ b/cortex-mixin/alerts/blocks.libsonnet @@ -9,24 +9,24 @@ alert: 'CortexIngesterHasNotShippedBlocks', 'for': '15m', expr: ||| - (min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4) + (min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4) and - (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0) + (max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0) and # Only if the ingester has ingested samples over the last 4h. - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) and # Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance # had ingested samples in the past, then no traffic was received for a long period and then it starts # receiving samples again. 
Without this check, the alert would fire as soon as it gets back receiving # samples, while the a block shipping is expected within the next 4h. - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0) - |||, + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0) + ||| % $._config, labels: { severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config, }, }, { @@ -35,15 +35,15 @@ alert: 'CortexIngesterHasNotShippedBlocksSinceStart', 'for': '4h', expr: ||| - (max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0) + (max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0) and - (max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) - |||, + (max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0) + ||| % $._config, labels: { severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' 
% $._config, }, }, { @@ -61,7 +61,7 @@ severity: 'critical', }, annotations: { - message: "Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.", + message: "Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config, }, }, { @@ -77,7 +77,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config, }, }, { @@ -89,7 +89,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB head.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config, }, }, { @@ -101,7 +101,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to create TSDB checkpoint.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' % $._config, }, }, { @@ -113,7 +113,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete TSDB checkpoint.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' 
% $._config, }, }, { @@ -125,7 +125,7 @@ severity: 'warning', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB WAL.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config, }, }, { @@ -137,7 +137,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} got a corrupted TSDB WAL.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config, }, }, { @@ -150,7 +150,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to write to TSDB WAL.', + message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' % $._config, }, }, { @@ -166,7 +166,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.', + message: 'Cortex Querier {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' 
% $._config, }, }, { @@ -177,20 +177,20 @@ expr: ||| 100 * ( ( - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) - - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m])) ) / - sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) + sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m])) ) > 1 - |||, + ||| % $._config, labels: { severity: 'warning', }, annotations: { - message: 'Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.', + message: 'Cortex Queries in %(alert_aggregation_variables)s are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%%.0f" $value }}%% of queries.' % $._config, }, }, { @@ -206,20 +206,20 @@ severity: 'critical', }, annotations: { - message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.', + message: 'Cortex Store Gateway {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config, }, }, { // Alert if the bucket index has not been updated for a given user. 
alert: 'CortexBucketIndexNotUpdated', expr: ||| - min by(namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 - |||, + min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200 + ||| % $._config, labels: { severity: 'critical', }, annotations: { - message: 'Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}.', + message: 'Cortex bucket index for tenant {{ $labels.user }} in %(alert_aggregation_variables)s has not been updated since {{ $value | humanizeDuration }}.' % $._config, }, }, { @@ -227,13 +227,13 @@ alert: 'CortexTenantHasPartialBlocks', 'for': '6h', expr: ||| - max by(namespace, user) (cortex_bucket_blocks_partials_count) > 0 - |||, + max by(%(alert_aggregation_labels)s, user) (cortex_bucket_blocks_partials_count) > 0 + ||| % $._config, labels: { severity: 'warning', }, annotations: { - message: 'Cortex tenant {{ $labels.user }} in {{ $labels.namespace }} has {{ $value }} partial blocks.', + message: 'Cortex tenant {{ $labels.user }} in %(alert_aggregation_variables)s has {{ $value }} partial blocks.' % $._config, }, }, ], diff --git a/cortex-mixin/alerts/compactor.libsonnet b/cortex-mixin/alerts/compactor.libsonnet index 1f28a7e5..5538545e 100644 --- a/cortex-mixin/alerts/compactor.libsonnet +++ b/cortex-mixin/alerts/compactor.libsonnet @@ -14,7 +14,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not successfully cleaned up blocks in the last 6 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully cleaned up blocks in the last 6 hours.' 
% $._config, }, }, { @@ -30,7 +30,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not run compaction in the last 24 hours.' % $._config, }, }, { @@ -44,7 +44,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not run compaction in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not run compaction in the last 24 hours.' % $._config, }, }, { @@ -57,7 +57,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} failed to run 2 consecutive compactions.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s failed to run 2 consecutive compactions.' % $._config, }, }, { @@ -73,7 +73,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not uploaded any block in the last 24 hours.' % $._config, }, }, { @@ -87,7 +87,7 @@ severity: 'critical', }, annotations: { - message: 'Cortex Compactor {{ $labels.namespace }}/{{ $labels.instance }} has not uploaded any block in the last 24 hours.', + message: 'Cortex Compactor {{ $labels.instance }} in %(alert_aggregation_variables)s has not uploaded any block in the last 24 hours.' 
% $._config, }, }, ], diff --git a/cortex-mixin/groups.libsonnet b/cortex-mixin/groups.libsonnet index 6d33ea36..c2c35f90 100644 --- a/cortex-mixin/groups.libsonnet +++ b/cortex-mixin/groups.libsonnet @@ -41,5 +41,22 @@ alert_aggregation_labels_override ) else group_by_cluster, + + // This field contains the Prometheus template variables that should + // be used to display values of the configured "group_by_cluster" (or the + // deprecated "alert_aggregation_labels"). + alert_aggregation_variables: + std.join( + '/', + // Generate the variable replacement for each label. + std.map( + function(l) '{{ $labels.%s }}' % l, + // Split the configured labels by comma and remove whitespaces. + std.map( + function(l) std.strReplace(l, ' ', ''), + std.split($._config.alert_aggregation_labels, ',') + ), + ), + ), }, }