Skip to content

Commit

Permalink
Merge pull request grafana/cortex-jsonnet#351 from grafana/build-aler…
Browse files Browse the repository at this point in the history
…t-messages-based-on-group-labels

Improved alert messages with Cortex cluster
  • Loading branch information
pracucci authored Jul 2, 2021
2 parents e9a89a7 + 66e36d8 commit 17bc2eb
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 69 deletions.
64 changes: 32 additions & 32 deletions jsonnet/mimir-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
severity: 'critical',
},
annotations: {
message: 'There are {{ printf "%f" $value }} unhealthy ingester(s).',
message: 'Cortex cluster %(alert_aggregation_variables)s has {{ printf "%%f" $value }} unhealthy ingester(s).' % $._config,
},
},
{
Expand All @@ -35,8 +35,8 @@
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|||,
The route {{ $labels.route }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors.
||| % $._config,
},
},
{
Expand Down Expand Up @@ -98,8 +98,8 @@
},
annotations: {
message: |||
Incorrect results for {{ printf "%.2f" $value }}% of queries.
|||,
The Cortex cluster %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% incorrect query results.
||| % $._config,
},
},
{
Expand All @@ -113,8 +113,8 @@
},
annotations: {
message: |||
An inconsistent runtime config file is used across cluster {{ $labels.job }}.
|||,
An inconsistent runtime config file is used across cluster %(alert_aggregation_variables)s.
||| % $._config,
},
},
{
Expand Down Expand Up @@ -145,8 +145,8 @@
},
annotations: {
message: |||
There are {{ $value }} queued up queries in query-frontend.
|||,
There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-frontend.
||| % $._config,
},
},
{
Expand All @@ -160,8 +160,8 @@
},
annotations: {
message: |||
There are {{ $value }} queued up queries in query-scheduler.
|||,
There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-scheduler.
||| % $._config,
},
},
{
Expand All @@ -178,8 +178,8 @@
},
annotations: {
message: |||
Memcached {{ $labels.name }} used by Cortex in {{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation.
|||,
Memcached {{ $labels.name }} used by Cortex %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors for {{ $labels.operation }} operation.
||| % $._config,
},
},
{
Expand Down Expand Up @@ -430,8 +430,8 @@
},
annotations: {
message: |||
Chunk memcached cluster is too small, should be at least {{ printf "%.2f" $value }}GB.
|||,
Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB.
||| % $._config,
},
},
{
Expand All @@ -448,8 +448,8 @@
},
annotations: {
message: |||
The number of in-memory series per ingester in {{ $labels.namespace }} is too high.
|||,
The number of in-memory series per ingester in %(alert_aggregation_variables)s is too high.
||| % $._config,
},
},
{
Expand All @@ -464,8 +464,8 @@
},
annotations: {
message: |||
Ingesters in {{ $labels.namespace }} ingest too many samples per second.
|||,
Ingesters in %(alert_aggregation_variables)s ingest too many samples per second.
||| % $._config,
},
},
{
Expand All @@ -483,8 +483,8 @@
},
annotations: {
message: |||
Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory.
|||,
Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory.
||| % $._config,
},
},
{
Expand All @@ -502,8 +502,8 @@
},
annotations: {
message: |||
Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory.
|||,
Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory.
||| % $._config,
},
},
],
Expand All @@ -526,8 +526,8 @@
},
annotations: {
message: |||
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write (push) errors.
|||,
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% write (push) errors.
||| % $._config,
},
},
{
Expand All @@ -545,8 +545,8 @@
},
annotations: {
message: |||
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules.
|||,
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors while evaluating rules.
||| % $._config,
},
},
{
Expand All @@ -563,8 +563,8 @@
},
annotations: {
message: |||
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}.
|||,
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% missed iterations for the rule group {{ $labels.rule_group }}.
||| % $._config,
},
},
{
Expand All @@ -579,8 +579,8 @@
},
annotations: {
message: |||
Cortex Rulers {{ $labels.job }} are experiencing errors when checking the ring for rule group ownership.
|||,
Cortex Rulers in %(alert_aggregation_variables)s are experiencing errors when checking the ring for rule group ownership.
||| % $._config,
},
},
],
Expand All @@ -600,7 +600,7 @@
severity: 'warning',
},
annotations: {
message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.',
message: 'Cortex instance {{ $labels.instance }} in %(alert_aggregation_variables)s sees incorrect number of gossip members.' % $._config,
},
},
],
Expand Down
62 changes: 31 additions & 31 deletions jsonnet/mimir-mixin/alerts/blocks.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@
alert: 'CortexIngesterHasNotShippedBlocks',
'for': '15m',
expr: |||
(min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
(min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
and
(max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
# had ingested samples in the past, then no traffic was received for a long period and then it starts
# receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
# samples, while a block shipping is expected within the next 4h.
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
|||,
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
Expand All @@ -35,15 +35,15 @@
alert: 'CortexIngesterHasNotShippedBlocksSinceStart',
'for': '4h',
expr: |||
(max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
|||,
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
Expand All @@ -61,7 +61,7 @@
severity: 'critical',
},
annotations: {
message: "Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.",
message: "Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config,
},
},
{
Expand All @@ -77,7 +77,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config,
},
},
{
Expand All @@ -89,7 +89,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB head.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config,
},
},
{
Expand All @@ -101,7 +101,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to create TSDB checkpoint.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' % $._config,
},
},
{
Expand All @@ -113,7 +113,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete TSDB checkpoint.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' % $._config,
},
},
{
Expand All @@ -125,7 +125,7 @@
severity: 'warning',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB WAL.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config,
},
},
{
Expand All @@ -137,7 +137,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} got a corrupted TSDB WAL.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config,
},
},
{
Expand All @@ -150,7 +150,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to write to TSDB WAL.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' % $._config,
},
},
{
Expand All @@ -166,7 +166,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.',
message: 'Cortex Querier {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' % $._config,
},
},
{
Expand All @@ -177,20 +177,20 @@
expr: |||
100 * (
(
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
-
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
)
/
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
)
> 1
|||,
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.',
message: 'Cortex Queries in %(alert_aggregation_variables)s are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%%.0f" $value }}%% of queries.' % $._config,
},
},
{
Expand All @@ -206,34 +206,34 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.',
message: 'Cortex Store Gateway {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config,
},
},
{
// Alert if the bucket index has not been updated for a given user.
alert: 'CortexBucketIndexNotUpdated',
expr: |||
min by(namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
|||,
min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: 'Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}.',
message: 'Cortex bucket index for tenant {{ $labels.user }} in %(alert_aggregation_variables)s has not been updated since {{ $value | humanizeDuration }}.' % $._config,
},
},
{
// Alert if we consistently find partial blocks for a given tenant over a relatively large time range.
alert: 'CortexTenantHasPartialBlocks',
'for': '6h',
expr: |||
max by(namespace, user) (cortex_bucket_blocks_partials_count) > 0
|||,
max by(%(alert_aggregation_labels)s, user) (cortex_bucket_blocks_partials_count) > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Cortex tenant {{ $labels.user }} in {{ $labels.namespace }} has {{ $value }} partial blocks.',
message: 'Cortex tenant {{ $labels.user }} in %(alert_aggregation_variables)s has {{ $value }} partial blocks.' % $._config,
},
},
],
Expand Down
Loading

0 comments on commit 17bc2eb

Please sign in to comment.