Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improved alert messages with Cortex cluster #351

Merged
merged 1 commit into from
Jul 2, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
* [CHANGE] Replace `CortexRulerFailedEvaluations` with two new alerts: `CortexRulerTooManyFailedPushes` and `CortexRulerTooManyFailedQueries`. #347
* [CHANGE] Removed `CortexCacheRequestErrors` alert. This alert was not working because the legacy Cortex cache client instrumentation doesn't track errors. #346
* [CHANGE] Removed `CortexQuerierCapacityFull` alert. #342
* [CHANGE] Changed blocks storage alerts to group metrics by the configured `cluster_labels` (supporting the deprecated `alert_aggregation_labels`). #351
* [ENHANCEMENT] cortex-mixin: Make `cluster_namespace_deployment:kube_pod_container_resource_requests_{cpu_cores,memory_bytes}:sum` backwards compatible with `kube-state-metrics` v2.0.0. #317
* [ENHANCEMENT] Added documentation text panels and descriptions to reads and writes dashboards. #324
* [ENHANCEMENT] Dashboards: defined container functions for common resources panels: containerDiskWritesPanel, containerDiskReadsPanel, containerDiskSpaceUtilization. #331
Expand Down
64 changes: 32 additions & 32 deletions cortex-mixin/alerts/alerts.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
severity: 'critical',
},
annotations: {
message: 'There are {{ printf "%f" $value }} unhealthy ingester(s).',
message: 'Cortex cluster %(alert_aggregation_variables)s has {{ printf "%%f" $value }} unhealthy ingester(s).' % $._config,
},
},
{
Expand All @@ -35,8 +35,8 @@
},
annotations: {
message: |||
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
|||,
The route {{ $labels.route }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors.
||| % $._config,
},
},
{
Expand Down Expand Up @@ -98,8 +98,8 @@
},
annotations: {
message: |||
Incorrect results for {{ printf "%.2f" $value }}% of queries.
|||,
The Cortex cluster %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% incorrect query results.
||| % $._config,
},
},
{
Expand All @@ -113,8 +113,8 @@
},
annotations: {
message: |||
An inconsistent runtime config file is used across cluster {{ $labels.job }}.
|||,
An inconsistent runtime config file is used across cluster %(alert_aggregation_variables)s.
||| % $._config,
},
},
{
Expand Down Expand Up @@ -145,8 +145,8 @@
},
annotations: {
message: |||
There are {{ $value }} queued up queries in query-frontend.
|||,
There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-frontend.
||| % $._config,
},
},
{
Expand All @@ -160,8 +160,8 @@
},
annotations: {
message: |||
There are {{ $value }} queued up queries in query-scheduler.
|||,
There are {{ $value }} queued up queries in %(alert_aggregation_variables)s query-scheduler.
||| % $._config,
},
},
{
Expand All @@ -178,8 +178,8 @@
},
annotations: {
message: |||
Memcached {{ $labels.name }} used by Cortex in {{ $labels.namespace }} is experiencing {{ printf "%.2f" $value }}% errors for {{ $labels.operation }} operation.
|||,
Memcached {{ $labels.name }} used by Cortex %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors for {{ $labels.operation }} operation.
||| % $._config,
},
},
{
Expand Down Expand Up @@ -430,8 +430,8 @@
},
annotations: {
message: |||
Chunk memcached cluster is too small, should be at least {{ printf "%.2f" $value }}GB.
|||,
Chunk memcached cluster in %(alert_aggregation_variables)s is too small, should be at least {{ printf "%%.2f" $value }}GB.
||| % $._config,
},
},
{
Expand All @@ -448,8 +448,8 @@
},
annotations: {
message: |||
The number of in-memory series per ingester in {{ $labels.namespace }} is too high.
|||,
The number of in-memory series per ingester in %(alert_aggregation_variables)s is too high.
||| % $._config,
},
},
{
Expand All @@ -464,8 +464,8 @@
},
annotations: {
message: |||
Ingesters in {{ $labels.namespace }} ingest too many samples per second.
|||,
Ingesters in %(alert_aggregation_variables)s ingest too many samples per second.
||| % $._config,
},
},
{
Expand All @@ -483,8 +483,8 @@
},
annotations: {
message: |||
Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory.
|||,
Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory.
||| % $._config,
},
},
{
Expand All @@ -502,8 +502,8 @@
},
annotations: {
message: |||
Ingester {{ $labels.namespace }}/{{ $labels.pod }} is using too much memory.
|||,
Ingester {{ $labels.pod }} in %(alert_aggregation_variables)s is using too much memory.
||| % $._config,
},
},
],
Expand All @@ -526,8 +526,8 @@
},
annotations: {
message: |||
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% write (push) errors.
|||,
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% write (push) errors.
||| % $._config,
},
},
{
Expand All @@ -545,8 +545,8 @@
},
annotations: {
message: |||
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% errors while evaluating rules.
|||,
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% errors while evaluating rules.
||| % $._config,
},
},
{
Expand All @@ -563,8 +563,8 @@
},
annotations: {
message: |||
Cortex Ruler {{ $labels.instance }} is experiencing {{ printf "%.2f" $value }}% missed iterations for the rule group {{ $labels.rule_group }}.
|||,
Cortex Ruler {{ $labels.instance }} in %(alert_aggregation_variables)s is experiencing {{ printf "%%.2f" $value }}%% missed iterations for the rule group {{ $labels.rule_group }}.
||| % $._config,
},
},
{
Expand All @@ -579,8 +579,8 @@
},
annotations: {
message: |||
Cortex Rulers {{ $labels.job }} are experiencing errors when checking the ring for rule group ownership.
|||,
Cortex Rulers in %(alert_aggregation_variables)s are experiencing errors when checking the ring for rule group ownership.
||| % $._config,
},
},
],
Expand All @@ -600,7 +600,7 @@
severity: 'warning',
},
annotations: {
message: '{{ $labels.job }}/{{ $labels.instance }} sees incorrect number of gossip members.',
message: 'Cortex instance {{ $labels.instance }} in %(alert_aggregation_variables)s sees incorrect number of gossip members.' % $._config,
},
},
],
Expand Down
62 changes: 31 additions & 31 deletions cortex-mixin/alerts/blocks.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -9,24 +9,24 @@
alert: 'CortexIngesterHasNotShippedBlocks',
'for': '15m',
expr: |||
(min by(namespace, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
(min by(%(alert_aggregation_labels)s, instance) (time() - thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 60 * 60 * 4)
and
(max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) > 0)
and
# Only if the ingester has ingested samples over the last 4h.
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
and
# Only if the ingester was ingesting samples 4h ago. This protects from the case the ingester instance
# had ingested samples in the past, then no traffic was received for a long period and then it starts
# receiving samples again. Without this check, the alert would fire as soon as it gets back receiving
# samples, while the a block shipping is expected within the next 4h.
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
|||,
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[1h] offset 4h)) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
Expand All @@ -35,15 +35,15 @@
alert: 'CortexIngesterHasNotShippedBlocksSinceStart',
'for': '4h',
expr: |||
(max by(namespace, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
(max by(%(alert_aggregation_labels)s, instance) (thanos_objstore_bucket_last_successful_upload_time{job=~".+/ingester.*"}) == 0)
and
(max by(namespace, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
|||,
(max by(%(alert_aggregation_labels)s, instance) (rate(cortex_ingester_ingested_samples_total[4h])) > 0)
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has not shipped any block in the last 4 hours.' % $._config,
},
},
{
Expand All @@ -61,7 +61,7 @@
severity: 'critical',
},
annotations: {
message: "Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.",
message: "Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet." % $._config,
},
},
{
Expand All @@ -77,7 +77,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to compact TSDB head.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to compact TSDB head.' % $._config,
},
},
{
Expand All @@ -89,7 +89,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB head.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB head.' % $._config,
},
},
{
Expand All @@ -101,7 +101,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to create TSDB checkpoint.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to create TSDB checkpoint.' % $._config,
},
},
{
Expand All @@ -113,7 +113,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to delete TSDB checkpoint.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to delete TSDB checkpoint.' % $._config,
},
},
{
Expand All @@ -125,7 +125,7 @@
severity: 'warning',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to truncate TSDB WAL.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to truncate TSDB WAL.' % $._config,
},
},
{
Expand All @@ -137,7 +137,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} got a corrupted TSDB WAL.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s got a corrupted TSDB WAL.' % $._config,
},
},
{
Expand All @@ -150,7 +150,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} is failing to write to TSDB WAL.',
message: 'Cortex Ingester {{ $labels.instance }} in %(alert_aggregation_variables)s is failing to write to TSDB WAL.' % $._config,
},
},
{
Expand All @@ -166,7 +166,7 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Querier {{ $labels.namespace }}/{{ $labels.instance }} has not successfully scanned the bucket since {{ $value | humanizeDuration }}.',
message: 'Cortex Querier {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully scanned the bucket since {{ $value | humanizeDuration }}.' % $._config,
},
},
{
Expand All @@ -177,20 +177,20 @@
expr: |||
100 * (
(
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
-
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_bucket{le="0.0"}[5m]))
)
/
sum by(namespace) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
sum by(%(alert_aggregation_labels)s) (rate(cortex_querier_storegateway_refetches_per_query_count[5m]))
)
> 1
|||,
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Cortex Queries in {{ $labels.namespace }} are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%.0f" $value }}% of queries.',
message: 'Cortex Queries in %(alert_aggregation_variables)s are refetching series from different store-gateways (because of missing blocks) for the {{ printf "%%.0f" $value }}%% of queries.' % $._config,
},
},
{
Expand All @@ -206,34 +206,34 @@
severity: 'critical',
},
annotations: {
message: 'Cortex Store Gateway {{ $labels.namespace }}/{{ $labels.instance }} has not successfully synched the bucket since {{ $value | humanizeDuration }}.',
message: 'Cortex Store Gateway {{ $labels.instance }} in %(alert_aggregation_variables)s has not successfully synched the bucket since {{ $value | humanizeDuration }}.' % $._config,
},
},
{
// Alert if the bucket index has not been updated for a given user.
alert: 'CortexBucketIndexNotUpdated',
expr: |||
min by(namespace, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
|||,
min by(%(alert_aggregation_labels)s, user) (time() - cortex_bucket_index_last_successful_update_timestamp_seconds) > 7200
||| % $._config,
labels: {
severity: 'critical',
},
annotations: {
message: 'Cortex bucket index for tenant {{ $labels.user }} in {{ $labels.namespace }} has not been updated since {{ $value | humanizeDuration }}.',
message: 'Cortex bucket index for tenant {{ $labels.user }} in %(alert_aggregation_variables)s has not been updated since {{ $value | humanizeDuration }}.' % $._config,
},
},
{
// Alert if we consistently find partial blocks for a given tenant over a relatively large time range.
alert: 'CortexTenantHasPartialBlocks',
'for': '6h',
expr: |||
max by(namespace, user) (cortex_bucket_blocks_partials_count) > 0
|||,
max by(%(alert_aggregation_labels)s, user) (cortex_bucket_blocks_partials_count) > 0
||| % $._config,
labels: {
severity: 'warning',
},
annotations: {
message: 'Cortex tenant {{ $labels.user }} in {{ $labels.namespace }} has {{ $value }} partial blocks.',
message: 'Cortex tenant {{ $labels.user }} in %(alert_aggregation_variables)s has {{ $value }} partial blocks.' % $._config,
},
},
],
Expand Down
Loading