From 269b8b5dde03b941fc55af3c628256f1cb30215c Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 25 Jan 2021 14:15:00 +0100 Subject: [PATCH 1/2] Added alert CortexIngesterHasUnshippedBlocks Signed-off-by: Marco Pracucci --- CHANGELOG.md | 1 + cortex-mixin/alerts/blocks.libsonnet | 18 ++++++++++++++++++ cortex-mixin/docs/playbooks.md | 7 +++++++ 3 files changed, 26 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 51028843..d6c6e733 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ - Alerts: added "CortexBucketIndexNotUpdated" (bucket index only) and "CortexTenantHasPartialBlocks" * [ENHANCEMENT] The name of the overrides configmap is now customisable via `$._config.overrides_configmap`. #244 * [ENHANCEMENT] Added flag to control usage of bucket-index, and enable it by default when using blocks. #254 +* [ENHANCEMENT] Added the alert `CortexIngesterHasUnshippedBlocks`. #248 * [BUGFIX] Honor configured `per_instance_label` in all panels. #239 * [BUGFIX] `CortexRequestLatency` alert now ignores long-running requests on query-scheduler. #242 * [BUGFIX] Honor configured `job_names` in the "Memory (go heap inuse)" panel. #247 diff --git a/cortex-mixin/alerts/blocks.libsonnet b/cortex-mixin/alerts/blocks.libsonnet index 12e9160d..beeb6240 100644 --- a/cortex-mixin/alerts/blocks.libsonnet +++ b/cortex-mixin/alerts/blocks.libsonnet @@ -39,6 +39,24 @@ message: 'Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has not shipped any block in the last 4 hours.', }, }, + { + // Alert if the ingester has compacted some blocks that haven't been successfully uploaded to the storage yet for + // more than 1 hour. The metric tracks the time of the oldest unshipped block, measured as the time when the + // TSDB head has been compacted to a block. The metric is 0 if all blocks have been shipped. 
+ alert: 'CortexIngesterHasUnshippedBlocks', + 'for': '15m', + expr: ||| + (time() - cortex_ingester_oldest_unshipped_block_timestamp_seconds > 3600) + and + (cortex_ingester_oldest_unshipped_block_timestamp_seconds > 0) + |||, + labels: { + severity: 'critical', + }, + annotations: { + message: "Cortex Ingester {{ $labels.namespace }}/{{ $labels.instance }} has compacted a block {{ $value | humanizeDuration }} ago but it hasn't been successfully uploaded to the storage yet.", + }, + }, { // Alert if the ingester is failing to compact TSDB head into a block, for any opened TSDB. Once the TSDB head is // compactable, the ingester will try to compact it every 1 minute. Repeatedly failing it is a critical condition diff --git a/cortex-mixin/docs/playbooks.md b/cortex-mixin/docs/playbooks.md index b8046409..6ce4e7a3 100644 --- a/cortex-mixin/docs/playbooks.md +++ b/cortex-mixin/docs/playbooks.md @@ -92,6 +92,13 @@ If the ingester hit the disk capacity, any attempt to append samples will fail. Same as [`CortexIngesterHasNotShippedBlocks`](#CortexIngesterHasNotShippedBlocks). +### CortexIngesterHasUnshippedBlocks + +This alert fires when a Cortex ingester has compacted some blocks but such blocks haven't been successfully uploaded to the storage yet. + +How to **investigate**: +- Look for details in the ingester logs + ### CortexIngesterTSDBHeadCompactionFailed This alert fires when a Cortex ingester is failing to compact the TSDB head into a block. 
From 55efcd2ec2dcaeb18157436582d7e86393eeecb5 Mon Sep 17 00:00:00 2001 From: Marco Pracucci Date: Mon, 25 Jan 2021 14:16:15 +0100 Subject: [PATCH 2/2] Fix CHANGELOG entry Signed-off-by: Marco Pracucci --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d6c6e733..a37e3aaa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,7 @@ - Alerts: added "CortexBucketIndexNotUpdated" (bucket index only) and "CortexTenantHasPartialBlocks" * [ENHANCEMENT] The name of the overrides configmap is now customisable via `$._config.overrides_configmap`. #244 * [ENHANCEMENT] Added flag to control usage of bucket-index, and enable it by default when using blocks. #254 -* [ENHANCEMENT] Added the alert `CortexIngesterHasUnshippedBlocks`. #248 +* [ENHANCEMENT] Added the alert `CortexIngesterHasUnshippedBlocks`. #255 * [BUGFIX] Honor configured `per_instance_label` in all panels. #239 * [BUGFIX] `CortexRequestLatency` alert now ignores long-running requests on query-scheduler. #242 * [BUGFIX] Honor configured `job_names` in the "Memory (go heap inuse)" panel. #247