From 19f534614a4c56e4f5a396e90491f4d04ccc1ca2 Mon Sep 17 00:00:00 2001
From: Piotr <17101802+thampiotr@users.noreply.github.com>
Date: Fri, 20 Sep 2024 13:40:00 +0200
Subject: [PATCH 1/3] tweak OTEL alerts to use SR

Alert when the span success rate, 1 - failed / (failed + success), drops
below 95%, instead of alerting on any refused or failed span.

The failed and total rates are aggregated with "sum by" before the
division. Dividing per series and summing the ratios afterwards would add
up the failure ratios of every series in a job, so several instances each
failing about 2% of their spans would be enough to trip the alert.

---
Review notes (ignored by git am, not part of the commit message):
* Line breaks and indentation were restored from a whitespace-collapsed
  copy of this series. The hunk line counts match the hunk headers, but
  leading and trailing whitespace is inferred; run `git apply --check`
  before relying on it.
* The successRateQuery hunk was amended in review, so the post-image blob
  id on the `index` line below (and the pre-image id in patch 2/3) no
  longer matches the file contents.

 .../alerts/opentelemetry.libsonnet            | 29 ++++++++++---------
 1 file changed, 16 insertions(+), 13 deletions(-)

diff --git a/operations/alloy-mixin/alerts/opentelemetry.libsonnet b/operations/alloy-mixin/alerts/opentelemetry.libsonnet
index 611034c33..72efa0173 100644
--- a/operations/alloy-mixin/alerts/opentelemetry.libsonnet
+++ b/operations/alloy-mixin/alerts/opentelemetry.libsonnet
@@ -1,6 +1,17 @@
 local alert = import './utils/alert.jsonnet';
 
 {
+  local successRateQuery(enableK8sCluster, failed, success) =
+    local sumBy = if enableK8sCluster then "cluster, namespace, job" else "job";
+    |||
+      (1 - (
+              sum by (%s) (rate(%s{}[1m]))
+              /
+              sum by (%s) (rate(%s{}[1m]) + rate(%s{}[1m]))
+           )
+      ) < 0.95
+    ||| % [sumBy, failed, sumBy, failed, success],  // one argument per %s, in order
+
   newOpenTelemetryAlertsGroup(enableK8sCluster=true):
     alert.newGroup(
       'alloy_otelcol',
@@ -10,28 +21,20 @@ local alert = import './utils/alert.jsonnet';
         // imposed by otelcol.processor.memory_limiter.
         alert.newRule(
           'OtelcolReceiverRefusedSpans',
-          if enableK8sCluster then
-            'sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0'
-          else
-            'sum by (job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0'
-          ,
+          successRateQuery(enableK8sCluster, "otelcol_receiver_refused_spans_total", "otelcol_receiver_accepted_spans_total"),
           'The receiver could not push some spans to the pipeline.',
           'The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.',
           '5m',
         ),
 
-        // The exporter failed to send spans to their destination.
+        // The exporter success rate is below 95%.
         // There could be an issue with the payload or with the destination endpoint.
         alert.newRule(
           'OtelcolExporterFailedSpans',
-          if enableK8sCluster then
-            'sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0'
-          else
-            'sum by (job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0'
-          ,
-          'The exporter failed to send spans to their destination.',
+          successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_spans_total", "otelcol_exporter_sent_spans_total"),
+          'The exporter span sending success rate is below 95%.',
           'The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.',
-          '5m',
+          '10m',
         ),
       ]
     )

From 627488ab4b4c362ed41cd2ec0bc5eb706ed72bec Mon Sep 17 00:00:00 2001
From: Piotr <17101802+thampiotr@users.noreply.github.com>
Date: Fri, 20 Sep 2024 14:34:27 +0200
Subject: [PATCH 2/3] comments

---
 operations/alloy-mixin/alerts/opentelemetry.libsonnet | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/operations/alloy-mixin/alerts/opentelemetry.libsonnet b/operations/alloy-mixin/alerts/opentelemetry.libsonnet
index 72efa0173..0150ea593 100644
--- a/operations/alloy-mixin/alerts/opentelemetry.libsonnet
+++ b/operations/alloy-mixin/alerts/opentelemetry.libsonnet
@@ -16,15 +16,15 @@ local alert = import './utils/alert.jsonnet';
     alert.newGroup(
       'alloy_otelcol',
       [
-        // An otelcol.exporter component rcould not push some spans to the pipeline.
+        // An otelcol.receiver component could not push over 5% of spans to the pipeline.
         // This could be due to reaching a limit such as the ones
         // imposed by otelcol.processor.memory_limiter.
         alert.newRule(
           'OtelcolReceiverRefusedSpans',
           successRateQuery(enableK8sCluster, "otelcol_receiver_refused_spans_total", "otelcol_receiver_accepted_spans_total"),
-          'The receiver could not push some spans to the pipeline.',
+          'The receiver pushing spans to the pipeline success rate is below 95%.',
           'The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter.',
-          '5m',
+          '10m',
         ),
 
         // The exporter success rate is below 95%.
@@ -32,7 +32,7 @@ local alert = import './utils/alert.jsonnet';
         alert.newRule(
           'OtelcolExporterFailedSpans',
           successRateQuery(enableK8sCluster, "otelcol_exporter_send_failed_spans_total", "otelcol_exporter_sent_spans_total"),
-          'The exporter span sending success rate is below 95%.',
+          'The exporter sending spans success rate is below 95%.',
           'The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint.',
           '10m',
         ),

From 0e37512b3ff3a32f605b20f01b5113fd23970825 Mon Sep 17 00:00:00 2001
From: Piotr <17101802+thampiotr@users.noreply.github.com>
Date: Fri, 20 Sep 2024 14:38:17 +0200
Subject: [PATCH 3/3] changelog

---
 CHANGELOG.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 30d401597..c1fc54d8d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,8 @@ Main (unreleased)
 
 - Small fix in UI stylesheet to fit more content into visible table area. (@defanator)
 
+- Changed OTEL alerts in Alloy mixin to use success rate for tracing. (@thampiotr)
+
 v1.4.0-rc.3
 -----------------
 