From 2b50c31bd707b259835c0fc8e3ecad5f7d512fcb Mon Sep 17 00:00:00 2001 From: Gavin Burris <66969321+GavinBurris42@users.noreply.github.com> Date: Thu, 25 Jan 2024 11:36:25 -0600 Subject: [PATCH] reduce prometheus metric cardinality (#950) --- README.md | 9 ++++---- .../aws-node-termination-handler/README.md | 2 +- pkg/observability/opentelemetry.go | 16 ++++++++++++++ test/e2e/prometheus-metrics-test | 22 ++++++++++++++++--- 4 files changed, 41 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 0a906375..ba8efb71 100644 --- a/README.md +++ b/README.md @@ -573,10 +573,11 @@ For build instructions please consult [BUILD.md](./BUILD.md). ## Metrics Available Prometheus metrics: -| Metric name | Description | -| -------------- | ------------------------------------- | -| `actions_node` | Number of actions per node | -| `events_error` | Number of errors in events processing | +| Metric name | Description | +| -------------- | -------------------------------------------------------------------| +| `actions` | Number of actions | +| `actions_node` | Number of actions per node (Deprecated: Use actions metric instead)| +| `events_error` | Number of errors in events processing | ## Communication diff --git a/config/helm/aws-node-termination-handler/README.md b/config/helm/aws-node-termination-handler/README.md index 17e5d1a9..06ceac1a 100644 --- a/config/helm/aws-node-termination-handler/README.md +++ b/config/helm/aws-node-termination-handler/README.md @@ -174,6 +174,6 @@ The configuration in this table applies to AWS Node Termination Handler testing ## Metrics Endpoint Considerations -AWS Node Termination HAndler in IMDS mode runs as a DaemonSet with `useHostNetwork: true` by default. If the Prometheus server is enabled with `enablePrometheusServer: true` nothing else will be able to bind to the configured port (by default `prometheusServerPort: 9092`) in the root network namespace. Therefore, it will need to have a firewall/security group configured on the nodes to block access to the `/metrics` endpoint. +AWS Node Termination Handler in IMDS mode runs as a DaemonSet with `useHostNetwork: true` by default. If the Prometheus server is enabled with `enablePrometheusServer: true` nothing else will be able to bind to the configured port (by default `prometheusServerPort: 9092`) in the root network namespace. Therefore, it will need to have a firewall/security group configured on the nodes to block access to the `/metrics` endpoint. You can switch NTH in IMDS mode to run w/ `useHostNetwork: false`, but you will need to make sure that IMDSv1 is enabled or IMDSv2 IP hop count will need to be incremented to 2 (see the [IMDSv2 documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html). diff --git a/pkg/observability/opentelemetry.go b/pkg/observability/opentelemetry.go index 91cf1a7f..39a08c05 100644 --- a/pkg/observability/opentelemetry.go +++ b/pkg/observability/opentelemetry.go @@ -45,6 +45,7 @@ type Metrics struct { enabled bool meter api.Meter actionsCounter instrument.Int64Counter + actionsCounterV2 instrument.Int64Counter errorEventsCounter instrument.Int64Counter } @@ -89,18 +90,24 @@ func (m Metrics) NodeActionsInc(action, nodeName string, eventID string, err err } labels := []attribute.KeyValue{labelNodeActionKey.String(action), labelNodeNameKey.String(nodeName), labelEventIDKey.String(eventID)} + labelsV2 := []attribute.KeyValue{labelNodeActionKey.String(action)} if err != nil { labels = append(labels, labelNodeStatusKey.String("error")) + labelsV2 = append(labelsV2, labelNodeStatusKey.String("error")) } else { labels = append(labels, labelNodeStatusKey.String("success")) + labelsV2 = append(labelsV2, labelNodeStatusKey.String("success")) } m.actionsCounter.Add(context.Background(), 1, labels...) + m.actionsCounterV2.Add(context.Background(), 1, labelsV2...) } func registerMetricsWith(provider *metric.MeterProvider) (Metrics, error) { meter := provider.Meter("aws.node.termination.handler") + // Deprecated: actionsCounter metric has a high label cardinality, resulting in numerous time-series which utilize + // a large amount of memory. Use actionsCounterV2 metric instead. name := "actions.node" actionsCounter, err := meter.Int64Counter(name, instrument.WithDescription("Number of actions per node")) if err != nil { @@ -108,6 +115,14 @@ func registerMetricsWith(provider *metric.MeterProvider) (Metrics, error) { } actionsCounter.Add(context.Background(), 0) + // Recommended replacement for actionsCounter metric + name = "actions" + actionsCounterV2, err := meter.Int64Counter(name, instrument.WithDescription("Number of actions")) + if err != nil { + return Metrics{}, fmt.Errorf("failed to create Prometheus counter %q: %w", name, err) + } + actionsCounterV2.Add(context.Background(), 0) + name = "events.error" errorEventsCounter, err := meter.Int64Counter(name, instrument.WithDescription("Number of errors in events processing")) if err != nil { @@ -118,6 +133,7 @@ func registerMetricsWith(provider *metric.MeterProvider) (Metrics, error) { meter: meter, errorEventsCounter: errorEventsCounter, actionsCounter: actionsCounter, + actionsCounterV2: actionsCounterV2, }, nil } diff --git a/test/e2e/prometheus-metrics-test b/test/e2e/prometheus-metrics-test index 4fd998cc..d3932609 100755 --- a/test/e2e/prometheus-metrics-test +++ b/test/e2e/prometheus-metrics-test @@ -162,11 +162,27 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do fi done if [ -z $failed ]; then - exit 0 + break fi echo "Metrics Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds" sleep $TAINT_CHECK_SLEEP done -echo "❌ Failed checking metric for $METRIC" -exit 3 +if [[ -n $failed ]];then + exit 4 +fi + +metric_name="actions_total" +for action in cordon-and-drain pre-drain; do + labels='node_action="'$action'",node_status="success",otel_scope_name="aws.node.termination.handler",otel_scope_version=""' + query="$metric_name{$labels}" + counter_value=$(echo "$METRICS_RESPONSE" | grep -E "${query}[[:space:]]+[0-9]+" | awk '{print $NF}') + if (($counter_value <= 1)); then + echo "❌ Failed counter count for metric action:$action" + exit 5 + fi + echo "✅ Fetched counter:$counter_value for metric with action:$action" +done + + +exit 0