Skip to content

Commit

Permalink
reduce prometheus metric cardinality (#950)
Browse files Browse the repository at this point in the history
  • Loading branch information
GavinBurris42 authored Jan 25, 2024
1 parent aed263a commit 2b50c31
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 8 deletions.
9 changes: 5 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -573,10 +573,11 @@ For build instructions please consult [BUILD.md](./BUILD.md).
## Metrics
Available Prometheus metrics:

| Metric name | Description |
| -------------- | ------------------------------------- |
| `actions_node` | Number of actions per node |
| `events_error` | Number of errors in events processing |
| Metric name | Description |
| -------------- | -------------------------------------------------------------------|
| `actions` | Number of actions |
| `actions_node` | Number of actions per node (Deprecated: Use actions metric instead)|
| `events_error` | Number of errors in events processing |


## Communication
Expand Down
2 changes: 1 addition & 1 deletion config/helm/aws-node-termination-handler/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,6 @@ The configuration in this table applies to AWS Node Termination Handler testing

## Metrics Endpoint Considerations

AWS Node Termination HAndler in IMDS mode runs as a DaemonSet with `useHostNetwork: true` by default. If the Prometheus server is enabled with `enablePrometheusServer: true` nothing else will be able to bind to the configured port (by default `prometheusServerPort: 9092`) in the root network namespace. Therefore, it will need to have a firewall/security group configured on the nodes to block access to the `/metrics` endpoint.
AWS Node Termination Handler in IMDS mode runs as a DaemonSet with `useHostNetwork: true` by default. If the Prometheus server is enabled with `enablePrometheusServer: true` nothing else will be able to bind to the configured port (by default `prometheusServerPort: 9092`) in the root network namespace. Therefore, it will need to have a firewall/security group configured on the nodes to block access to the `/metrics` endpoint.

You can switch NTH in IMDS mode to run w/ `useHostNetwork: false`, but you will need to make sure that IMDSv1 is enabled or IMDSv2 IP hop count will need to be incremented to 2 (see the [IMDSv2 documentation](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/configuring-instance-metadata-service.html).
16 changes: 16 additions & 0 deletions pkg/observability/opentelemetry.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ type Metrics struct {
enabled bool
meter api.Meter
actionsCounter instrument.Int64Counter
actionsCounterV2 instrument.Int64Counter
errorEventsCounter instrument.Int64Counter
}

Expand Down Expand Up @@ -89,25 +90,39 @@ func (m Metrics) NodeActionsInc(action, nodeName string, eventID string, err err
}

labels := []attribute.KeyValue{labelNodeActionKey.String(action), labelNodeNameKey.String(nodeName), labelEventIDKey.String(eventID)}
labelsV2 := []attribute.KeyValue{labelNodeActionKey.String(action)}
if err != nil {
labels = append(labels, labelNodeStatusKey.String("error"))
labelsV2 = append(labelsV2, labelNodeStatusKey.String("error"))
} else {
labels = append(labels, labelNodeStatusKey.String("success"))
labelsV2 = append(labelsV2, labelNodeStatusKey.String("success"))
}

m.actionsCounter.Add(context.Background(), 1, labels...)
m.actionsCounterV2.Add(context.Background(), 1, labelsV2...)
}

func registerMetricsWith(provider *metric.MeterProvider) (Metrics, error) {
meter := provider.Meter("aws.node.termination.handler")

// Deprecated: actionsCounter metric has a high label cardinality, resulting in numerous time-series which utilize
// a large amount of memory. Use actionsCounterV2 metric instead.
name := "actions.node"
actionsCounter, err := meter.Int64Counter(name, instrument.WithDescription("Number of actions per node"))
if err != nil {
return Metrics{}, fmt.Errorf("failed to create Prometheus counter %q: %w", name, err)
}
actionsCounter.Add(context.Background(), 0)

// Recommended replacement for actionsCounter metric
name = "actions"
actionsCounterV2, err := meter.Int64Counter(name, instrument.WithDescription("Number of actions"))
if err != nil {
return Metrics{}, fmt.Errorf("failed to create Prometheus counter %q: %w", name, err)
}
actionsCounterV2.Add(context.Background(), 0)

name = "events.error"
errorEventsCounter, err := meter.Int64Counter(name, instrument.WithDescription("Number of errors in events processing"))
if err != nil {
Expand All @@ -118,6 +133,7 @@ func registerMetricsWith(provider *metric.MeterProvider) (Metrics, error) {
meter: meter,
errorEventsCounter: errorEventsCounter,
actionsCounter: actionsCounter,
actionsCounterV2: actionsCounterV2,
}, nil
}

Expand Down
22 changes: 19 additions & 3 deletions test/e2e/prometheus-metrics-test
Original file line number Diff line number Diff line change
Expand Up @@ -162,11 +162,27 @@ for i in $(seq 1 $TAINT_CHECK_CYCLES); do
fi
done
if [ -z $failed ]; then
exit 0
break
fi
echo "Metrics Loop $i/$TAINT_CHECK_CYCLES, sleeping for $TAINT_CHECK_SLEEP seconds"
sleep $TAINT_CHECK_SLEEP
done

echo "❌ Failed checking metric for $METRIC"
exit 3
if [[ -n $failed ]];then
exit 4
fi

metric_name="actions_total"
for action in cordon-and-drain pre-drain; do
labels='node_action="'$action'",node_status="success",otel_scope_name="aws.node.termination.handler",otel_scope_version=""'
query="$metric_name{$labels}"
counter_value=$(echo "$METRICS_RESPONSE" | grep -E "${query}[[:space:]]+[0-9]+" | awk '{print $NF}')
if (($counter_value <= 1)); then
echo "❌ Failed counter count for metric action:$action"
exit 5
fi
echo "✅ Fetched counter:$counter_value for metric with action:$action"
done


exit 0

0 comments on commit 2b50c31

Please sign in to comment.