NodeGpuCountChanged: fire when node:gpu_utilization:count differs from
configured_gpu_count (matched on instance) for 5m, instead of firing on any
change of the count within the last 5m.

NOTE(review): the leading whitespace of the YAML lines below was reconstructed
from a whitespace-collapsed copy of this patch -- confirm it against gpu.rules
before applying, or apply with whitespace-insensitive matching.

diff --git a/src/prometheus/deploy/alerting/gpu.rules b/src/prometheus/deploy/alerting/gpu.rules
index 90e670f947..025323da77 100644
--- a/src/prometheus/deploy/alerting/gpu.rules
+++ b/src/prometheus/deploy/alerting/gpu.rules
@@ -71,7 +71,8 @@ groups:
           summary: "found nvidia used by zombie container in {{$labels.instance}}"
 
       - alert: NodeGpuCountChanged
-        expr: changes(node:gpu_utilization:count[5m]) > 0
+        expr: node:gpu_utilization:count != on (instance) configured_gpu_count
+        for: 5m
         labels:
           severity: fatal
         annotations: