From 5ec234525c06347751bbb215e25454c99d1ff110 Mon Sep 17 00:00:00 2001
From: Tommy Sauer
Date: Tue, 15 Oct 2024 10:20:42 +0200
Subject: [PATCH] adding alert for exhausted PVC (#7200)

* adding alert for exhausted PVC

* adding hard boundary for > 90% usage
---
Review note (ignored by git am): this patch was restored from a copy whose
line breaks and indentation had been collapsed. The line layout below is
consistent with the hunk headers (5/5 and 19/32 lines) and with the diffstat
(21 insertions, 8 deletions). The YAML indentation depth and the position of
the blank context line before the PVC usage header are assumed - TODO confirm
against the target file, or apply with --ignore-whitespace.

 .../prometheus-controlplane-rules/Chart.yaml  |  2 +-
 .../alerts/kubernetes-node.alerts             | 27 ++++++++++++++-----
 2 files changed, 21 insertions(+), 8 deletions(-)

diff --git a/prometheus-rules/prometheus-controlplane-rules/Chart.yaml b/prometheus-rules/prometheus-controlplane-rules/Chart.yaml
index b4edf1cba4..b51e55a856 100644
--- a/prometheus-rules/prometheus-controlplane-rules/Chart.yaml
+++ b/prometheus-rules/prometheus-controlplane-rules/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
 name: prometheus-controlplane-rules
-version: 1.0.24
+version: 1.1.0
 description: A collection of Prometheus alerting and aggregation rules for controlplane.
 dependencies: []
diff --git a/prometheus-rules/prometheus-controlplane-rules/alerts/kubernetes-node.alerts b/prometheus-rules/prometheus-controlplane-rules/alerts/kubernetes-node.alerts
index 9f43e039a4..7bdcdf38b5 100644
--- a/prometheus-rules/prometheus-controlplane-rules/alerts/kubernetes-node.alerts
+++ b/prometheus-rules/prometheus-controlplane-rules/alerts/kubernetes-node.alerts
@@ -118,19 +118,32 @@ groups:
       summary: Interface {{ $labels.device }} is down. Node network connectivity is degraded. Check ESX node state in vCenter.
 
   ### PVC usage ###
+  - alert: KubernetesPVCNoSpaceLeft
+    expr: kubelet_volume_stats_available_percent < 10
+    for: 10m
+    labels:
+      tier: k8s
+      support_group: '{{ if $labels.label_ccloud_support_group }}{{ $labels.label_ccloud_support_group }}{{ else }}containers{{ end }}'
+      service: '{{ if $labels.label_ccloud_service }}{{ $labels.label_ccloud_service }}{{ else }}resources{{ end }}'
+      severity: info
+      context: storage
+      meta: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} free space is less than 10%."
+      playbook: 'docs/support/playbook/kubernetes/pvc_usage'
+    annotations:
+      description: "The PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is almost full. Increase or delete files."
+      summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} free space is less than 10%."
 
-  - alert: KubernetesHighPVCUsagePredicted
-    # NOTE: The labels for support-group and service are already present in `kubelet_volume_stats_available_percent`, cf. the aggregation rule that defines it.
-    expr: sum((kubelet_volume_stats_available_percent < 30) and (predict_linear(kubelet_volume_stats_available_percent[1d], 7 * 24 * 3600) < 10)) by (label_ccloud_support_group, label_ccloud_service, namespace, persistentvolumeclaim)
-    for: 1h
+  - alert: KubernetesPVCNoSpaceLeft
+    expr: kubelet_volume_stats_available_percent < 2
+    for: 10m
     labels:
       tier: k8s
       support_group: '{{ if $labels.label_ccloud_support_group }}{{ $labels.label_ccloud_support_group }}{{ else }}containers{{ end }}'
       service: '{{ if $labels.label_ccloud_service }}{{ $labels.label_ccloud_service }}{{ else }}resources{{ end }}'
       severity: warning
       context: storage
-      meta: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is set to exceed 90% usage soon"
+      meta: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage is over 98%."
       playbook: 'docs/support/playbook/kubernetes/pvc_usage'
     annotations:
-      description: "The PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is predicted to exceed 90% storage consumption in the next 7 days."
-      summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is set to exceed 90% usage soon"
+      description: "The PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is full. Programs will stop working if relying upon free storage."
+      summary: "PVC {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} usage is over 98%."