From 16dc613279096b156c56e8028f3bbcc3514a4736 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 4 Nov 2021 16:59:46 +0100 Subject: [PATCH] pmem-csi: avoid false negative liveness probe During a long-running DeleteVolume for a 200GB volume the LVM lock is held and the liveness probe for metrics data timed out after a minute, killing the container, because metrics retrieval must take the LVM lock. As operation duration is non-deterministic, the solution is to probe only metrics data which can be served immediately when the process is up and running. This is less precise (= doesn't really check any functionality), but that's an inherent problem of liveness probes: they simply cannot cover the full functionality. (cherry picked from commit 3e92fe3aae0652691e6bb5c5f8ad631d6c2406cc) --- deploy/kubernetes-1.19/direct/pmem-csi.yaml | 8 +-- .../direct/testing/pmem-csi.yaml | 8 +-- deploy/kubernetes-1.19/lvm/pmem-csi.yaml | 8 +-- .../kubernetes-1.19/lvm/testing/pmem-csi.yaml | 8 +-- .../pmem-csi-direct-testing.yaml | 8 +-- deploy/kubernetes-1.19/pmem-csi-direct.yaml | 8 +-- .../kubernetes-1.19/pmem-csi-lvm-testing.yaml | 8 +-- deploy/kubernetes-1.19/pmem-csi-lvm.yaml | 8 +-- deploy/kubernetes-1.20/direct/pmem-csi.yaml | 8 +-- .../direct/testing/pmem-csi.yaml | 8 +-- deploy/kubernetes-1.20/lvm/pmem-csi.yaml | 8 +-- .../kubernetes-1.20/lvm/testing/pmem-csi.yaml | 8 +-- .../pmem-csi-direct-testing.yaml | 8 +-- deploy/kubernetes-1.20/pmem-csi-direct.yaml | 8 +-- .../kubernetes-1.20/pmem-csi-lvm-testing.yaml | 8 +-- deploy/kubernetes-1.20/pmem-csi-lvm.yaml | 8 +-- deploy/kubernetes-1.21/direct/pmem-csi.yaml | 8 +-- .../direct/testing/pmem-csi.yaml | 8 +-- deploy/kubernetes-1.21/lvm/pmem-csi.yaml | 8 +-- .../kubernetes-1.21/lvm/testing/pmem-csi.yaml | 8 +-- .../pmem-csi-direct-testing.yaml | 8 +-- deploy/kubernetes-1.21/pmem-csi-direct.yaml | 8 +-- .../kubernetes-1.21/pmem-csi-lvm-testing.yaml | 8 +-- deploy/kubernetes-1.21/pmem-csi-lvm.yaml | 8 +-- 
.../kustomize/patches/metrics-controller.yaml | 4 +- deploy/kustomize/patches/metrics-node.yaml | 13 ++-- pkg/pmem-csi-driver/pmem-csi-driver.go | 4 ++ .../deployment/controller_driver.go | 16 ++--- test/e2e/metrics/metrics.go | 67 +++++++++++++------ 29 files changed, 166 insertions(+), 130 deletions(-) diff --git a/deploy/kubernetes-1.19/direct/pmem-csi.yaml b/deploy/kubernetes-1.19/direct/pmem-csi.yaml index 3f86697b8..34bc72630 100644 --- a/deploy/kubernetes-1.19/direct/pmem-csi.yaml +++ b/deploy/kubernetes-1.19/direct/pmem-csi.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.19/direct/testing/pmem-csi.yaml b/deploy/kubernetes-1.19/direct/testing/pmem-csi.yaml index 4bccb93f0..d1203cc3a 100644 --- a/deploy/kubernetes-1.19/direct/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.19/direct/testing/pmem-csi.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: 
/metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.19/lvm/pmem-csi.yaml b/deploy/kubernetes-1.19/lvm/pmem-csi.yaml index 8ca23751c..9a4639648 100644 --- a/deploy/kubernetes-1.19/lvm/pmem-csi.yaml +++ b/deploy/kubernetes-1.19/lvm/pmem-csi.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.19/lvm/testing/pmem-csi.yaml b/deploy/kubernetes-1.19/lvm/testing/pmem-csi.yaml index a586ab89b..ffa7b95f2 100644 --- a/deploy/kubernetes-1.19/lvm/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.19/lvm/testing/pmem-csi.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.19/pmem-csi-direct-testing.yaml b/deploy/kubernetes-1.19/pmem-csi-direct-testing.yaml index 4bccb93f0..d1203cc3a 100644 --- a/deploy/kubernetes-1.19/pmem-csi-direct-testing.yaml 
+++ b/deploy/kubernetes-1.19/pmem-csi-direct-testing.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.19/pmem-csi-direct.yaml b/deploy/kubernetes-1.19/pmem-csi-direct.yaml index 3f86697b8..34bc72630 100644 --- a/deploy/kubernetes-1.19/pmem-csi-direct.yaml +++ b/deploy/kubernetes-1.19/pmem-csi-direct.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.19/pmem-csi-lvm-testing.yaml b/deploy/kubernetes-1.19/pmem-csi-lvm-testing.yaml index a586ab89b..ffa7b95f2 100644 --- a/deploy/kubernetes-1.19/pmem-csi-lvm-testing.yaml +++ b/deploy/kubernetes-1.19/pmem-csi-lvm-testing.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: 
failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.19/pmem-csi-lvm.yaml b/deploy/kubernetes-1.19/pmem-csi-lvm.yaml index 8ca23751c..9a4639648 100644 --- a/deploy/kubernetes-1.19/pmem-csi-lvm.yaml +++ b/deploy/kubernetes-1.19/pmem-csi-lvm.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.20/direct/pmem-csi.yaml b/deploy/kubernetes-1.20/direct/pmem-csi.yaml index 3f86697b8..34bc72630 100644 --- a/deploy/kubernetes-1.20/direct/pmem-csi.yaml +++ b/deploy/kubernetes-1.20/direct/pmem-csi.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ 
spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.20/direct/testing/pmem-csi.yaml b/deploy/kubernetes-1.20/direct/testing/pmem-csi.yaml index 4bccb93f0..d1203cc3a 100644 --- a/deploy/kubernetes-1.20/direct/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.20/direct/testing/pmem-csi.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.20/lvm/pmem-csi.yaml b/deploy/kubernetes-1.20/lvm/pmem-csi.yaml index 8ca23751c..9a4639648 100644 --- a/deploy/kubernetes-1.20/lvm/pmem-csi.yaml +++ b/deploy/kubernetes-1.20/lvm/pmem-csi.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.20/lvm/testing/pmem-csi.yaml b/deploy/kubernetes-1.20/lvm/testing/pmem-csi.yaml index 
a586ab89b..ffa7b95f2 100644 --- a/deploy/kubernetes-1.20/lvm/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.20/lvm/testing/pmem-csi.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.20/pmem-csi-direct-testing.yaml b/deploy/kubernetes-1.20/pmem-csi-direct-testing.yaml index 4bccb93f0..d1203cc3a 100644 --- a/deploy/kubernetes-1.20/pmem-csi-direct-testing.yaml +++ b/deploy/kubernetes-1.20/pmem-csi-direct-testing.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.20/pmem-csi-direct.yaml b/deploy/kubernetes-1.20/pmem-csi-direct.yaml index 3f86697b8..34bc72630 100644 --- a/deploy/kubernetes-1.20/pmem-csi-direct.yaml +++ b/deploy/kubernetes-1.20/pmem-csi-direct.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: 
/metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.20/pmem-csi-lvm-testing.yaml b/deploy/kubernetes-1.20/pmem-csi-lvm-testing.yaml index a586ab89b..ffa7b95f2 100644 --- a/deploy/kubernetes-1.20/pmem-csi-lvm-testing.yaml +++ b/deploy/kubernetes-1.20/pmem-csi-lvm-testing.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.20/pmem-csi-lvm.yaml b/deploy/kubernetes-1.20/pmem-csi-lvm.yaml index 8ca23751c..9a4639648 100644 --- a/deploy/kubernetes-1.20/pmem-csi-lvm.yaml +++ b/deploy/kubernetes-1.20/pmem-csi-lvm.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: 
failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.21/direct/pmem-csi.yaml b/deploy/kubernetes-1.21/direct/pmem-csi.yaml index 5cca737be..f156a7934 100644 --- a/deploy/kubernetes-1.21/direct/pmem-csi.yaml +++ b/deploy/kubernetes-1.21/direct/pmem-csi.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.21/direct/testing/pmem-csi.yaml b/deploy/kubernetes-1.21/direct/testing/pmem-csi.yaml index e78d4bc68..75326f608 100644 --- a/deploy/kubernetes-1.21/direct/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.21/direct/testing/pmem-csi.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics 
scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.21/lvm/pmem-csi.yaml b/deploy/kubernetes-1.21/lvm/pmem-csi.yaml index 04f823d9a..84c1509cd 100644 --- a/deploy/kubernetes-1.21/lvm/pmem-csi.yaml +++ b/deploy/kubernetes-1.21/lvm/pmem-csi.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.21/lvm/testing/pmem-csi.yaml b/deploy/kubernetes-1.21/lvm/testing/pmem-csi.yaml index 7924bf077..847b87ffb 100644 --- a/deploy/kubernetes-1.21/lvm/testing/pmem-csi.yaml +++ b/deploy/kubernetes-1.21/lvm/testing/pmem-csi.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.21/pmem-csi-direct-testing.yaml b/deploy/kubernetes-1.21/pmem-csi-direct-testing.yaml index e78d4bc68..75326f608 100644 --- a/deploy/kubernetes-1.21/pmem-csi-direct-testing.yaml +++ 
b/deploy/kubernetes-1.21/pmem-csi-direct-testing.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.21/pmem-csi-direct.yaml b/deploy/kubernetes-1.21/pmem-csi-direct.yaml index 5cca737be..f156a7934 100644 --- a/deploy/kubernetes-1.21/pmem-csi-direct.yaml +++ b/deploy/kubernetes-1.21/pmem-csi-direct.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.21/pmem-csi-lvm-testing.yaml b/deploy/kubernetes-1.21/pmem-csi-lvm-testing.yaml index 7924bf077..847b87ffb 100644 --- a/deploy/kubernetes-1.21/pmem-csi-lvm-testing.yaml +++ b/deploy/kubernetes-1.21/pmem-csi-lvm-testing.yaml @@ -383,7 +383,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -402,7 +402,7 @@ spec: startupProbe: 
failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -482,7 +482,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -502,7 +502,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kubernetes-1.21/pmem-csi-lvm.yaml b/deploy/kubernetes-1.21/pmem-csi-lvm.yaml index 04f823d9a..84c1509cd 100644 --- a/deploy/kubernetes-1.21/pmem-csi-lvm.yaml +++ b/deploy/kubernetes-1.21/pmem-csi-lvm.yaml @@ -382,7 +382,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -401,7 +401,7 @@ spec: startupProbe: failureThreshold: 60 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 @@ -480,7 +480,7 @@ spec: livenessProbe: failureThreshold: 6 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 10 @@ -500,7 +500,7 @@ spec: startupProbe: failureThreshold: 300 httpGet: - path: /metrics + path: /metrics/simple port: metrics scheme: HTTP periodSeconds: 1 diff --git a/deploy/kustomize/patches/metrics-controller.yaml b/deploy/kustomize/patches/metrics-controller.yaml index d3293a0ca..b8ac31c16 100644 --- a/deploy/kustomize/patches/metrics-controller.yaml +++ b/deploy/kustomize/patches/metrics-controller.yaml @@ -21,7 +21,7 @@ # then it is alive. httpGet: scheme: HTTP - path: /metrics + path: /metrics/simple port: metrics # Allow it to for a total duration of one minute. # This is conservative because the probe is new. @@ -34,7 +34,7 @@ value: httpGet: scheme: HTTP - path: /metrics + path: /metrics/simple port: metrics # Check more frequently while the container starts up # to get it into a ready state quickly. 
diff --git a/deploy/kustomize/patches/metrics-node.yaml b/deploy/kustomize/patches/metrics-node.yaml index 94ac74060..2b5116252 100644 --- a/deploy/kustomize/patches/metrics-node.yaml +++ b/deploy/kustomize/patches/metrics-node.yaml @@ -18,11 +18,16 @@ path: /spec/template/spec/containers/0/livenessProbe value: # If the PMEM-CSI driver is able to serve metrics, - # then it is alive. In particular this covers capacity - # checking. + # then it is alive, for some definition of "alive". + # + # In particular this does *not* cover capacity + # checking, because that needs to take a lock + # which can take an unpredictable amount of time + # when there is an operation in progress like + # scrubbing a volume. httpGet: scheme: HTTP - path: /metrics + path: /metrics/simple port: metrics # Allow it to for a total duration of one minute. # This is conservative because the probe is new. @@ -35,7 +40,7 @@ value: httpGet: scheme: HTTP - path: /metrics + path: /metrics/simple port: metrics # Startup may be slower when LVM needs to be set up first. # Check more frequently to get it into a ready state quickly. 
diff --git a/pkg/pmem-csi-driver/pmem-csi-driver.go b/pkg/pmem-csi-driver/pmem-csi-driver.go index 2a6faa8ff..32fe818bc 100644 --- a/pkg/pmem-csi-driver/pmem-csi-driver.go +++ b/pkg/pmem-csi-driver/pmem-csi-driver.go @@ -87,10 +87,13 @@ var ( }, []string{"version"}, ) + + simpleMetrics = prometheus.NewPedanticRegistry() ) func init() { prometheus.MustRegister(buildInfo) + simpleMetrics.MustRegister(buildInfo) } //Config type for driver configuration @@ -388,6 +391,7 @@ func (csid *csiDriver) startMetrics(ctx context.Context, cancel func()) (string, promhttp.HandlerFor(csid.gatherers, promhttp.HandlerOpts{}), ), ) + mux.Handle(csid.cfg.metricsPath+"/simple", promhttp.HandlerFor(simpleMetrics, promhttp.HandlerOpts{})) return csid.startHTTPSServer(ctx, cancel, csid.cfg.metricsListen, mux, false /* no TLS */) } diff --git a/pkg/pmem-csi-operator/controller/deployment/controller_driver.go b/pkg/pmem-csi-operator/controller/deployment/controller_driver.go index d28245511..af6a97ad0 100644 --- a/pkg/pmem-csi-operator/controller/deployment/controller_driver.go +++ b/pkg/pmem-csi-operator/controller/deployment/controller_driver.go @@ -1430,8 +1430,8 @@ func (d *pmemCSIDeployment) getControllerContainer() corev1.Container { SecurityContext: &corev1.SecurityContext{ ReadOnlyRootFilesystem: &true, }, - LivenessProbe: getMetricsProbe(6, 10), - StartupProbe: getMetricsProbe(60, 1), + LivenessProbe: getMetricsProbe(6, 10, "/simple"), + StartupProbe: getMetricsProbe(60, 1, "/simple"), } if d.Spec.ControllerTLSSecret != "" { @@ -1515,8 +1515,8 @@ func (d *pmemCSIDeployment) getNodeDriverContainer() corev1.Container { }, TerminationMessagePath: "/tmp/termination-log", TerminationMessagePolicy: corev1.TerminationMessageReadFile, - LivenessProbe: getMetricsProbe(6, 10), - StartupProbe: getMetricsProbe(300, 1), + LivenessProbe: getMetricsProbe(6, 10, "/simple"), + StartupProbe: getMetricsProbe(300, 1, "/simple"), } return c @@ -1565,8 +1565,8 @@ func (d *pmemCSIDeployment) 
getProvisionerContainer() corev1.Container { }, TerminationMessagePath: corev1.TerminationMessagePathDefault, TerminationMessagePolicy: corev1.TerminationMessageReadFile, - LivenessProbe: getMetricsProbe(6, 10), - StartupProbe: getMetricsProbe(300, 1), + LivenessProbe: getMetricsProbe(6, 10, ""), + StartupProbe: getMetricsProbe(300, 1, ""), } if d.withStorageCapacity() { @@ -1802,12 +1802,12 @@ func (d *pmemCSIDeployment) getObjectMeta(name string, isClusterResource bool) m return meta } -func getMetricsProbe(failureThreshold int32, periodSeconds int32) *corev1.Probe { +func getMetricsProbe(failureThreshold int32, periodSeconds int32, pathSuffix string) *corev1.Probe { return &corev1.Probe{ Handler: corev1.Handler{ HTTPGet: &corev1.HTTPGetAction{ Scheme: "HTTP", - Path: "/metrics", + Path: "/metrics" + pathSuffix, Port: intstr.FromString("metrics"), }, }, diff --git a/test/e2e/metrics/metrics.go b/test/e2e/metrics/metrics.go index 1752df519..6259b4512 100644 --- a/test/e2e/metrics/metrics.go +++ b/test/e2e/metrics/metrics.go @@ -17,6 +17,7 @@ import ( . "github.com/onsi/ginkgo" . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/klog/v2/klogr" "k8s.io/kubernetes/test/e2e/framework" @@ -51,15 +52,8 @@ var _ = deploy.Describe("direct-testing", "direct-testing-metrics", "", func(d * } }) - It("works", func() { - // WaitForPMEMDriver already verified that "version" - // is returned and that "pmem_nodes" is correct. - // Here we check metrics support of each pod (= annotations + - // metrics endpoint). 
- pods, err := f.ClientSet.CoreV1().Pods(d.Namespace).List(context.Background(), metav1.ListOptions{}) - framework.ExpectNoError(err, "list pods") - - test := func() { + Context("data", func() { + testData := func(simple bool, pods *corev1.PodList) { numPods := 0 for _, pod := range pods.Items { if pod.Annotations["pmem-csi.intel.com/scrape"] != "containers" { @@ -69,6 +63,10 @@ var _ = deploy.Describe("direct-testing", "direct-testing-metrics", "", func(d * numPorts := 0 for _, container := range pod.Spec.Containers { + isPmemCSI := strings.HasPrefix(container.Name, "pmem") + if simple && !isPmemCSI { + continue + } for _, port := range container.Ports { if port.Name == "metrics" { numPorts++ @@ -80,6 +78,9 @@ var _ = deploy.Describe("direct-testing", "direct-testing-metrics", "", func(d * url := fmt.Sprintf("http://%s.%s:%d/metrics", pod.Namespace, pod.Name, port.ContainerPort) + if simple { + url += "/simple" + } resp, err := client.Get(url) framework.ExpectNoError(err, "GET failed") // When wrapped with InterceptGomegaFailures, err == nil doesn't @@ -92,16 +93,22 @@ var _ = deploy.Describe("direct-testing", "direct-testing-metrics", "", func(d * data, err := ioutil.ReadAll(resp.Body) framework.ExpectNoError(err, "read GET response") name := pod.Name + "/" + container.Name - if strings.HasPrefix(container.Name, "pmem") { - Expect(data).To(ContainSubstring("go_threads "), name) - Expect(data).To(ContainSubstring("process_open_fds "), name) + if isPmemCSI { + Expect(data).To(ContainSubstring("build_info"), name) + expect := Expect(data).To + if simple { + // All other metrics are not part of metrics/simple. + expect = Expect(data).NotTo + } + expect(ContainSubstring("go_threads "), name) + expect(ContainSubstring("process_open_fds "), name) if !strings.Contains(pod.Name, "controller") { // Only the node driver implements CSI and manages volumes. 
- Expect(data).To(ContainSubstring("csi_plugin_operations_seconds "), name) - Expect(data).To(ContainSubstring("pmem_amount_available "), name) - Expect(data).To(ContainSubstring("pmem_amount_managed "), name) - Expect(data).To(ContainSubstring("pmem_amount_max_volume_size "), name) - Expect(data).To(ContainSubstring("pmem_amount_total "), name) + expect(ContainSubstring("csi_plugin_operations_seconds "), name) + expect(ContainSubstring("pmem_amount_available "), name) + expect(ContainSubstring("pmem_amount_managed "), name) + expect(ContainSubstring("pmem_amount_max_volume_size "), name) + expect(ContainSubstring("pmem_amount_total "), name) } } else { Expect(data).To(ContainSubstring("csi_sidecar_operations_seconds "), name) @@ -113,9 +120,29 @@ var _ = deploy.Describe("direct-testing", "direct-testing-metrics", "", func(d * } Expect(numPods).NotTo(Equal(0), "at least one container should have a 'metrics' port") } - Eventually(func() string { - return strings.Join(InterceptGomegaFailures(test), "\n") - }, "10s", "1s").Should(BeEmpty()) + + test := func(simple bool) { + // WaitForPMEMDriver already verified that "version" + // is returned and that "pmem_nodes" is correct. + // Here we check metrics support of each pod (= annotations + + // metrics endpoint). + pods, err := f.ClientSet.CoreV1().Pods(d.Namespace).List(context.Background(), metav1.ListOptions{}) + framework.ExpectNoError(err, "list pods") + + Eventually(func() string { + return strings.Join(InterceptGomegaFailures(func() { + testData(simple, pods) + }), "\n") + }, "10s", "1s").Should(BeEmpty()) + } + + It("full", func() { + test(false) + }) + + It("simple", func() { + test(true) + }) }) It("rejects large headers", func() {