From 2a4791b603e38d557649a2772c1f72c419a09a17 Mon Sep 17 00:00:00 2001 From: Moshe Immermam Date: Thu, 21 Nov 2024 00:18:05 +0200 Subject: [PATCH] chore: pod health fixes --- pkg/health/health.go | 1 - pkg/health/health_deployment.go | 2 +- pkg/health/health_pod.go | 11 ++- pkg/health/health_test.go | 4 +- .../Kubernetes/Pod/container-creating.yaml | 78 +++++++++++++++++++ .../testdata/Kubernetes/Pod/pod-pending.yaml | 1 + 6 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 pkg/health/testdata/Kubernetes/Pod/container-creating.yaml diff --git a/pkg/health/health.go b/pkg/health/health.go index dab6de6..e05097b 100644 --- a/pkg/health/health.go +++ b/pkg/health/health.go @@ -91,7 +91,6 @@ const ( HealthStatusEvicted HealthStatusCode = "Evicted" HealthStatusCompleted HealthStatusCode = "Completed" HealthStatusCrashLoopBackoff HealthStatusCode = "CrashLoopBackOff" - HealthStatusCrashLoop HealthStatusCode = "CrashLoop" HealthStatusCrashed HealthStatusCode = "Crashed" HealthStatusCreating HealthStatusCode = "Creating" HealthStatusDeleted HealthStatusCode = "Deleted" diff --git a/pkg/health/health_deployment.go b/pkg/health/health_deployment.go index 0903804..69cae8f 100644 --- a/pkg/health/health_deployment.go +++ b/pkg/health/health_deployment.go @@ -80,7 +80,7 @@ func getReplicaHealth(s ReplicaStatus) *HealthStatus { hs.Status = HealthStatusStarting } else if s.Ready == 0 && !isStarting { hs.Health = HealthUnhealthy - hs.Status = HealthStatusCrashLoop + hs.Status = HealthStatusCrashLoopBackoff } else if s.Desired == 0 && s.Replicas > 0 { hs.Status = HealthStatusScalingDown hs.Health = lo.Ternary(isProgressDeadlineExceeded, HealthWarning, HealthHealthy) diff --git a/pkg/health/health_pod.go b/pkg/health/health_pod.go index 9197207..fe8bdc3 100644 --- a/pkg/health/health_pod.go +++ b/pkg/health/health_pod.go @@ -158,11 +158,16 @@ func getCorev1PodHealth(pod *corev1.Pod) (*HealthStatus, error) { hr.Health = hr.Health.Worst(lo.Ternary(isReady, HealthHealthy, HealthUnhealthy)) } - if isStarting && hr.Health.IsWorseThan(HealthWarning) && - (terminated != nil && terminated.Status != HealthStatusOOMKilled) { + if isStarting && hr.Health.IsWorseThan(HealthWarning) { hr.Health = HealthUnknown - hr.Message = fmt.Sprintf("%s %s", string(hr.Status), hr.Message) + hr.Message = strings.TrimSpace(fmt.Sprintf("%s %s", string(hr.Status), hr.Message)) hr.Status = HealthStatusStarting + + if terminated != nil && terminated.Status == HealthStatusOOMKilled { + // an OOMKilled on startup is likely not going to resolve after some time + hr.Health = HealthUnhealthy + hr.Status = "Starting OOMKilled" + } } return &hr, nil diff --git a/pkg/health/health_test.go b/pkg/health/health_test.go index 1139539..e17b26a 100644 --- a/pkg/health/health_test.go +++ b/pkg/health/health_test.go @@ -409,7 +409,7 @@ func TestStatefulSetHealth(t *testing.T) { assertAppHealthMsg( t, "./testdata/statefulset-starting.yaml", - health.HealthStatusCrashLoop, + health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, true, "0/1 ready", @@ -419,7 +419,7 @@ func TestStatefulSetHealth(t *testing.T) { assertAppHealthMsg( t, "./testdata/statefulset-starting.yaml", - health.HealthStatusCrashLoop, + health.HealthStatusCrashLoopBackoff, health.HealthUnhealthy, true, "0/1 ready", diff --git a/pkg/health/testdata/Kubernetes/Pod/container-creating.yaml b/pkg/health/testdata/Kubernetes/Pod/container-creating.yaml new file mode 100644 index 0000000..b3dd84d --- /dev/null +++ b/pkg/health/testdata/Kubernetes/Pod/container-creating.yaml @@ -0,0 +1,78 @@ +apiVersion: v1 +kind: Pod +metadata: + creationTimestamp: "@now-5m" + name: image-pull-backoff + namespace: argocd + annotations: + expected-status: Starting + expected-health: unknown + resourceVersion: "155333" + selfLink: /api/v1/namespaces/argocd/pods/image-pull-backoff + uid: 46c1e8de-f61b-11e8-a057-fe5f49266390 +spec: + containers: + - image: does-not-exist + imagePullPolicy: Always + name: main + resources: {} + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + volumeMounts: + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount + name: default-token-f9jvj + readOnly: true + dnsPolicy: ClusterFirst + nodeName: minikube + restartPolicy: Always + schedulerName: default-scheduler + securityContext: {} + serviceAccount: default + serviceAccountName: default + terminationGracePeriodSeconds: 30 + tolerations: + - effect: NoExecute + key: node.kubernetes.io/not-ready + operator: Exists + tolerationSeconds: 300 + - effect: NoExecute + key: node.kubernetes.io/unreachable + operator: Exists + tolerationSeconds: 300 + volumes: + - name: default-token-f9jvj + secret: + defaultMode: 420 + secretName: default-token-f9jvj +status: + conditions: + - lastProbeTime: null + lastTransitionTime: 2018-12-02T10:16:04Z + status: "True" + type: Initialized + - lastProbeTime: null + lastTransitionTime: 2018-12-02T10:16:04Z + message: 'containers with unready status: [main]' + reason: ContainersNotReady + status: "False" + type: Ready + - lastProbeTime: null + lastTransitionTime: 2018-12-02T10:16:04Z + status: "True" + type: PodScheduled + containerStatuses: + - image: does-not-exist + imageID: "" + lastState: {} + name: main + ready: false + started: false + restartCount: 0 + state: + waiting: + reason: ContainerCreating + hostIP: 192.168.64.41 + phase: Pending + podIP: 172.17.0.9 + qosClass: BestEffort + startTime: 2018-12-02T10:16:04Z diff --git a/pkg/health/testdata/Kubernetes/Pod/pod-pending.yaml b/pkg/health/testdata/Kubernetes/Pod/pod-pending.yaml index 6a99ec1..151be39 100644 --- a/pkg/health/testdata/Kubernetes/Pod/pod-pending.yaml +++ b/pkg/health/testdata/Kubernetes/Pod/pod-pending.yaml @@ -66,6 +66,7 @@ status: lastState: {} name: main ready: false + started: true restartCount: 0 state: waiting: