From 9470e842fca2d7dd0dae185bff7210a8af355445 Mon Sep 17 00:00:00 2001 From: Andrey Smirnov Date: Mon, 16 Dec 2024 15:06:02 +0400 Subject: [PATCH] test: cleanup failed Kubernetes pods See #9870 Signed-off-by: Andrey Smirnov --- internal/integration/api/apply-config.go | 2 ++ internal/integration/api/common.go | 2 ++ internal/integration/api/etcd.go | 4 ++- internal/integration/api/extensions_qemu.go | 3 +++ internal/integration/api/reboot.go | 3 ++- internal/integration/api/update-hostname.go | 1 + internal/integration/api/volumes.go | 1 + internal/integration/base/api.go | 6 ++++- internal/integration/base/k8s.go | 29 +++++++++++++++++++++ 9 files changed, 48 insertions(+), 3 deletions(-) diff --git a/internal/integration/api/apply-config.go b/internal/integration/api/apply-config.go index 69ed6a6b80..c7c9676d7b 100644 --- a/internal/integration/api/apply-config.go +++ b/internal/integration/api/apply-config.go @@ -128,6 +128,7 @@ func (suite *ApplyConfigSuite) TestApply() { return nil }, assertRebootedRebootTimeout, + suite.CleanupFailedPods, ) // Verify configuration change @@ -319,6 +320,7 @@ func (suite *ApplyConfigSuite) TestApplyConfigRotateEncryptionSecrets() { return nil }, assertRebootedRebootTimeout, + suite.CleanupFailedPods, ) suite.ClearConnectionRefused(suite.ctx, node) diff --git a/internal/integration/api/common.go b/internal/integration/api/common.go index edcd916a3d..fef05b25eb 100644 --- a/internal/integration/api/common.go +++ b/internal/integration/api/common.go @@ -201,6 +201,7 @@ func (suite *CommonSuite) TestBaseOCISpec() { return nil }, assertRebootedRebootTimeout, + suite.CleanupFailedPods, ) suite.ClearConnectionRefused(suite.ctx, node) @@ -239,6 +240,7 @@ func (suite *CommonSuite) TestBaseOCISpec() { return nil }, assertRebootedRebootTimeout, + suite.CleanupFailedPods, ) suite.ClearConnectionRefused(suite.ctx, node) diff --git a/internal/integration/api/etcd.go b/internal/integration/api/etcd.go index 0ac7f204eb..f0182e6fc8 100644 --- 
a/internal/integration/api/etcd.go +++ b/internal/integration/api/etcd.go @@ -26,7 +26,7 @@ import ( // EtcdSuite ... type EtcdSuite struct { - base.APISuite + base.K8sSuite ctx context.Context //nolint:containedctx ctxCancel context.CancelFunc @@ -151,6 +151,7 @@ func (suite *EtcdSuite) TestLeaveCluster() { return err }, 10*time.Minute, + suite.CleanupFailedPods, ) } @@ -259,6 +260,7 @@ func (suite *EtcdSuite) TestRemoveMember() { return err }, 10*time.Minute, + suite.CleanupFailedPods, ) } diff --git a/internal/integration/api/extensions_qemu.go b/internal/integration/api/extensions_qemu.go index 7363253d50..d48f737ac8 100644 --- a/internal/integration/api/extensions_qemu.go +++ b/internal/integration/api/extensions_qemu.go @@ -303,6 +303,7 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsQEMUGuestAgent() { return err }, 5*time.Minute, + suite.CleanupFailedPods, ) } @@ -494,6 +495,7 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsMdADM() { suite.ctx, node, func(nodeCtx context.Context) error { return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx)) }, 5*time.Minute, + suite.CleanupFailedPods, ) suite.Require().True(suite.mdADMArrayExists(), "expected mdadm array to be present") @@ -581,6 +583,7 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsZFS() { suite.ctx, node, func(nodeCtx context.Context) error { return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx)) }, 5*time.Minute, + suite.CleanupFailedPods, ) suite.Require().True(suite.checkZFSPoolMounted(), "expected zfs pool to be mounted") diff --git a/internal/integration/api/reboot.go b/internal/integration/api/reboot.go index 32c4d126c8..dc274990bd 100644 --- a/internal/integration/api/reboot.go +++ b/internal/integration/api/reboot.go @@ -22,7 +22,7 @@ import ( // RebootSuite ... 
type RebootSuite struct { - base.APISuite + base.K8sSuite ctx context.Context //nolint:containedctx ctxCancel context.CancelFunc @@ -66,6 +66,7 @@ func (suite *RebootSuite) TestRebootNodeByNode() { suite.ctx, node, func(nodeCtx context.Context) error { return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx)) }, 10*time.Minute, + suite.CleanupFailedPods, ) } } diff --git a/internal/integration/api/update-hostname.go b/internal/integration/api/update-hostname.go index c262c4fa5f..a39e5e43b4 100644 --- a/internal/integration/api/update-hostname.go +++ b/internal/integration/api/update-hostname.go @@ -115,6 +115,7 @@ func (suite *UpdateHostnameSuite) TestUpdateHostname() { suite.ctx, nodeInternalIP, func(nodeCtx context.Context) error { return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx)) }, 10*time.Minute, + suite.CleanupFailedPods, ) }() diff --git a/internal/integration/api/volumes.go b/internal/integration/api/volumes.go index 48af39ef46..75f1bc71c8 100644 --- a/internal/integration/api/volumes.go +++ b/internal/integration/api/volumes.go @@ -277,6 +277,7 @@ func (suite *VolumesSuite) TestLVMActivation() { suite.ctx, node, func(nodeCtx context.Context) error { return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx)) }, 5*time.Minute, + suite.CleanupFailedPods, ) suite.T().Logf("verifying LVM activation %s/%s", node, nodeName) diff --git a/internal/integration/base/api.go b/internal/integration/base/api.go index 0ac6d95af7..0b9b88d576 100644 --- a/internal/integration/base/api.go +++ b/internal/integration/base/api.go @@ -279,11 +279,15 @@ func (apiSuite *APISuite) ReadBootIDWithRetry(ctx context.Context, timeout time. // AssertRebooted verifies that node got rebooted as result of running some API call. // // Verification happens via reading boot_id of the node. 
-func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration) { +func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration, extraHooks ...func(context.Context, string)) { apiSuite.AssertRebootedNoChecks(ctx, node, rebootFunc, timeout) apiSuite.WaitForBootDone(ctx) + for _, hook := range extraHooks { + hook(ctx, node) + } + if apiSuite.Cluster != nil { // without cluster state we can't do deep checks, but basic reboot test still works // NB: using `ctx` here to have client talking to init node by default diff --git a/internal/integration/base/k8s.go b/internal/integration/base/k8s.go index eeca138722..9fa70b9497 100644 --- a/internal/integration/base/k8s.go +++ b/internal/integration/base/k8s.go @@ -111,6 +111,35 @@ func (k8sSuite *K8sSuite) GetK8sNodeByInternalIP(ctx context.Context, internalIP return nil, fmt.Errorf("node with internal IP %s not found", internalIP) } +// CleanupFailedPods deletes all pods in kube-system namespace with the status Failed. 
+func (k8sSuite *K8sSuite) CleanupFailedPods(ctx context.Context, internalIP string) {
+	node, err := k8sSuite.GetK8sNodeByInternalIP(ctx, internalIP) // cleanup is scoped to this node
+	if err != nil {
+		k8sSuite.T().Logf("failed to get node by internal IP %s: %v", internalIP, err)
+
+		return
+	}
+
+	podClient := k8sSuite.Clientset.CoreV1().Pods("kube-system")
+	podList, err := podClient.List(ctx, metav1.ListOptions{FieldSelector: fields.OneTermEqualSelector("spec.nodeName", node.Name).String()})
+	if err != nil {
+		k8sSuite.T().Logf("failed to list pods in kube-system namespace: %v", err)
+
+		return
+	}
+
+	for _, candidate := range podList.Items {
+		if candidate.Status.Phase != corev1.PodFailed {
+			continue // pods in any other phase are left untouched
+		}
+		if delErr := podClient.Delete(ctx, candidate.Name, metav1.DeleteOptions{}); delErr != nil {
+			k8sSuite.T().Logf("failed to delete pod %s: %v", candidate.Name, delErr)
+			continue
+		}
+		k8sSuite.T().Logf("deleted pod %s", candidate.Name)
+	}
+}
+
 // WaitForK8sNodeReadinessStatus waits for node to have the given status.
 // It retries until the node with the name is found and matches the expected condition.
 func (k8sSuite *K8sSuite) WaitForK8sNodeReadinessStatus(ctx context.Context, nodeName string, checkFn func(corev1.ConditionStatus) bool) error {