Skip to content

Commit

Permalink
test: cleanup failed Kubernetes pods
Browse files Browse the repository at this point in the history
See #9870

Signed-off-by: Andrey Smirnov <andrey.smirnov@siderolabs.com>
  • Loading branch information
smira committed Dec 16, 2024
1 parent c9c6851 commit 9470e84
Show file tree
Hide file tree
Showing 9 changed files with 48 additions and 3 deletions.
2 changes: 2 additions & 0 deletions internal/integration/api/apply-config.go
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ func (suite *ApplyConfigSuite) TestApply() {

return nil
}, assertRebootedRebootTimeout,
suite.CleanupFailedPods,
)

// Verify configuration change
Expand Down Expand Up @@ -319,6 +320,7 @@ func (suite *ApplyConfigSuite) TestApplyConfigRotateEncryptionSecrets() {

return nil
}, assertRebootedRebootTimeout,
suite.CleanupFailedPods,
)

suite.ClearConnectionRefused(suite.ctx, node)
Expand Down
2 changes: 2 additions & 0 deletions internal/integration/api/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ func (suite *CommonSuite) TestBaseOCISpec() {

return nil
}, assertRebootedRebootTimeout,
suite.CleanupFailedPods,
)

suite.ClearConnectionRefused(suite.ctx, node)
Expand Down Expand Up @@ -239,6 +240,7 @@ func (suite *CommonSuite) TestBaseOCISpec() {

return nil
}, assertRebootedRebootTimeout,
suite.CleanupFailedPods,
)

suite.ClearConnectionRefused(suite.ctx, node)
Expand Down
4 changes: 3 additions & 1 deletion internal/integration/api/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import (

// EtcdSuite ...
type EtcdSuite struct {
base.APISuite
base.K8sSuite

ctx context.Context //nolint:containedctx
ctxCancel context.CancelFunc
Expand Down Expand Up @@ -151,6 +151,7 @@ func (suite *EtcdSuite) TestLeaveCluster() {

return err
}, 10*time.Minute,
suite.CleanupFailedPods,
)
}

Expand Down Expand Up @@ -259,6 +260,7 @@ func (suite *EtcdSuite) TestRemoveMember() {

return err
}, 10*time.Minute,
suite.CleanupFailedPods,
)
}

Expand Down
3 changes: 3 additions & 0 deletions internal/integration/api/extensions_qemu.go
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,7 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsQEMUGuestAgent() {

return err
}, 5*time.Minute,
suite.CleanupFailedPods,
)
}

Expand Down Expand Up @@ -494,6 +495,7 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsMdADM() {
suite.ctx, node, func(nodeCtx context.Context) error {
return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx))
}, 5*time.Minute,
suite.CleanupFailedPods,
)

suite.Require().True(suite.mdADMArrayExists(), "expected mdadm array to be present")
Expand Down Expand Up @@ -581,6 +583,7 @@ func (suite *ExtensionsSuiteQEMU) TestExtensionsZFS() {
suite.ctx, node, func(nodeCtx context.Context) error {
return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx))
}, 5*time.Minute,
suite.CleanupFailedPods,
)

suite.Require().True(suite.checkZFSPoolMounted(), "expected zfs pool to be mounted")
Expand Down
3 changes: 2 additions & 1 deletion internal/integration/api/reboot.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (

// RebootSuite ...
type RebootSuite struct {
base.APISuite
base.K8sSuite

ctx context.Context //nolint:containedctx
ctxCancel context.CancelFunc
Expand Down Expand Up @@ -66,6 +66,7 @@ func (suite *RebootSuite) TestRebootNodeByNode() {
suite.ctx, node, func(nodeCtx context.Context) error {
return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx))
}, 10*time.Minute,
suite.CleanupFailedPods,
)
}
}
Expand Down
1 change: 1 addition & 0 deletions internal/integration/api/update-hostname.go
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ func (suite *UpdateHostnameSuite) TestUpdateHostname() {
suite.ctx, nodeInternalIP, func(nodeCtx context.Context) error {
return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx))
}, 10*time.Minute,
suite.CleanupFailedPods,
)
}()

Expand Down
1 change: 1 addition & 0 deletions internal/integration/api/volumes.go
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ func (suite *VolumesSuite) TestLVMActivation() {
suite.ctx, node, func(nodeCtx context.Context) error {
return base.IgnoreGRPCUnavailable(suite.Client.Reboot(nodeCtx))
}, 5*time.Minute,
suite.CleanupFailedPods,
)

suite.T().Logf("verifying LVM activation %s/%s", node, nodeName)
Expand Down
6 changes: 5 additions & 1 deletion internal/integration/base/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -279,11 +279,15 @@ func (apiSuite *APISuite) ReadBootIDWithRetry(ctx context.Context, timeout time.
// AssertRebooted verifies that node got rebooted as result of running some API call.
//
// Verification happens via reading boot_id of the node.
func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration) {
func (apiSuite *APISuite) AssertRebooted(ctx context.Context, node string, rebootFunc func(nodeCtx context.Context) error, timeout time.Duration, extraHooks ...func(context.Context, string)) {
apiSuite.AssertRebootedNoChecks(ctx, node, rebootFunc, timeout)

apiSuite.WaitForBootDone(ctx)

for _, hook := range extraHooks {
hook(ctx, node)
}

if apiSuite.Cluster != nil {
// without cluster state we can't do deep checks, but basic reboot test still works
// NB: using `ctx` here to have client talking to init node by default
Expand Down
29 changes: 29 additions & 0 deletions internal/integration/base/k8s.go
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,35 @@ func (k8sSuite *K8sSuite) GetK8sNodeByInternalIP(ctx context.Context, internalIP
return nil, fmt.Errorf("node with internal IP %s not found", internalIP)
}

// CleanupFailedPods removes pods in the Failed phase from the kube-system namespace,
// limited to pods whose spec.nodeName matches the node with the given internal IP.
//
// The cleanup is best-effort: node lookup, list and delete errors are only written
// to the test log, and the method returns without failing the test.
func (k8sSuite *K8sSuite) CleanupFailedPods(ctx context.Context, internalIP string) {
	node, err := k8sSuite.GetK8sNodeByInternalIP(ctx, internalIP)
	if err != nil {
		k8sSuite.T().Logf("failed to get node by internal IP %s: %v", internalIP, err)

		return
	}

	kubeSystemPods := k8sSuite.Clientset.CoreV1().Pods("kube-system")

	// Restrict the listing to the pods assigned to the resolved node.
	nodeSelector := fields.OneTermEqualSelector("spec.nodeName", node.Name).String()

	podList, err := kubeSystemPods.List(ctx, metav1.ListOptions{FieldSelector: nodeSelector})
	if err != nil {
		k8sSuite.T().Logf("failed to list pods in kube-system namespace: %v", err)

		return
	}

	for i := range podList.Items {
		failedPod := &podList.Items[i]

		if failedPod.Status.Phase != corev1.PodFailed {
			continue
		}

		// A delete error is logged and the loop moves on to the next pod.
		if deleteErr := kubeSystemPods.Delete(ctx, failedPod.Name, metav1.DeleteOptions{}); deleteErr != nil {
			k8sSuite.T().Logf("failed to delete pod %s: %v", failedPod.Name, deleteErr)

			continue
		}

		k8sSuite.T().Logf("deleted pod %s", failedPod.Name)
	}
}

// WaitForK8sNodeReadinessStatus waits for node to have the given status.
// It retries until the node with the name is found and matches the expected condition.
func (k8sSuite *K8sSuite) WaitForK8sNodeReadinessStatus(ctx context.Context, nodeName string, checkFn func(corev1.ConditionStatus) bool) error {
Expand Down

0 comments on commit 9470e84

Please sign in to comment.