🌱 test: add PreWaitForControlplaneToBeUpgraded to ClusterUpgradeConformanceSpec #11145

Merged
merged 8 commits on Sep 19, 2024

Changes from 7 commits
1 change: 1 addition & 0 deletions Makefile
@@ -602,6 +602,7 @@ generate-e2e-templates-main: $(KUSTOMIZE)
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-pre-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-pre-drain.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-ipv6 --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-ipv6.yaml
$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-topology-dualstack-ipv6-primary --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-topology-dualstack-ipv6-primary.yaml
20 changes: 19 additions & 1 deletion test/e2e/cluster_upgrade.go
@@ -66,6 +66,10 @@ type ClusterUpgradeConformanceSpecInput struct {
// Allows to inject a function to be run after test namespace is created.
// If not specified, this is a no-op.
PostNamespaceCreated func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace string)

// Allows to inject a function to be run before waiting for the control-plane machines to be upgraded.
// If not specified, this is a no-op.
PreWaitForControlPlaneToBeUpgraded func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string)
}

// ClusterUpgradeConformanceSpec implements a spec that upgrades a cluster and runs the Kubernetes conformance suite.
@@ -93,6 +97,8 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust

clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult
kubetestConfigFilePath string

clusterName string
)

BeforeEach(func() {
@@ -142,6 +148,8 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
infrastructureProvider = *input.InfrastructureProvider
}

clusterName = fmt.Sprintf("%s-%s", specName, util.RandomString(6))

clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{
ClusterProxy: input.BootstrapClusterProxy,
ConfigCluster: clusterctl.ConfigClusterInput{
@@ -151,7 +159,7 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
InfrastructureProvider: infrastructureProvider,
Flavor: ptr.Deref(input.Flavor, "upgrades"),
Namespace: namespace.Name,
ClusterName: fmt.Sprintf("%s-%s", specName, util.RandomString(6)),
ClusterName: clusterName,
KubernetesVersion: input.E2EConfig.GetVariable(KubernetesVersionUpgradeFrom),
ControlPlaneMachineCount: ptr.To[int64](controlPlaneMachineCount),
WorkerMachineCount: ptr.To[int64](workerMachineCount),
@@ -180,6 +188,11 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
WaitForKubeProxyUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
WaitForDNSUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
WaitForEtcdUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
PreWaitForControlPlaneToBeUpgraded: func() {
if input.PreWaitForControlPlaneToBeUpgraded != nil {
input.PreWaitForControlPlaneToBeUpgraded(input.BootstrapClusterProxy, namespace.Name, clusterName)
}
},
})
} else {
// Cluster is not using ClusterClass, upgrade via individual resources.
@@ -209,6 +222,11 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
WaitForKubeProxyUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
WaitForDNSUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
WaitForEtcdUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
PreWaitForControlPlaneToBeUpgraded: func() {
if input.PreWaitForControlPlaneToBeUpgraded != nil {
input.PreWaitForControlPlaneToBeUpgraded(input.BootstrapClusterProxy, namespace.Name, clusterName)
}
},
})

if workerMachineCount > 0 {
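For orientation, here is a minimal sketch of a hook that satisfies the new PreWaitForControlPlaneToBeUpgraded field. It is not part of this PR: the function name and the logging-only body are illustrative, and it only reuses helpers already shown in the diff (framework.GetControlPlaneMachinesByCluster and log.Logf). The real hook used by the e2e test below blocks until additional health checks pass.

package e2e

import (
	"context"

	"sigs.k8s.io/cluster-api/test/e2e/internal/log"
	"sigs.k8s.io/cluster-api/test/framework"
)

// examplePreWaitHook returns an illustrative hook matching the new field's signature.
// It only logs the versions of the control-plane Machines of the given workload cluster.
func examplePreWaitHook(ctx context.Context) func(framework.ClusterProxy, string, string) {
	return func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string) {
		machines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
			Lister:      managementClusterProxy.GetClient(),
			ClusterName: workloadClusterName,
			Namespace:   workloadClusterNamespace,
		})
		for _, m := range machines {
			log.Logf("control-plane Machine %s is at version %s", m.Name, *m.Spec.Version)
		}
	}
}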
131 changes: 129 additions & 2 deletions test/e2e/cluster_upgrade_test.go
@@ -21,7 +21,20 @@ package e2e

import (
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/test/e2e/internal/log"
"sigs.k8s.io/cluster-api/test/framework"
"sigs.k8s.io/cluster-api/util/conditions"
"sigs.k8s.io/cluster-api/util/patch"
)

var _ = Describe("When upgrading a workload cluster using ClusterClass and testing K8S conformance [Conformance] [K8s-Upgrade] [ClusterClass]", func() {
@@ -58,6 +71,7 @@ var _ = Describe("When upgrading a workload cluster using ClusterClass [ClusterC
})

var _ = Describe("When upgrading a workload cluster using ClusterClass with a HA control plane [ClusterClass]", func() {
controlPlaneMachineCount := int64(3)
ClusterUpgradeConformanceSpec(ctx, func() ClusterUpgradeConformanceSpecInput {
return ClusterUpgradeConformanceSpecInput{
E2EConfig: e2eConfig,
@@ -69,9 +83,122 @@ var _ = Describe("When upgrading a workload cluster using ClusterClass with a HA
// This test is run in CI in parallel with other tests. To keep the test duration reasonable
// the conformance tests are skipped.
SkipConformanceTests: true,
ControlPlaneMachineCount: ptr.To[int64](3),
ControlPlaneMachineCount: ptr.To[int64](controlPlaneMachineCount),
WorkerMachineCount: ptr.To[int64](1),
Flavor: ptr.To("topology"),
Flavor: ptr.To("kcp-pre-drain"),
PreWaitForControlPlaneToBeUpgraded: func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string) {
log.Logf("Waiting for control-plane machines to have the upgraded Kubernetes version")

preDrainHook := "pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check"

cluster := &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Namespace: workloadClusterNamespace,
Name: workloadClusterName,
},
}
Expect(managementClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(cluster), cluster)).To(Succeed())

// This replaces the WaitForControlPlaneMachinesToBeUpgraded function to verify via a pre-drain hook
// that all static Pods, kube-proxy and all Nodes become healthy before we let the upgrade
// process proceed by removing the pre-drain hook.
// This captures cases where static Pods, kube-proxy and all Nodes would only become healthy after
// all control plane Machines have been upgraded.
Eventually(func() (int64, error) {
machines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
Lister: managementClusterProxy.GetClient(),
ClusterName: cluster.Name,
Namespace: cluster.Namespace,
})

// Collect information about:
// * how many control-plane machines are already upgraded and healthy
// * control-plane machines which are in deletion but waiting for the pre-drain hook
// * workload cluster nodes
// * kube-proxy pods
var upgradedAndHealthy int64
deletingMachines := []clusterv1.Machine{}
for _, m := range machines {
if *m.Spec.Version == cluster.Spec.Topology.Version && conditions.IsTrue(&m, clusterv1.MachineNodeHealthyCondition) {
[Review thread on the line above (resolved)]

Member: q: why are we checking clusterv1.MachineNodeHealthyCondition? As far as I remember it only checks for a dummy condition to not exist, so if I'm not wrong it doesn't really give added value 🤔

sbueringer (Member), Sep 9, 2024: Wait what. For a dummy condition to not exist?

    // MachineNodeHealthyCondition provides info about the operational state of the Kubernetes node hosted on the machine by summarizing node conditions.
    // If the conditions defined in a Kubernetes node (i.e., NodeReady, NodeMemoryPressure, NodeDiskPressure, NodePIDPressure, and NodeNetworkUnavailable) are in a healthy state, it will be set to True.
    MachineNodeHealthyCondition ConditionType = "NodeHealthy"

Member (Author): Yep, I was expecting this to check exactly that, as mentioned in the comment.

Member: This is how we are configuring MHC in E2E tests:

    machineHealthCheck:
      maxUnhealthy: 100%
      unhealthyConditions:
        - type: e2e.remediation.condition
          status: "False"
          timeout: 20s

So (in E2E tests only) MHC is testing for the dummy e2e.remediation.condition, not for NodeReady, NodeMemoryPressure etc.

Member: But this is the MachineNodeHealthyCondition, not the MachineHealthCheckSucceeded condition.

Member: It is set to true here:

    conditions.MarkTrue(machine, clusterv1.MachineNodeHealthyCondition)

This should have nothing to do with MHCs.

[End of review thread]
upgradedAndHealthy++
}
if !m.DeletionTimestamp.IsZero() {
deletingMachines = append(deletingMachines, m)
}
}

wlClient := managementClusterProxy.GetWorkloadCluster(ctx, workloadClusterNamespace, workloadClusterName).GetClient()
nodes := corev1.NodeList{}
if err := wlClient.List(ctx, &nodes); err != nil {
return 0, errors.Wrapf(err, "failed to list nodes in workload cluster")
}

kubeProxyPods := corev1.PodList{}
if err := wlClient.List(ctx, &kubeProxyPods, client.InNamespace(metav1.NamespaceSystem), client.MatchingLabels{"k8s-app": "kube-proxy"}); err != nil {
return 0, errors.Wrapf(err, "failed to list kube-proxy pods in workload cluster")
}

errList := []error{}

// Check all nodes to be Ready.
for _, node := range nodes.Items {
for _, condition := range node.Status.Conditions {
if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue {
errList = append(errList, errors.Errorf("expected the Ready condition for Node %s to be true but got %s instead: %s", node.GetName(), condition.Status, condition.Message))
}
}
}

// Check if the expected number of kube-proxy pods exist and all of them are healthy for all existing Nodes of the Cluster.
if len(nodes.Items) != len(kubeProxyPods.Items) {
errList = append(errList, errors.Errorf("expected %d kube-proxy pods to exist, got %d", len(nodes.Items), len(kubeProxyPods.Items)))
}
for _, pod := range kubeProxyPods.Items {
for _, condition := range pod.Status.Conditions {
if condition.Type == corev1.PodReady && condition.Status != corev1.ConditionTrue {
errList = append(errList, errors.Errorf("expected the Ready condition for Pod %s to be true but got %s instead: %s", pod.GetName(), condition.Status, condition.Message))
}
}
}

if err := kerrors.NewAggregate(errList); err != nil {
return 0, errors.Wrap(err, "blocking upgrade because cluster is not stable")
}

// At this stage all current Machines are considered healthy, so remove the pre-drain hook from the deleting control-plane Machine to unblock the next step of the upgrade.
if len(deletingMachines) > 0 {
if len(deletingMachines) > 1 {
return 0, errors.Errorf("expected a maximum of 1 machine to be in deleting but got %d", len(deletingMachines))
}

m := &deletingMachines[0]

if m.Annotations[preDrainHook] != "true" {
return 0, errors.Errorf("machine %s is in deletion but does not have pre-drain hook %q", klog.KObj(m), preDrainHook)
}

// Removing pre-drain hook from machine.
patchHelper, err := patch.NewHelper(m, managementClusterProxy.GetClient())
if err != nil {
return 0, err
}
delete(m.Annotations, preDrainHook)

if err := patchHelper.Patch(ctx, m); err != nil {
return 0, err
}

// Return an error so the Eventually loop re-enters this function and re-evaluates the state.
return 0, errors.Errorf("deletion of Machine %s was blocked by pre-drain hook", klog.KObj(m))
}

if int64(len(machines)) > upgradedAndHealthy {
return 0, errors.New("old Machines remain")
}

return upgradedAndHealthy, nil
}, e2eConfig.GetIntervals("k8s-upgrade-and-conformance", "wait-machine-upgrade")...).Should(Equal(controlPlaneMachineCount), "Timed out waiting for all control-plane machines in Cluster %s to be upgraded to kubernetes version %s", klog.KObj(cluster), cluster.Spec.Topology.Version)
},
}
})
})
1 change: 1 addition & 0 deletions test/e2e/config/docker.yaml
@@ -347,6 +347,7 @@ providers:
- sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-remediation.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-adoption.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-machine-pool.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-pre-drain.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades-runtimesdk.yaml"
- sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-scale-in.yaml"
@@ -0,0 +1,12 @@
---
apiVersion: cluster.x-k8s.io/v1beta1
kind: Cluster
metadata:
name: '${CLUSTER_NAME}'
spec:
topology:
class: quick-start
controlPlane:
metadata:
annotations:
pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check: "true"
@@ -0,0 +1,4 @@
resources:
- ../cluster-template-upgrades
patches:
- path: cluster.yaml
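Background for reviewers, not part of the diff: annotations under the pre-drain.delete.hook.machine.cluster.x-k8s.io/ prefix are Machine deletion hooks; while one is present, the Machine controller pauses the deletion flow before draining the Node, and it continues once the annotation is removed. The kustomization above layers exactly this annotation onto the existing cluster-template-upgrades flavor (the two new files presumably live in the cluster-template-kcp-pre-drain directory referenced by the Makefile and docker.yaml changes). Below is a minimal sketch of releasing the hook on a single Machine with a plain controller-runtime merge patch; the PR's test uses the framework patch helper instead.

package e2e

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// preDrainHook is the annotation set by the template above; removing it lets the
// Machine controller continue with node drain and deletion.
const preDrainHook = "pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check"

// releasePreDrainHook removes the hook from a single Machine using a client scoped
// to the management cluster. Sketch only, not part of this PR.
func releasePreDrainHook(ctx context.Context, c client.Client, namespace, name string) error {
	m := &clusterv1.Machine{}
	if err := c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, m); err != nil {
		return err
	}
	orig := m.DeepCopy()
	delete(m.Annotations, preDrainHook)
	return c.Patch(ctx, m, client.MergeFrom(orig))
}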
29 changes: 18 additions & 11 deletions test/framework/controlplane_helpers.go
@@ -306,17 +306,18 @@ func WaitForControlPlaneAndMachinesReady(ctx context.Context, input WaitForContr

// UpgradeControlPlaneAndWaitForUpgradeInput is the input type for UpgradeControlPlaneAndWaitForUpgrade.
type UpgradeControlPlaneAndWaitForUpgradeInput struct {
ClusterProxy ClusterProxy
Cluster *clusterv1.Cluster
ControlPlane *controlplanev1.KubeadmControlPlane
KubernetesUpgradeVersion string
UpgradeMachineTemplate *string
EtcdImageTag string
DNSImageTag string
WaitForMachinesToBeUpgraded []interface{}
WaitForDNSUpgrade []interface{}
WaitForKubeProxyUpgrade []interface{}
WaitForEtcdUpgrade []interface{}
ClusterProxy ClusterProxy
Cluster *clusterv1.Cluster
ControlPlane *controlplanev1.KubeadmControlPlane
KubernetesUpgradeVersion string
UpgradeMachineTemplate *string
EtcdImageTag string
DNSImageTag string
WaitForMachinesToBeUpgraded []interface{}
WaitForDNSUpgrade []interface{}
WaitForKubeProxyUpgrade []interface{}
WaitForEtcdUpgrade []interface{}
PreWaitForControlPlaneToBeUpgraded func()
}

// UpgradeControlPlaneAndWaitForUpgrade upgrades a KubeadmControlPlane and waits for it to be upgraded.
@@ -357,6 +358,12 @@ func UpgradeControlPlaneAndWaitForUpgrade(ctx context.Context, input UpgradeCont
return patchHelper.Patch(ctx, input.ControlPlane)
}, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to patch the new kubernetes version to KCP %s", klog.KObj(input.ControlPlane))

// Once the new Kubernetes version has been patched onto the KubeadmControlPlane we can run PreWaitForControlPlaneToBeUpgraded.
if input.PreWaitForControlPlaneToBeUpgraded != nil {
log.Logf("Calling PreWaitForControlPlaneToBeUpgraded")
input.PreWaitForControlPlaneToBeUpgraded()
}

log.Logf("Waiting for control-plane machines to have the upgraded kubernetes version")
WaitForControlPlaneMachinesToBeUpgraded(ctx, WaitForControlPlaneMachinesToBeUpgradedInput{
Lister: mgmtClient,
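To make the call-site contract explicit, here is a minimal usage sketch of the extended input, assuming the surrounding objects (proxy, cluster, kcp, intervals) already exist; the version string and the empty hook body are placeholders. The callback fires after the target version is patched onto the KubeadmControlPlane and before the wait loops for machines, kube-proxy, DNS and etcd.

package e2e

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
	"sigs.k8s.io/cluster-api/test/framework"
)

// upgradeWithPreWait sketches where the new callback runs during a KCP upgrade.
func upgradeWithPreWait(ctx context.Context, proxy framework.ClusterProxy, cluster *clusterv1.Cluster, kcp *controlplanev1.KubeadmControlPlane, intervals []interface{}) {
	framework.UpgradeControlPlaneAndWaitForUpgrade(ctx, framework.UpgradeControlPlaneAndWaitForUpgradeInput{
		ClusterProxy:                proxy,
		Cluster:                     cluster,
		ControlPlane:                kcp,
		KubernetesUpgradeVersion:    "v1.31.0", // placeholder target version
		WaitForMachinesToBeUpgraded: intervals,
		WaitForKubeProxyUpgrade:     intervals,
		WaitForDNSUpgrade:           intervals,
		WaitForEtcdUpgrade:          intervals,
		PreWaitForControlPlaneToBeUpgraded: func() {
			// Block here until out-of-band readiness checks pass, e.g. the
			// pre-drain based checks added in cluster_upgrade_test.go.
		},
	})
}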