diff --git a/Makefile b/Makefile
index 5371d5b48d96..d872ed190830 100644
--- a/Makefile
+++ b/Makefile
@@ -602,6 +602,7 @@ generate-e2e-templates-main: $(KUSTOMIZE)
	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-machine-pool.yaml
	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades.yaml
	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-upgrades-runtimesdk.yaml
+	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-pre-drain --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-pre-drain.yaml
	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-kcp-scale-in.yaml
	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-ipv6 --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-ipv6.yaml
	$(KUSTOMIZE) build $(DOCKER_TEMPLATES)/main/cluster-template-topology-dualstack-ipv6-primary --load-restrictor LoadRestrictionsNone > $(DOCKER_TEMPLATES)/main/cluster-template-topology-dualstack-ipv6-primary.yaml
diff --git a/test/e2e/cluster_upgrade.go b/test/e2e/cluster_upgrade.go
index fd4156e9be29..bd680f4897c6 100644
--- a/test/e2e/cluster_upgrade.go
+++ b/test/e2e/cluster_upgrade.go
@@ -66,6 +66,10 @@ type ClusterUpgradeConformanceSpecInput struct {
 	// Allows to inject a function to be run after test namespace is created.
 	// If not specified, this is a no-op.
 	PostNamespaceCreated func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace string)
+
+	// Allows to inject a function to be run before waiting for the control-plane machines to be upgraded.
+	// If not specified, this is a no-op.
+	PreWaitForControlPlaneToBeUpgraded func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string)
 }
 
 // ClusterUpgradeConformanceSpec implements a spec that upgrades a cluster and runs the Kubernetes conformance suite.
@@ -93,6 +97,8 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 		clusterResources *clusterctl.ApplyClusterTemplateAndWaitResult
 
 		kubetestConfigFilePath string
+
+		clusterName string
 	)
 
 	BeforeEach(func() {
@@ -142,6 +148,8 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 			infrastructureProvider = *input.InfrastructureProvider
 		}
 
+		clusterName = fmt.Sprintf("%s-%s", specName, util.RandomString(6))
+
 		clusterctl.ApplyClusterTemplateAndWait(ctx, clusterctl.ApplyClusterTemplateAndWaitInput{
 			ClusterProxy: input.BootstrapClusterProxy,
 			ConfigCluster: clusterctl.ConfigClusterInput{
@@ -151,7 +159,7 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 				InfrastructureProvider:   infrastructureProvider,
 				Flavor:                   ptr.Deref(input.Flavor, "upgrades"),
 				Namespace:                namespace.Name,
-				ClusterName:              fmt.Sprintf("%s-%s", specName, util.RandomString(6)),
+				ClusterName:              clusterName,
 				KubernetesVersion:        input.E2EConfig.GetVariable(KubernetesVersionUpgradeFrom),
 				ControlPlaneMachineCount: ptr.To[int64](controlPlaneMachineCount),
 				WorkerMachineCount:       ptr.To[int64](workerMachineCount),
@@ -180,6 +188,11 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 				WaitForKubeProxyUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
 				WaitForDNSUpgrade:       input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
 				WaitForEtcdUpgrade:      input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
+				PreWaitForControlPlaneToBeUpgraded: func() {
+					if input.PreWaitForControlPlaneToBeUpgraded != nil {
+						input.PreWaitForControlPlaneToBeUpgraded(input.BootstrapClusterProxy, namespace.Name, clusterName)
+					}
+				},
 			})
 		} else {
 			// Cluster is not using ClusterClass, upgrade via individual resources.
@@ -209,6 +222,11 @@ func ClusterUpgradeConformanceSpec(ctx context.Context, inputGetter func() Clust
 				WaitForKubeProxyUpgrade: input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
 				WaitForDNSUpgrade:       input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
 				WaitForEtcdUpgrade:      input.E2EConfig.GetIntervals(specName, "wait-machine-upgrade"),
+				PreWaitForControlPlaneToBeUpgraded: func() {
+					if input.PreWaitForControlPlaneToBeUpgraded != nil {
+						input.PreWaitForControlPlaneToBeUpgraded(input.BootstrapClusterProxy, namespace.Name, clusterName)
+					}
+				},
 			})
 
 			if workerMachineCount > 0 {
diff --git a/test/e2e/cluster_upgrade_test.go b/test/e2e/cluster_upgrade_test.go
index 1096bc12bf71..a9f8d8eb6d76 100644
--- a/test/e2e/cluster_upgrade_test.go
+++ b/test/e2e/cluster_upgrade_test.go
@@ -21,7 +21,20 @@ package e2e
 
 import (
 	. "github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega" + "github.com/pkg/errors" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/klog/v2" "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + "sigs.k8s.io/cluster-api/test/e2e/internal/log" + "sigs.k8s.io/cluster-api/test/framework" + "sigs.k8s.io/cluster-api/util/conditions" + "sigs.k8s.io/cluster-api/util/patch" ) var _ = Describe("When upgrading a workload cluster using ClusterClass and testing K8S conformance [Conformance] [K8s-Upgrade] [ClusterClass]", func() { @@ -58,6 +71,7 @@ var _ = Describe("When upgrading a workload cluster using ClusterClass [ClusterC }) var _ = Describe("When upgrading a workload cluster using ClusterClass with a HA control plane [ClusterClass]", func() { + controlPlaneMachineCount := int64(3) ClusterUpgradeConformanceSpec(ctx, func() ClusterUpgradeConformanceSpecInput { return ClusterUpgradeConformanceSpecInput{ E2EConfig: e2eConfig, @@ -69,9 +83,122 @@ var _ = Describe("When upgrading a workload cluster using ClusterClass with a HA // This test is run in CI in parallel with other tests. To keep the test duration reasonable // the conformance tests are skipped. SkipConformanceTests: true, - ControlPlaneMachineCount: ptr.To[int64](3), + ControlPlaneMachineCount: ptr.To[int64](controlPlaneMachineCount), WorkerMachineCount: ptr.To[int64](1), - Flavor: ptr.To("topology"), + Flavor: ptr.To("kcp-pre-drain"), + PreWaitForControlPlaneToBeUpgraded: func(managementClusterProxy framework.ClusterProxy, workloadClusterNamespace, workloadClusterName string) { + log.Logf("Waiting for control-plane machines to have the upgraded Kubernetes version") + + preDrainHook := "pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check" + + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: workloadClusterNamespace, + Name: workloadClusterName, + }, + } + Expect(managementClusterProxy.GetClient().Get(ctx, client.ObjectKeyFromObject(cluster), cluster)).To(Succeed()) + + // This replaces the WaitForControlPlaneMachinesToBeUpgraded function to verify via a pre-drain hook + // that all static Pods, kube-proxy and all Nodes are becoming healthy before we let the upgrade + // process precede by removing the pre-drain hook. + // This captures cases where static Pods, kube-proxy and all Nodes would only become healthy after + // all control plane Machines have been upgraded. 
+				Eventually(func() (int64, error) {
+					machines := framework.GetControlPlaneMachinesByCluster(ctx, framework.GetControlPlaneMachinesByClusterInput{
+						Lister:      managementClusterProxy.GetClient(),
+						ClusterName: cluster.Name,
+						Namespace:   cluster.Namespace,
+					})
+
+					// Collect information about:
+					// * how many control-plane machines are already upgraded and healthy
+					// * control-plane machines which are in deletion
+					// * workload cluster nodes
+					// * kube-proxy pods
+					var upgradedAndHealthy int64
+					deletingMachines := []clusterv1.Machine{}
+					for _, m := range machines {
+						if *m.Spec.Version == cluster.Spec.Topology.Version && conditions.IsTrue(&m, clusterv1.MachineNodeHealthyCondition) {
+							upgradedAndHealthy++
+						}
+						if !m.DeletionTimestamp.IsZero() {
+							deletingMachines = append(deletingMachines, m)
+						}
+					}
+
+					wlClient := managementClusterProxy.GetWorkloadCluster(ctx, workloadClusterNamespace, workloadClusterName).GetClient()
+					nodes := corev1.NodeList{}
+					if err := wlClient.List(ctx, &nodes); err != nil {
+						return 0, errors.Wrapf(err, "failed to list nodes in workload cluster")
+					}
+
+					kubeProxyPods := corev1.PodList{}
+					if err := wlClient.List(ctx, &kubeProxyPods, client.InNamespace(metav1.NamespaceSystem), client.MatchingLabels{"k8s-app": "kube-proxy"}); err != nil {
+						return 0, errors.Wrapf(err, "failed to list kube-proxy pods in workload cluster")
+					}
+
+					errList := []error{}
+
+					// Check that all Nodes are Ready.
+					for _, node := range nodes.Items {
+						for _, condition := range node.Status.Conditions {
+							if condition.Type == corev1.NodeReady && condition.Status != corev1.ConditionTrue {
+								errList = append(errList, errors.Errorf("expected the Ready condition for Node %s to be true but got %s instead: %s", node.GetName(), condition.Status, condition.Message))
+							}
+						}
+					}
+
+					// Check that the expected number of kube-proxy pods exist and that all of them are healthy for all existing Nodes of the Cluster.
+					if len(nodes.Items) != len(kubeProxyPods.Items) {
+						errList = append(errList, errors.Errorf("expected %d kube-proxy pods to exist, got %d", len(nodes.Items), len(kubeProxyPods.Items)))
+					}
+					for _, pod := range kubeProxyPods.Items {
+						for _, condition := range pod.Status.Conditions {
+							if condition.Type == corev1.PodReady && condition.Status != corev1.ConditionTrue {
+								errList = append(errList, errors.Errorf("expected the Ready condition for Pod %s to be true but got %s instead: %s", pod.GetName(), condition.Status, condition.Message))
+							}
+						}
+					}
+
+					if err := kerrors.NewAggregate(errList); err != nil {
+						return 0, errors.Wrap(err, "blocking upgrade because cluster is not stable")
+					}
+
+					// At this stage all current machines are considered ok, so remove the pre-drain hook from the
+					// deleting control-plane Machine to unblock the next step of the upgrade.
+					if len(deletingMachines) > 0 {
+						if len(deletingMachines) > 1 {
+							return 0, errors.Errorf("expected a maximum of 1 machine to be in deleting but got %d", len(deletingMachines))
+						}
+
+						m := &deletingMachines[0]
+
+						if m.Annotations[preDrainHook] != "true" {
+							return 0, errors.Errorf("machine %s is in deletion but does not have pre-drain hook %q", klog.KObj(m), preDrainHook)
+						}
+
+						// Remove the pre-drain hook from the Machine.
+						patchHelper, err := patch.NewHelper(m, managementClusterProxy.GetClient())
+						if err != nil {
+							return 0, err
+						}
+						delete(m.Annotations, preDrainHook)
+
+						if err := patchHelper.Patch(ctx, m); err != nil {
+							return 0, err
+						}
+
+						// Return an error so Eventually enters the function again.
+						return 0, errors.Errorf("deletion of Machine %s was blocked by pre-drain hook", klog.KObj(m))
+					}
+
+					if int64(len(machines)) > upgradedAndHealthy {
+						return 0, errors.New("old Machines remain")
+					}
+
+					return upgradedAndHealthy, nil
+				}, e2eConfig.GetIntervals("k8s-upgrade-and-conformance", "wait-machine-upgrade")...).Should(Equal(controlPlaneMachineCount), "Timed out waiting for all control-plane machines in Cluster %s to be upgraded to kubernetes version %s", klog.KObj(cluster), cluster.Spec.Topology.Version)
+			},
 		}
 	})
 })
diff --git a/test/e2e/config/docker.yaml b/test/e2e/config/docker.yaml
index fce2cb55dd18..b214f2c8cd4c 100644
--- a/test/e2e/config/docker.yaml
+++ b/test/e2e/config/docker.yaml
@@ -347,6 +347,7 @@ providers:
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-remediation.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-adoption.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-machine-pool.yaml"
+      - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-pre-drain.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-upgrades-runtimesdk.yaml"
       - sourcePath: "../data/infrastructure-docker/main/cluster-template-kcp-scale-in.yaml"
diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-pre-drain/cluster.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-pre-drain/cluster.yaml
new file mode 100644
index 000000000000..805ca5e732f5
--- /dev/null
+++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-pre-drain/cluster.yaml
@@ -0,0 +1,12 @@
+---
+apiVersion: cluster.x-k8s.io/v1beta1
+kind: Cluster
+metadata:
+  name: '${CLUSTER_NAME}'
+spec:
+  topology:
+    class: quick-start
+    controlPlane:
+      metadata:
+        annotations:
+          pre-drain.delete.hook.machine.cluster.x-k8s.io/kcp-ready-check: "true"
\ No newline at end of file
diff --git a/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-pre-drain/kustomization.yaml b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-pre-drain/kustomization.yaml
new file mode 100644
index 000000000000..f1e62efbcd26
--- /dev/null
+++ b/test/e2e/data/infrastructure-docker/main/cluster-template-kcp-pre-drain/kustomization.yaml
@@ -0,0 +1,4 @@
+resources:
+  - ../cluster-template-upgrades
+patches:
+- path: cluster.yaml
diff --git a/test/framework/controlplane_helpers.go b/test/framework/controlplane_helpers.go
index 301a43237f5e..e9e14aaba952 100644
--- a/test/framework/controlplane_helpers.go
+++ b/test/framework/controlplane_helpers.go
@@ -306,17 +306,18 @@ func WaitForControlPlaneAndMachinesReady(ctx context.Context, input WaitForContr
 
 // UpgradeControlPlaneAndWaitForUpgradeInput is the input type for UpgradeControlPlaneAndWaitForUpgrade.
 type UpgradeControlPlaneAndWaitForUpgradeInput struct {
-	ClusterProxy                ClusterProxy
-	Cluster                     *clusterv1.Cluster
-	ControlPlane                *controlplanev1.KubeadmControlPlane
-	KubernetesUpgradeVersion    string
-	UpgradeMachineTemplate      *string
-	EtcdImageTag                string
-	DNSImageTag                 string
-	WaitForMachinesToBeUpgraded []interface{}
-	WaitForDNSUpgrade           []interface{}
-	WaitForKubeProxyUpgrade     []interface{}
-	WaitForEtcdUpgrade          []interface{}
+	ClusterProxy                       ClusterProxy
+	Cluster                            *clusterv1.Cluster
+	ControlPlane                       *controlplanev1.KubeadmControlPlane
+	KubernetesUpgradeVersion           string
+	UpgradeMachineTemplate             *string
+	EtcdImageTag                       string
+	DNSImageTag                        string
+	WaitForMachinesToBeUpgraded        []interface{}
+	WaitForDNSUpgrade                  []interface{}
+	WaitForKubeProxyUpgrade            []interface{}
+	WaitForEtcdUpgrade                 []interface{}
+	PreWaitForControlPlaneToBeUpgraded func()
 }
 
 // UpgradeControlPlaneAndWaitForUpgrade upgrades a KubeadmControlPlane and waits for it to be upgraded.
@@ -357,6 +358,12 @@ func UpgradeControlPlaneAndWaitForUpgrade(ctx context.Context, input UpgradeCont
 		return patchHelper.Patch(ctx, input.ControlPlane)
 	}, retryableOperationTimeout, retryableOperationInterval).Should(Succeed(), "Failed to patch the new kubernetes version to KCP %s", klog.KObj(input.ControlPlane))
 
+	// Once we have patched the new Kubernetes version onto the KCP we can run PreWaitForControlPlaneToBeUpgraded.
+	if input.PreWaitForControlPlaneToBeUpgraded != nil {
+		log.Logf("Calling PreWaitForControlPlaneToBeUpgraded")
+		input.PreWaitForControlPlaneToBeUpgraded()
+	}
+
 	log.Logf("Waiting for control-plane machines to have the upgraded kubernetes version")
 	WaitForControlPlaneMachinesToBeUpgraded(ctx, WaitForControlPlaneMachinesToBeUpgradedInput{
 		Lister: mgmtClient,
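Usage note (not part of the patch): the sketch below shows how a caller of the framework helper could wire the new PreWaitForControlPlaneToBeUpgraded field. It is a minimal sketch; the function name upgradeWithPreDrainGate and all parameters are placeholders assumed to be supplied by the calling spec, only the UpgradeControlPlaneAndWaitForUpgradeInput fields come from this patch.

```go
package e2e

import (
	"context"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	controlplanev1 "sigs.k8s.io/cluster-api/controlplane/kubeadm/api/v1beta1"
	"sigs.k8s.io/cluster-api/test/framework"
)

// upgradeWithPreDrainGate is an illustrative, hypothetical helper showing where the new
// PreWaitForControlPlaneToBeUpgraded callback runs: after the new version is patched onto
// the KCP object and before the framework waits for control-plane Machines to be upgraded.
func upgradeWithPreDrainGate(ctx context.Context, proxy framework.ClusterProxy, cluster *clusterv1.Cluster,
	kcp *controlplanev1.KubeadmControlPlane, version string, intervals []interface{}) {
	framework.UpgradeControlPlaneAndWaitForUpgrade(ctx, framework.UpgradeControlPlaneAndWaitForUpgradeInput{
		ClusterProxy:                proxy,
		Cluster:                     cluster,
		ControlPlane:                kcp,
		KubernetesUpgradeVersion:    version,
		WaitForMachinesToBeUpgraded: intervals,
		WaitForKubeProxyUpgrade:     intervals,
		WaitForDNSUpgrade:           intervals,
		WaitForEtcdUpgrade:          intervals,
		PreWaitForControlPlaneToBeUpgraded: func() {
			// Block here until the cluster is considered stable, e.g. handle the
			// pre-drain.delete.hook annotation as the HA upgrade test above does.
		},
	})
}
```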