From 5b6a0f76c9da9d73f7dca6364cbaf39978f0fac1 Mon Sep 17 00:00:00 2001
From: Yuvaraj Kakaraparthi
Date: Tue, 2 May 2023 17:33:58 -0700
Subject: [PATCH] add MS preflight checks

---
 api/v1beta1/common_types.go                   |  44 ++
 api/v1beta1/machinedeployment_webhook.go      |   9 +
 api/v1beta1/machineset_webhook.go             |  43 ++
 api/v1beta1/machineset_webhook_test.go        |  70 +++
 config/manager/manager.yaml                   |   2 +-
 docs/book/src/SUMMARY.md                      |   1 +
 docs/book/src/developer/testing.md            |   1 +
 docs/book/src/developer/tilt.md               |   1 +
 .../src/reference/labels_and_annotations.md   |  71 +--
 .../experimental-features.md                  |   2 +
 .../machineset-preflight-checks.md            |  12 +
 feature/feature.go                            |   6 +
 internal/contract/controlplane.go             |  12 +-
 internal/contract/infrastructure_cluster.go   |   2 +-
 internal/contract/infrastructure_machine.go   |   2 +-
 internal/contract/types.go                    |  11 +-
 .../machineset/machineset_controller.go       |  77 ++-
 .../machineset/machineset_preflight.go        | 220 +++
 .../machineset/machineset_preflight_test.go   | 548 ++++++++++++++++++
 test/e2e/config/docker.yaml                   |   1 +
 20 files changed, 1061 insertions(+), 74 deletions(-)
 create mode 100644 docs/book/src/tasks/experimental-features/machineset-preflight-checks.md
 create mode 100644 internal/controllers/machineset/machineset_preflight.go
 create mode 100644 internal/controllers/machineset/machineset_preflight_test.go

diff --git a/api/v1beta1/common_types.go b/api/v1beta1/common_types.go
index 56c6807f825f..6edc7e5bed48 100644
--- a/api/v1beta1/common_types.go
+++ b/api/v1beta1/common_types.go
@@ -117,6 +117,18 @@ const (
 	// MachineSkipRemediationAnnotation is the annotation used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler.
 	MachineSkipRemediationAnnotation = "cluster.x-k8s.io/skip-remediation"
 
+	// MachineSetSkipPreflightChecksAnnotation is the annotation used to provide a comma-separated list of
+	// preflight checks that should be skipped during the MachineSet reconciliation.
+	// Supported items are:
+	// - KubeadmVersionSkew (skips the kubeadm version skew preflight check)
+	// - KubernetesVersionSkew (skips the kubernetes version skew preflight check)
+	// - ControlPlaneIsStable (skips checking that the control plane is neither provisioning nor upgrading)
+	// - All (skips all preflight checks)
+	// Example: "machineset.cluster.x-k8s.io/skip-preflight-checks": "ControlPlaneIsStable,KubernetesVersionSkew".
+	// Note: The annotation can also be set on a MachineDeployment as MachineDeployment annotations are synced to
+	// the MachineSet.
+	MachineSetSkipPreflightChecksAnnotation = "machineset.cluster.x-k8s.io/skip-preflight-checks"
+
 	// ClusterSecretType defines the type of secret created by core components.
 	// Note: This is used by core CAPI, CAPBK, and KCP to determine whether a secret is created by the controllers
 	// themselves or supplied by the user (e.g. bring your own certificates).
@@ -173,6 +185,38 @@ const (
 	VariableDefinitionFromInline = "inline"
 )
 
+// MachineSetPreflightCheck defines a valid MachineSet preflight check.
+type MachineSetPreflightCheck string
+
+const (
+	// MachineSetPreflightCheckAll can be used to represent all the MachineSet preflight checks.
+	MachineSetPreflightCheckAll MachineSetPreflightCheck = "All"
+
+	// MachineSetPreflightCheckKubeadmVersionSkew is the name of the preflight check
+	// that verifies if the machine being created or remediated for the MachineSet conforms to the kubeadm version
+	// skew policy that requires the machine to be at the same version as the control plane.
+	// Note: This is a stopgap while the root cause of the problem is fixed in kubeadm; this check will become
+	// a no-op once the check is available in kubeadm, and will eventually be dropped when all the
+	// supported Kubernetes/kubeadm versions have implemented the fix.
+	// The preflight check is only run if a ControlPlane is used (controlPlaneRef must exist in the Cluster),
+	// the ControlPlane has a version, the MachineSet has a version and the MachineSet uses the Kubeadm bootstrap
+	// provider.
+	MachineSetPreflightCheckKubeadmVersionSkew MachineSetPreflightCheck = "KubeadmVersionSkew"
+
+	// MachineSetPreflightCheckKubernetesVersionSkew is the name of the preflight check that verifies
+	// if the machines being created or remediated for the MachineSet conform to the Kubernetes version skew policy
+	// that requires the machines to be at a version that is not more than 2 minor versions lower than the ControlPlane version.
+	// The preflight check is only run if a ControlPlane is used (controlPlaneRef must exist in the Cluster),
+	// the ControlPlane has a version and the MachineSet has a version.
+	MachineSetPreflightCheckKubernetesVersionSkew MachineSetPreflightCheck = "KubernetesVersionSkew"
+
+	// MachineSetPreflightCheckControlPlaneIsStable is the name of the preflight check
+	// that verifies if the control plane is not provisioning and not upgrading.
+	// The preflight check is only run if a ControlPlane is used (controlPlaneRef must exist in the Cluster)
+	// and the ControlPlane has a version.
+	MachineSetPreflightCheckControlPlaneIsStable MachineSetPreflightCheck = "ControlPlaneIsStable"
+)
+
 // NodeUninitializedTaint can be added to Nodes at creation by the bootstrap provider, e.g. the
 // KubeadmBootstrap provider will add the taint.
 // This taint is used to prevent workloads to be scheduled on Nodes before the node is initialized by Cluster API.
diff --git a/api/v1beta1/machinedeployment_webhook.go b/api/v1beta1/machinedeployment_webhook.go
index 797b3b97e25b..472c01afd5f5 100644
--- a/api/v1beta1/machinedeployment_webhook.go
+++ b/api/v1beta1/machinedeployment_webhook.go
@@ -36,6 +36,7 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/webhook"
 	"sigs.k8s.io/controller-runtime/pkg/webhook/admission"
 
+	"sigs.k8s.io/cluster-api/feature"
 	"sigs.k8s.io/cluster-api/util/version"
 )
 
@@ -214,6 +215,14 @@ func (m *MachineDeployment) validate(old *MachineDeployment) error {
 		)
 	}
 
+	// MachineSet preflight checks that should be skipped could also be set as annotation on the MachineDeployment
+	// since MachineDeployment annotations are synced to the MachineSet.
+ if feature.Gates.Enabled(feature.MachineSetPreflightChecks) { + if err := validateSkippedMachineSetPreflightChecks(m); err != nil { + allErrs = append(allErrs, err) + } + } + if old != nil && old.Spec.ClusterName != m.Spec.ClusterName { allErrs = append( allErrs, diff --git a/api/v1beta1/machineset_webhook.go b/api/v1beta1/machineset_webhook.go index cf5d76b7504a..a800efbab860 100644 --- a/api/v1beta1/machineset_webhook.go +++ b/api/v1beta1/machineset_webhook.go @@ -24,11 +24,14 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/webhook" "sigs.k8s.io/controller-runtime/pkg/webhook/admission" + "sigs.k8s.io/cluster-api/feature" capilabels "sigs.k8s.io/cluster-api/internal/labels" "sigs.k8s.io/cluster-api/util/version" ) @@ -120,6 +123,12 @@ func (m *MachineSet) validate(old *MachineSet) error { ) } + if feature.Gates.Enabled(feature.MachineSetPreflightChecks) { + if err := validateSkippedMachineSetPreflightChecks(m); err != nil { + allErrs = append(allErrs, err) + } + } + if old != nil && old.Spec.ClusterName != m.Spec.ClusterName { allErrs = append( allErrs, @@ -149,3 +158,37 @@ func (m *MachineSet) validate(old *MachineSet) error { return apierrors.NewInvalid(GroupVersion.WithKind("MachineSet").GroupKind(), m.Name, allErrs) } + +func validateSkippedMachineSetPreflightChecks(o client.Object) *field.Error { + if o == nil { + return nil + } + skip := o.GetAnnotations()[MachineSetSkipPreflightChecksAnnotation] + if skip == "" { + return nil + } + + supported := sets.New[MachineSetPreflightCheck]( + MachineSetPreflightCheckAll, + MachineSetPreflightCheckKubeadmVersionSkew, + MachineSetPreflightCheckKubernetesVersionSkew, + MachineSetPreflightCheckControlPlaneIsStable, + ) + + skippedList := strings.Split(skip, ",") + invalid := []MachineSetPreflightCheck{} + for i := range skippedList { + skipped := MachineSetPreflightCheck(strings.TrimSpace(skippedList[i])) + if !supported.Has(skipped) { + invalid = append(invalid, skipped) + } + } + if len(invalid) > 0 { + return field.Invalid( + field.NewPath("metadata", "annotations", MachineSetSkipPreflightChecksAnnotation), + invalid, + fmt.Sprintf("skipped preflight check(s) must be among: %v", sets.List(supported)), + ) + } + return nil +} diff --git a/api/v1beta1/machineset_webhook_test.go b/api/v1beta1/machineset_webhook_test.go index 01af9147a25e..32bcfc60b952 100644 --- a/api/v1beta1/machineset_webhook_test.go +++ b/api/v1beta1/machineset_webhook_test.go @@ -228,3 +228,73 @@ func TestMachineSetVersionValidation(t *testing.T) { }) } } + +func TestValidateSkippedMachineSetPreflightChecks(t *testing.T) { + tests := []struct { + name string + ms *MachineSet + expectErr bool + }{ + { + name: "should pass if the machine set skip preflight checks annotation is not set", + ms: &MachineSet{}, + expectErr: false, + }, + { + name: "should pass if not preflight checks are skipped", + ms: &MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + MachineSetSkipPreflightChecksAnnotation: "", + }, + }, + }, + expectErr: false, + }, + { + name: "should pass if only valid preflight checks are skipped (single)", + ms: &MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + MachineSetSkipPreflightChecksAnnotation: 
string(MachineSetPreflightCheckKubeadmVersionSkew), + }, + }, + }, + expectErr: false, + }, + { + name: "should pass if only valid preflight checks are skipped (multiple)", + ms: &MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + MachineSetSkipPreflightChecksAnnotation: string(MachineSetPreflightCheckKubeadmVersionSkew) + "," + string(MachineSetPreflightCheckControlPlaneIsStable), + }, + }, + }, + expectErr: false, + }, + { + name: "should fail if invalid preflight checks are skipped", + ms: &MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + MachineSetSkipPreflightChecksAnnotation: string(MachineSetPreflightCheckKubeadmVersionSkew) + ",invalid-preflight-check-name", + }, + }, + }, + expectErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + err := validateSkippedMachineSetPreflightChecks(tt.ms) + if tt.expectErr { + g.Expect(err).NotTo(BeNil()) + } else { + g.Expect(err).To(BeNil()) + } + }) + } +} diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 5ac9b39670e8..8a2cd9c3c23d 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -22,7 +22,7 @@ spec: args: - "--leader-elect" - "--metrics-bind-addr=localhost:8080" - - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=false},ClusterResourceSet=${EXP_CLUSTER_RESOURCE_SET:=false},ClusterTopology=${CLUSTER_TOPOLOGY:=false},RuntimeSDK=${EXP_RUNTIME_SDK:=false}" + - "--feature-gates=MachinePool=${EXP_MACHINE_POOL:=false},ClusterResourceSet=${EXP_CLUSTER_RESOURCE_SET:=false},ClusterTopology=${CLUSTER_TOPOLOGY:=false},RuntimeSDK=${EXP_RUNTIME_SDK:=false},MachineSetPreflightChecks=${EXP_MACHINE_SET_PREFLIGHT_CHECKS:=false}" image: controller:latest name: manager env: diff --git a/docs/book/src/SUMMARY.md b/docs/book/src/SUMMARY.md index d1c84e7b165d..7db2e461d1df 100644 --- a/docs/book/src/SUMMARY.md +++ b/docs/book/src/SUMMARY.md @@ -26,6 +26,7 @@ - [Healthchecking](./tasks/automated-machine-management/healthchecking.md) - [Experimental Features](./tasks/experimental-features/experimental-features.md) - [MachinePools](./tasks/experimental-features/machine-pools.md) + - [MachineSetPreflightChecks](./tasks/experimental-features/machineset-preflight-checks.md) - [ClusterResourceSet](./tasks/experimental-features/cluster-resource-set.md) - [ClusterClass](./tasks/experimental-features/cluster-class/index.md) - [Writing a ClusterClass](./tasks/experimental-features/cluster-class/write-clusterclass.md) diff --git a/docs/book/src/developer/testing.md b/docs/book/src/developer/testing.md index 5cbd1fe58125..adb54a1264ac 100644 --- a/docs/book/src/developer/testing.md +++ b/docs/book/src/developer/testing.md @@ -267,6 +267,7 @@ kustomize_substitutions: EXP_CLUSTER_RESOURCE_SET: "true" EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true" EXP_RUNTIME_SDK: "true" + EXP_MACHINE_SET_PREFLIGHT_CHECKS: "true" ``` diff --git a/docs/book/src/developer/tilt.md b/docs/book/src/developer/tilt.md index 50be14bbd10b..4a25dc78f870 100644 --- a/docs/book/src/developer/tilt.md +++ b/docs/book/src/developer/tilt.md @@ -110,6 +110,7 @@ kustomize_substitutions: EXP_CLUSTER_RESOURCE_SET: "true" EXP_KUBEADM_BOOTSTRAP_FORMAT_IGNITION: "true" EXP_RUNTIME_SDK: "true" + EXP_MACHINE_SET_PREFLIGHT_CHECKS: "true" ``` {{#tabs name:"tab-tilt-kustomize-substitution" tabs:"AWS,Azure,DigitalOcean,GCP,vSphere"}} diff --git a/docs/book/src/reference/labels_and_annotations.md b/docs/book/src/reference/labels_and_annotations.md index 
2b34e038a73c..bbc2f00707a7 100644 --- a/docs/book/src/reference/labels_and_annotations.md +++ b/docs/book/src/reference/labels_and_annotations.md @@ -19,38 +19,39 @@ **Supported Annotations:** -| Annotation | Note | -|:-----------------------------------------------------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| clusterctl.cluster.x-k8s.io/skip-crd-name-preflight-check | Can be placed on provider CRDs, so that clusterctl doesn't emit an error if the CRD doesn't comply with Cluster APIs naming scheme. Only CRDs that are referenced by core Cluster API CRDs have to comply with the naming scheme. | -| clusterctl.cluster.x-k8s.io/delete-for-move | DeleteForMoveAnnotation will be set to objects that are going to be deleted from the source cluster after being moved to the target cluster during the clusterctl move operation. It will help any validation webhook to take decision based on it. | -| unsafe.topology.cluster.x-k8s.io/disable-update-class-name-check | It can be used to disable the webhook check on update that disallows a pre-existing Cluster to be populated with Topology information and Class. | -| cluster.x-k8s.io/cluster-name | It is set on nodes identifying the name of the cluster the node belongs to. | -| cluster.x-k8s.io/cluster-namespace | It is set on nodes identifying the namespace of the cluster the node belongs to. | -| cluster.x-k8s.io/machine | It is set on nodes identifying the machine the node belongs to. | -| cluster.x-k8s.io/owner-kind | It is set on nodes identifying the owner kind. | -| cluster.x-k8s.io/owner-name | It is set on nodes identifying the owner name. | -| cluster.x-k8s.io/paused | It can be applied to any Cluster API object to prevent a controller from processing a resource. Controllers working with Cluster API objects must check the existence of this annotation on the reconciled object. | -| cluster.x-k8s.io/disable-machine-create | It can be used to signal a MachineSet to stop creating new machines. It is utilized in the OnDelete MachineDeploymentStrategy to allow the MachineDeployment controller to scale down older MachineSets when Machines are deleted and add the new replicas to the latest MachineSet. | -| cluster.x-k8s.io/delete-machine | It marks control plane and worker nodes that will be given priority for deletion when KCP or a MachineSet scales down. It is given top priority on all delete policies. | -| cluster.x-k8s.io/cloned-from-name | It is the infrastructure machine annotation that stores the name of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. | -| cluster.x-k8s.io/cloned-from-groupkind | It is the infrastructure machine annotation that stores the group-kind of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. 
| -| cluster.x-k8s.io/skip-remediation | It is used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler. | -| cluster.x-k8s.io/managed-by | It can be applied to InfraCluster resources to signify that some external system is managing the cluster infrastructure. Provider InfraCluster controllers will ignore resources with this annotation. An external controller must fulfill the contract of the InfraCluster resource. External infrastructure providers should ensure that the annotation, once set, cannot be removed. | -| cluster.x-k8s.io/replicas-managed-by | It can be applied to MachinePool resources to signify that some external system is managing infrastructure scaling for that pool. See [the MachinePool documentation](../developer/architecture/controllers/machine-pool.md#externally-managed-autoscaler) for more details. | -| topology.cluster.x-k8s.io/defer-upgrade | It can be used to defer the Kubernetes upgrade of a single MachineDeployment topology. If the annotation is set on a MachineDeployment topology in Cluster.spec.topology.workers, the Kubernetes upgrade for this MachineDeployment topology is deferred. It doesn't affect other MachineDeployment topologies. | -| topology.cluster.x-k8s.io/dry-run | It is an annotation that gets set on objects by the topology controller only during a server side dry run apply operation. It is used for validating update webhooks for objects which get updated by template rotation (e.g. InfrastructureMachineTemplate). When the annotation is set and the admission request is a dry run, the webhook should deny validation due to immutability. By that the request will succeed (without any changes to the actual object because it is a dry run) and the topology controller will receive the resulting object. | -| topology.cluster.x-k8s.io/hold-upgrade-sequence | It can be used to hold the entire MachineDeployment upgrade sequence. If the annotation is set on a MachineDeployment topology in Cluster.spec.topology.workers, the Kubernetes upgrade for this MachineDeployment topology and all subsequent ones is deferred. | -| machine.cluster.x-k8s.io/certificates-expiry | It captures the expiry date of the machine certificates in RFC3339 format. It is used to trigger rollout of control plane machines before certificates expire. It can be set on BootstrapConfig and Machine objects. The value set on Machine object takes precedence. The annotation is only used by control plane machines. | -| machine.cluster.x-k8s.io/exclude-node-draining | It explicitly skips node draining if set. | -| machine.cluster.x-k8s.io/exclude-wait-for-node-volume-detach | It explicitly skips the waiting for node volume detaching if set. | -| pre-drain.delete.hook.machine.cluster.x-k8s.io | It specifies the prefix we search each annotation for during the pre-drain.delete lifecycle hook to pause reconciliation of deletion. These hooks will prevent removal of draining the associated node until all are removed. | -| pre-terminate.delete.hook.machine.cluster.x-k8s.io | It specifies the prefix we search each annotation for during the pre-terminate.delete lifecycle hook to pause reconciliation of deletion. These hooks will prevent removal of an instance from an infrastructure provider until all are removed. | -| machinedeployment.clusters.x-k8s.io/revision | It is the revision annotation of a machine deployment's machine sets which records its rollout sequence. 
| -| machinedeployment.clusters.x-k8s.io/revision-history | It maintains the history of all old revisions that a machine set has served for a machine deployment. | -| machinedeployment.clusters.x-k8s.io/desired-replicas | It is the desired replicas for a machine deployment recorded as an annotation in its machine sets. Helps in separating scaling events from the rollout process and for determining if the new machine set for a deployment is really saturated. | -| machinedeployment.clusters.x-k8s.io/max-replicas | It is the maximum replicas a deployment can have at a given point, which is machinedeployment.spec.replicas + maxSurge. Used by the underlying machine sets to estimate their proportions in case the deployment has surge replicas. | -| controlplane.cluster.x-k8s.io/skip-coredns | It explicitly skips reconciling CoreDNS if set. | -| controlplane.cluster.x-k8s.io/skip-kube-proxy | It explicitly skips reconciling kube-proxy if set. | -| controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration | It is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration. This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP. | -| controlplane.cluster.x-k8s.io/remediation-in-progress | It is a KCP annotation that tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. | -| controlplane.cluster.x-k8s.io/remediation-for | It is a machine annotation that links a new machine to the unhealthy machine it is replacing. | +| Annotation | Note | +|:-----------------------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| clusterctl.cluster.x-k8s.io/skip-crd-name-preflight-check | Can be placed on provider CRDs, so that clusterctl doesn't emit an error if the CRD doesn't comply with Cluster APIs naming scheme. Only CRDs that are referenced by core Cluster API CRDs have to comply with the naming scheme. | +| clusterctl.cluster.x-k8s.io/delete-for-move | DeleteForMoveAnnotation will be set to objects that are going to be deleted from the source cluster after being moved to the target cluster during the clusterctl move operation. It will help any validation webhook to take decision based on it. | +| unsafe.topology.cluster.x-k8s.io/disable-update-class-name-check | It can be used to disable the webhook check on update that disallows a pre-existing Cluster to be populated with Topology information and Class. | +| cluster.x-k8s.io/cluster-name | It is set on nodes identifying the name of the cluster the node belongs to. | +| cluster.x-k8s.io/cluster-namespace | It is set on nodes identifying the namespace of the cluster the node belongs to. | +| cluster.x-k8s.io/machine | It is set on nodes identifying the machine the node belongs to. | +| cluster.x-k8s.io/owner-kind | It is set on nodes identifying the owner kind. | +| cluster.x-k8s.io/owner-name | It is set on nodes identifying the owner name. 
+| cluster.x-k8s.io/paused | It can be applied to any Cluster API object to prevent a controller from processing a resource. Controllers working with Cluster API objects must check the existence of this annotation on the reconciled object. |
+| cluster.x-k8s.io/disable-machine-create | It can be used to signal a MachineSet to stop creating new machines. It is utilized in the OnDelete MachineDeploymentStrategy to allow the MachineDeployment controller to scale down older MachineSets when Machines are deleted and add the new replicas to the latest MachineSet. |
+| cluster.x-k8s.io/delete-machine | It marks control plane and worker nodes that will be given priority for deletion when KCP or a MachineSet scales down. It is given top priority on all delete policies. |
+| cluster.x-k8s.io/cloned-from-name | It is the infrastructure machine annotation that stores the name of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. |
+| cluster.x-k8s.io/cloned-from-groupkind | It is the infrastructure machine annotation that stores the group-kind of the infrastructure template resource that was cloned for the machine. This annotation is set only during cloning a template. Older/adopted machines will not have this annotation. |
+| cluster.x-k8s.io/skip-remediation | It is used to mark the machines that should not be considered for remediation by MachineHealthCheck reconciler. |
+| cluster.x-k8s.io/managed-by | It can be applied to InfraCluster resources to signify that some external system is managing the cluster infrastructure. Provider InfraCluster controllers will ignore resources with this annotation. An external controller must fulfill the contract of the InfraCluster resource. External infrastructure providers should ensure that the annotation, once set, cannot be removed. |
+| cluster.x-k8s.io/replicas-managed-by | It can be applied to MachinePool resources to signify that some external system is managing infrastructure scaling for that pool. See [the MachinePool documentation](../developer/architecture/controllers/machine-pool.md#externally-managed-autoscaler) for more details. |
+| machineset.cluster.x-k8s.io/skip-preflight-checks | It can be applied on MachineDeployment and MachineSet resources to specify a comma-separated list of preflight checks that should be skipped during MachineSet reconciliation. Supported preflight checks are: All, KubeadmVersionSkew, KubernetesVersionSkew, ControlPlaneIsStable. |
+| topology.cluster.x-k8s.io/defer-upgrade | It can be used to defer the Kubernetes upgrade of a single MachineDeployment topology. If the annotation is set on a MachineDeployment topology in Cluster.spec.topology.workers, the Kubernetes upgrade for this MachineDeployment topology is deferred. It doesn't affect other MachineDeployment topologies. |
+| topology.cluster.x-k8s.io/dry-run | It is an annotation that gets set on objects by the topology controller only during a server side dry run apply operation. It is used for validating update webhooks for objects which get updated by template rotation (e.g. InfrastructureMachineTemplate). When the annotation is set and the admission request is a dry run, the webhook should deny validation due to immutability. By that the request will succeed (without any changes to the actual object because it is a dry run) and the topology controller will receive the resulting object. |
| +| topology.cluster.x-k8s.io/hold-upgrade-sequence | It can be used to hold the entire MachineDeployment upgrade sequence. If the annotation is set on a MachineDeployment topology in Cluster.spec.topology.workers, the Kubernetes upgrade for this MachineDeployment topology and all subsequent ones is deferred. | +| machine.cluster.x-k8s.io/certificates-expiry | It captures the expiry date of the machine certificates in RFC3339 format. It is used to trigger rollout of control plane machines before certificates expire. It can be set on BootstrapConfig and Machine objects. The value set on Machine object takes precedence. The annotation is only used by control plane machines. | +| machine.cluster.x-k8s.io/exclude-node-draining | It explicitly skips node draining if set. | +| machine.cluster.x-k8s.io/exclude-wait-for-node-volume-detach | It explicitly skips the waiting for node volume detaching if set. | +| pre-drain.delete.hook.machine.cluster.x-k8s.io | It specifies the prefix we search each annotation for during the pre-drain.delete lifecycle hook to pause reconciliation of deletion. These hooks will prevent removal of draining the associated node until all are removed. | +| pre-terminate.delete.hook.machine.cluster.x-k8s.io | It specifies the prefix we search each annotation for during the pre-terminate.delete lifecycle hook to pause reconciliation of deletion. These hooks will prevent removal of an instance from an infrastructure provider until all are removed. | +| machinedeployment.clusters.x-k8s.io/revision | It is the revision annotation of a machine deployment's machine sets which records its rollout sequence. | +| machinedeployment.clusters.x-k8s.io/revision-history | It maintains the history of all old revisions that a machine set has served for a machine deployment. | +| machinedeployment.clusters.x-k8s.io/desired-replicas | It is the desired replicas for a machine deployment recorded as an annotation in its machine sets. Helps in separating scaling events from the rollout process and for determining if the new machine set for a deployment is really saturated. | +| machinedeployment.clusters.x-k8s.io/max-replicas | It is the maximum replicas a deployment can have at a given point, which is machinedeployment.spec.replicas + maxSurge. Used by the underlying machine sets to estimate their proportions in case the deployment has surge replicas. | +| controlplane.cluster.x-k8s.io/skip-coredns | It explicitly skips reconciling CoreDNS if set. | +| controlplane.cluster.x-k8s.io/skip-kube-proxy | It explicitly skips reconciling kube-proxy if set. | +| controlplane.cluster.x-k8s.io/kubeadm-cluster-configuration | It is a machine annotation that stores the json-marshalled string of KCP ClusterConfiguration. This annotation is used to detect any changes in ClusterConfiguration and trigger machine rollout in KCP. | +| controlplane.cluster.x-k8s.io/remediation-in-progress | It is a KCP annotation that tracks that the system is in between having deleted an unhealthy machine and recreating its replacement. | +| controlplane.cluster.x-k8s.io/remediation-for | It is a machine annotation that links a new machine to the unhealthy machine it is replacing. 
| diff --git a/docs/book/src/tasks/experimental-features/experimental-features.md b/docs/book/src/tasks/experimental-features/experimental-features.md index a03012fc2153..97e036182fe1 100644 --- a/docs/book/src/tasks/experimental-features/experimental-features.md +++ b/docs/book/src/tasks/experimental-features/experimental-features.md @@ -34,6 +34,7 @@ variables: EXP_MACHINE_POOL: "true" CLUSTER_TOPOLOGY: "true" EXP_RUNTIME_SDK: "true" + EXP_MACHINE_SET_PREFLIGHT_CHECKS: "true" ``` Another way is to set them as environmental variables before running e2e tests. @@ -48,6 +49,7 @@ kustomize_substitutions: EXP_MACHINE_POOL: 'true' CLUSTER_TOPOLOGY: 'true' EXP_RUNTIME_SDK: 'true' + EXP_MACHINE_SET_PREFLIGHT_CHECKS: 'true' ``` For more details on setting up a development environment with `tilt`, see [Developing Cluster API with Tilt](../../developer/tilt.md) diff --git a/docs/book/src/tasks/experimental-features/machineset-preflight-checks.md b/docs/book/src/tasks/experimental-features/machineset-preflight-checks.md new file mode 100644 index 000000000000..4104da963f6c --- /dev/null +++ b/docs/book/src/tasks/experimental-features/machineset-preflight-checks.md @@ -0,0 +1,12 @@ +# Experimental Feature: MachineSetPreflightChecks (alpha) + +The `MachineSetPreflightChecks` feature can provide additional safety while creating new Machines for a MachineSet. + +**Feature gate name**: `MachineSetPreflightChecks` + +**Variable name to enable/disable the feature gate**: `EXP_MACHINE_SET_PREFLIGHT_CHECKS` + +The following preflight checks are performed when the feature is enabled: +* ControlPlaneIsStable +* KubeadmVersionSkew +* KubernetesVersionSkew \ No newline at end of file diff --git a/feature/feature.go b/feature/feature.go index 79ab03fc305d..ff2bb33dc4bf 100644 --- a/feature/feature.go +++ b/feature/feature.go @@ -55,6 +55,11 @@ const ( // // alpha: v1.1 KubeadmBootstrapFormatIgnition featuregate.Feature = "KubeadmBootstrapFormatIgnition" + + // MachineSetPreflightChecks is a feature gate for the MachineSet preflight checks functionality. + // + // alpha: v1.5 + MachineSetPreflightChecks featuregate.Feature = "MachineSetPreflightChecks" ) func init() { @@ -70,4 +75,5 @@ var defaultClusterAPIFeatureGates = map[featuregate.Feature]featuregate.FeatureS ClusterTopology: {Default: false, PreRelease: featuregate.Alpha}, KubeadmBootstrapFormatIgnition: {Default: false, PreRelease: featuregate.Alpha}, RuntimeSDK: {Default: false, PreRelease: featuregate.Alpha}, + MachineSetPreflightChecks: {Default: false, PreRelease: featuregate.Alpha}, } diff --git a/internal/contract/controlplane.go b/internal/contract/controlplane.go index 3148477af309..df322ddcda06 100644 --- a/internal/contract/controlplane.go +++ b/internal/contract/controlplane.go @@ -157,7 +157,7 @@ func (c *ControlPlaneContract) IsProvisioning(obj *unstructured.Unstructured) (b // assume that the control plane is being created for the first time. 
statusVersion, err := c.StatusVersion().Get(obj) if err != nil { - if errors.Is(err, errNotFound) { + if errors.Is(err, ErrFieldNotFound) { return true, nil } return false, errors.Wrap(err, "failed to get control plane status version") @@ -183,7 +183,7 @@ func (c *ControlPlaneContract) IsUpgrading(obj *unstructured.Unstructured) (bool } statusVersion, err := c.StatusVersion().Get(obj) if err != nil { - if errors.Is(err, errNotFound) { // status version is not yet set + if errors.Is(err, ErrFieldNotFound) { // status version is not yet set // If the status.version is not yet present in the object, it implies the // first machine of the control plane is provisioning. We can reasonably assume // that the control plane is not upgrading at this stage. @@ -216,7 +216,7 @@ func (c *ControlPlaneContract) IsScaling(obj *unstructured.Unstructured) (bool, statusReplicas, err := c.StatusReplicas().Get(obj) if err != nil { - if errors.Is(err, errNotFound) { + if errors.Is(err, ErrFieldNotFound) { // status is probably not yet set on the control plane // if status is missing we can consider the control plane to be scaling // so that we can block any operations that expect control plane to be stable. @@ -227,7 +227,7 @@ func (c *ControlPlaneContract) IsScaling(obj *unstructured.Unstructured) (bool, updatedReplicas, err := c.UpdatedReplicas().Get(obj) if err != nil { - if errors.Is(err, errNotFound) { + if errors.Is(err, ErrFieldNotFound) { // If updatedReplicas is not set on the control plane // we should consider the control plane to be scaling so that // we block any operation that expect the control plane to be stable. @@ -238,7 +238,7 @@ func (c *ControlPlaneContract) IsScaling(obj *unstructured.Unstructured) (bool, readyReplicas, err := c.ReadyReplicas().Get(obj) if err != nil { - if errors.Is(err, errNotFound) { + if errors.Is(err, ErrFieldNotFound) { // If readyReplicas is not set on the control plane // we should consider the control plane to be scaling so that // we block any operation that expect the control plane to be stable. @@ -249,7 +249,7 @@ func (c *ControlPlaneContract) IsScaling(obj *unstructured.Unstructured) (bool, unavailableReplicas, err := c.UnavailableReplicas().Get(obj) if err != nil { - if !errors.Is(err, errNotFound) { + if !errors.Is(err, ErrFieldNotFound) { return false, errors.Wrap(err, "failed to get control plane status unavailableReplicas") } // If unavailableReplicas is not set on the control plane we assume it is 0. 
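The rename from the package-private errNotFound to the exported contract.ErrFieldNotFound above is what allows callers outside the contract package, such as the MachineSet preflight checks introduced later in this patch, to treat a not-yet-populated field as a normal condition instead of a hard failure. A minimal sketch of that consuming pattern, using the contract helpers already shown in this patch (the helper function name is illustrative and not part of the change):

// controlPlaneVersionOrNil is a hypothetical caller-side helper: it reads an
// optional field through the contract helpers and treats "field not found"
// as "not set yet" rather than as a reconcile error.
func controlPlaneVersionOrNil(controlPlane *unstructured.Unstructured) (*string, error) {
	version, err := contract.ControlPlane().Version().Get(controlPlane)
	if err != nil {
		if errors.Is(err, contract.ErrFieldNotFound) {
			// The control plane does not expose a version (yet); callers skip version-based checks.
			return nil, nil
		}
		return nil, errors.Wrap(err, "failed to get control plane version")
	}
	return version, nil
}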
diff --git a/internal/contract/infrastructure_cluster.go b/internal/contract/infrastructure_cluster.go index e1055fea3577..d295c70d7f60 100644 --- a/internal/contract/infrastructure_cluster.go +++ b/internal/contract/infrastructure_cluster.go @@ -137,7 +137,7 @@ func (d *FailureDomains) Get(obj *unstructured.Unstructured) (*clusterv1.Failure return nil, errors.Wrapf(err, "failed to get %s from object", "."+strings.Join(d.path, ".")) } if !ok { - return nil, errors.Wrapf(errNotFound, "path %s", "."+strings.Join(d.path, ".")) + return nil, errors.Wrapf(ErrFieldNotFound, "path %s", "."+strings.Join(d.path, ".")) } domains := make(clusterv1.FailureDomains, len(domainMap)) diff --git a/internal/contract/infrastructure_machine.go b/internal/contract/infrastructure_machine.go index ac63f5a96a93..a762cb3fe264 100644 --- a/internal/contract/infrastructure_machine.go +++ b/internal/contract/infrastructure_machine.go @@ -101,7 +101,7 @@ func (m *MachineAddresses) Get(obj *unstructured.Unstructured) (*[]clusterv1.Mac return nil, errors.Wrapf(err, "failed to get %s from object", "."+strings.Join(m.path, ".")) } if !ok { - return nil, errors.Wrapf(errNotFound, "path %s", "."+strings.Join(m.path, ".")) + return nil, errors.Wrapf(ErrFieldNotFound, "path %s", "."+strings.Join(m.path, ".")) } addresses := make([]clusterv1.MachineAddress, len(slice)) diff --git a/internal/contract/types.go b/internal/contract/types.go index a7a2b28fbda1..e6f1f6e80c4d 100644 --- a/internal/contract/types.go +++ b/internal/contract/types.go @@ -25,7 +25,8 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) -var errNotFound = errors.New("not found") +// ErrFieldNotFound is returned when a field is not found in the object. +var ErrFieldNotFound = errors.New("field not found") // Path defines a how to access a field in an Unstructured object. 
type Path []string @@ -88,7 +89,7 @@ func (i *Int64) Get(obj *unstructured.Unstructured) (*int64, error) { return nil, errors.Wrapf(err, "failed to get %s from object", "."+strings.Join(i.path, ".")) } if !ok { - return nil, errors.Wrapf(errNotFound, "path %s", "."+strings.Join(i.path, ".")) + return nil, errors.Wrapf(ErrFieldNotFound, "path %s", "."+strings.Join(i.path, ".")) } return &value, nil } @@ -118,7 +119,7 @@ func (b *Bool) Get(obj *unstructured.Unstructured) (*bool, error) { return nil, errors.Wrapf(err, "failed to get %s from object", "."+strings.Join(b.path, ".")) } if !ok { - return nil, errors.Wrapf(errNotFound, "path %s", "."+strings.Join(b.path, ".")) + return nil, errors.Wrapf(ErrFieldNotFound, "path %s", "."+strings.Join(b.path, ".")) } return &value, nil } @@ -148,7 +149,7 @@ func (s *String) Get(obj *unstructured.Unstructured) (*string, error) { return nil, errors.Wrapf(err, "failed to get %s from object", "."+strings.Join(s.path, ".")) } if !ok { - return nil, errors.Wrapf(errNotFound, "path %s", "."+strings.Join(s.path, ".")) + return nil, errors.Wrapf(ErrFieldNotFound, "path %s", "."+strings.Join(s.path, ".")) } return &value, nil } @@ -178,7 +179,7 @@ func (i *Duration) Get(obj *unstructured.Unstructured) (*metav1.Duration, error) return nil, errors.Wrapf(err, "failed to get %s from object", "."+strings.Join(i.path, ".")) } if !ok { - return nil, errors.Wrapf(errNotFound, "path %s", "."+strings.Join(i.path, ".")) + return nil, errors.Wrapf(ErrFieldNotFound, "path %s", "."+strings.Join(i.path, ".")) } d := &metav1.Duration{} diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index 035009ce544c..4a3e76ae7693 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -298,24 +298,41 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, // Remediate failed Machines by deleting them. var errs []error + machinesToRemediate := make([]*clusterv1.Machine, 0, len(filteredMachines)) for _, machine := range filteredMachines { - log := log.WithValues("Machine", klog.KObj(machine)) // filteredMachines contains machines in deleting status to calculate correct status. // skip remediation for those in deleting status. if !machine.DeletionTimestamp.IsZero() { continue } if conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition) { - log.Info("Deleting machine because marked as unhealthy by the MachineHealthCheck controller") - patch := client.MergeFrom(machine.DeepCopy()) - if err := r.Client.Delete(ctx, machine); err != nil { - errs = append(errs, errors.Wrap(err, "failed to delete")) - continue - } - conditions.MarkTrue(machine, clusterv1.MachineOwnerRemediatedCondition) - if err := r.Client.Status().Patch(ctx, machine, patch); err != nil && !apierrors.IsNotFound(err) { - errs = append(errs, errors.Wrap(err, "failed to update status")) + machinesToRemediate = append(machinesToRemediate, machine) + } + } + + result := ctrl.Result{} + if len(machinesToRemediate) > 0 { + preflightChecksResult, err := r.runPreflightChecks(ctx, cluster, machineSet, "Machine Remediation") + if err != nil { + return ctrl.Result{}, err + } + // Delete the machines only if the preflight checks have passed. Do not delete machines if we cannot + // guarantee creating new machines. 
+ if preflightChecksResult.IsZero() { + for _, machine := range machinesToRemediate { + log.Info("Deleting machine because marked as unhealthy by the MachineHealthCheck controller") + patch := client.MergeFrom(machine.DeepCopy()) + if err := r.Client.Delete(ctx, machine); err != nil { + errs = append(errs, errors.Wrap(err, "failed to delete")) + continue + } + conditions.MarkTrue(machine, clusterv1.MachineOwnerRemediatedCondition) + if err := r.Client.Status().Patch(ctx, machine, patch); err != nil && !apierrors.IsNotFound(err) { + errs = append(errs, errors.Wrap(err, "failed to update status")) + } } + } else { + result = util.LowestNonZeroResult(result, preflightChecksResult) } } @@ -329,7 +346,8 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, return ctrl.Result{}, errors.Wrap(err, "failed to update Machines") } - syncErr := r.syncReplicas(ctx, machineSet, filteredMachines) + syncReplicasResult, syncErr := r.syncReplicas(ctx, cluster, machineSet, filteredMachines) + result = util.LowestNonZeroResult(result, syncReplicasResult) // Always updates status as machines come up or die. if err := r.updateStatus(ctx, cluster, machineSet, filteredMachines); err != nil { @@ -355,15 +373,18 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster, if machineSet.Spec.MinReadySeconds > 0 && machineSet.Status.ReadyReplicas == replicas && machineSet.Status.AvailableReplicas != replicas { - return ctrl.Result{RequeueAfter: time.Duration(machineSet.Spec.MinReadySeconds) * time.Second}, nil + minReadyResult := ctrl.Result{RequeueAfter: time.Duration(machineSet.Spec.MinReadySeconds) * time.Second} + result = util.LowestNonZeroResult(result, minReadyResult) + return result, nil } // Quickly reconcile until the nodes become Ready. if machineSet.Status.ReadyReplicas != replicas { - return ctrl.Result{RequeueAfter: 15 * time.Second}, nil + result = util.LowestNonZeroResult(result, ctrl.Result{RequeueAfter: 15 * time.Second}) + return result, nil } - return ctrl.Result{}, nil + return result, nil } // syncMachines updates Machines, InfrastructureMachine and BootstrapConfig to propagate in-place mutable fields @@ -444,10 +465,10 @@ func (r *Reconciler) syncMachines(ctx context.Context, machineSet *clusterv1.Mac } // syncReplicas scales Machine resources up or down. 
-func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine) error { +func (r *Reconciler) syncReplicas(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, machines []*clusterv1.Machine) (ctrl.Result, error) { log := ctrl.LoggerFrom(ctx) if ms.Spec.Replicas == nil { - return errors.Errorf("the Replicas field in Spec for machineset %v is nil, this should not be allowed", ms.Name) + return ctrl.Result{}, errors.Errorf("the Replicas field in Spec for machineset %v is nil, this should not be allowed", ms.Name) } diff := len(machines) - int(*(ms.Spec.Replicas)) switch { @@ -457,9 +478,15 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet, if ms.Annotations != nil { if _, ok := ms.Annotations[clusterv1.DisableMachineCreateAnnotation]; ok { log.Info("Automatic creation of new machines disabled for machine set") - return nil + return ctrl.Result{}, nil } } + + result, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up") + if err != nil || !result.IsZero() { + return result, err + } + var ( machineList []*clusterv1.Machine errs []error @@ -493,7 +520,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet, }) if err != nil { conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.BootstrapTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return errors.Wrapf(err, "failed to clone bootstrap configuration from %s %s while creating a machine", + return ctrl.Result{}, errors.Wrapf(err, "failed to clone bootstrap configuration from %s %s while creating a machine", ms.Spec.Template.Spec.Bootstrap.ConfigRef.Kind, klog.KRef(ms.Spec.Template.Spec.Bootstrap.ConfigRef.Namespace, ms.Spec.Template.Spec.Bootstrap.ConfigRef.Name)) } @@ -518,7 +545,7 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet, }) if err != nil { conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.InfrastructureTemplateCloningFailedReason, clusterv1.ConditionSeverityError, err.Error()) - return errors.Wrapf(err, "failed to clone infrastructure machine from %s %s while creating a machine", + return ctrl.Result{}, errors.Wrapf(err, "failed to clone infrastructure machine from %s %s while creating a machine", ms.Spec.Template.Spec.InfrastructureRef.Kind, klog.KRef(ms.Spec.Template.Spec.InfrastructureRef.Namespace, ms.Spec.Template.Spec.InfrastructureRef.Name)) } @@ -551,15 +578,15 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet, } if len(errs) > 0 { - return kerrors.NewAggregate(errs) + return ctrl.Result{}, kerrors.NewAggregate(errs) } - return r.waitForMachineCreation(ctx, machineList) + return ctrl.Result{}, r.waitForMachineCreation(ctx, machineList) case diff > 0: log.Info(fmt.Sprintf("MachineSet is scaling down to %d replicas by deleting %d machines", *(ms.Spec.Replicas), diff), "replicas", *(ms.Spec.Replicas), "machineCount", len(machines), "deletePolicy", ms.Spec.DeletePolicy) deletePriorityFunc, err := getDeletePriorityFunc(ms) if err != nil { - return err + return ctrl.Result{}, err } var errs []error @@ -581,12 +608,12 @@ func (r *Reconciler) syncReplicas(ctx context.Context, ms *clusterv1.MachineSet, } if len(errs) > 0 { - return kerrors.NewAggregate(errs) + return ctrl.Result{}, kerrors.NewAggregate(errs) } - return r.waitForMachineDeletion(ctx, machinesToDelete) + return ctrl.Result{}, r.waitForMachineDeletion(ctx, machinesToDelete) } - return nil + return 
ctrl.Result{}, nil } // computeDesiredMachine computes the desired Machine. diff --git a/internal/controllers/machineset/machineset_preflight.go b/internal/controllers/machineset/machineset_preflight.go new file mode 100644 index 000000000000..582413c7c306 --- /dev/null +++ b/internal/controllers/machineset/machineset_preflight.go @@ -0,0 +1,220 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machineset + +import ( + "context" + "fmt" + "strings" + "time" + + "github.com/blang/semver" + "github.com/pkg/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + kerrors "k8s.io/apimachinery/pkg/util/errors" + "k8s.io/apimachinery/pkg/util/sets" + "k8s.io/klog/v2" + "k8s.io/utils/pointer" + ctrl "sigs.k8s.io/controller-runtime" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" + "sigs.k8s.io/cluster-api/controllers/external" + "sigs.k8s.io/cluster-api/feature" + "sigs.k8s.io/cluster-api/internal/contract" +) + +type preflightCheckErrorMessage *string + +// preflightFailedRequeueAfter is used to requeue the MachineSet to re-verify the preflight checks if +// the preflight checks fail. +const preflightFailedRequeueAfter = 15 * time.Second + +func (r *Reconciler) runPreflightChecks(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, action string) (ctrl.Result, error) { + log := ctrl.LoggerFrom(ctx) + // If the MachineSetPreflightChecks feature gate is disabled return early. + if !feature.Gates.Enabled(feature.MachineSetPreflightChecks) { + return ctrl.Result{}, nil + } + + skipped := skippedPreflightChecks(ms) + // If all the preflight checks are skipped then return early. + if skipped.Has(clusterv1.MachineSetPreflightCheckAll) { + return ctrl.Result{}, nil + } + + // If the cluster does not have a control plane reference then there is nothing to do. Return early. + if cluster.Spec.ControlPlaneRef == nil { + return ctrl.Result{}, nil + } + + // Get the control plane object. + controlPlane, err := external.Get(ctx, r.Client, cluster.Spec.ControlPlaneRef, cluster.Namespace) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to get ControlPlane %s", action, klog.KRef(cluster.Spec.ControlPlaneRef.Namespace, cluster.Spec.ControlPlaneRef.Name)) + } + cpKlogRef := klog.KRef(controlPlane.GetNamespace(), controlPlane.GetName()) + + // If the Control Plane version is not set then we are dealing with a control plane that does not support version + // or a control plane where the version is not set. In both cases we cannot perform any preflight checks as + // we do not have enough information. Return early. 
+ cpVersion, err := contract.ControlPlane().Version().Get(controlPlane) + if err != nil { + if errors.Is(err, contract.ErrFieldNotFound) { + return ctrl.Result{}, nil + } + return ctrl.Result{}, errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to get the version of ControlPlane %s", action, cpKlogRef) + } + cpSemver, err := semver.ParseTolerant(*cpVersion) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to parse version %q of ControlPlane %s", action, *cpVersion, cpKlogRef) + } + + errList := []error{} + preflightCheckErrs := []preflightCheckErrorMessage{} + // Run the control-plane-stable preflight check. + if !skipped.Has(clusterv1.MachineSetPreflightCheckControlPlaneIsStable) { + preflightCheckErr, err := r.controlPlaneStablePreflightCheck(controlPlane) + if err != nil { + errList = append(errList, err) + } + if preflightCheckErr != nil { + preflightCheckErrs = append(preflightCheckErrs, preflightCheckErr) + } + } + + // Check the version skew policies only if version is defined in the MachineSet. + if ms.Spec.Template.Spec.Version != nil { + msVersion := *ms.Spec.Template.Spec.Version + msSemver, err := semver.ParseTolerant(msVersion) + if err != nil { + return ctrl.Result{}, errors.Wrapf(err, "failed to perform %q: failed to perform preflight checks: failed to parse version %q of MachineSet %s", action, msVersion, klog.KObj(ms)) + } + + // Run the kubernetes-version skew preflight check. + if !skipped.Has(clusterv1.MachineSetPreflightCheckKubernetesVersionSkew) { + preflightCheckErr := r.kubernetesVersionPreflightCheck(cpSemver, msSemver) + if preflightCheckErr != nil { + preflightCheckErrs = append(preflightCheckErrs, preflightCheckErr) + } + } + + // Run the kubeadm-version skew preflight check. + if !skipped.Has(clusterv1.MachineSetPreflightCheckKubeadmVersionSkew) { + preflightCheckErr, err := r.kubeadmVersionPreflightCheck(cpSemver, msSemver, ms) + if err != nil { + errList = append(errList, err) + } + if preflightCheckErr != nil { + preflightCheckErrs = append(preflightCheckErrs, preflightCheckErr) + } + } + } + + if len(errList) > 0 { + return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate(errList), "failed to perform %q: failed to perform preflight checks", action) + } + if len(preflightCheckErrs) > 0 { + preflightCheckErrStrings := []string{} + for _, v := range preflightCheckErrs { + preflightCheckErrStrings = append(preflightCheckErrStrings, *v) + } + log.Info(fmt.Sprintf("Performing %q on hold because %s. The operation will continue after the preflight check(s) pass", action, strings.Join(preflightCheckErrStrings, "; "))) + return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil + } + return ctrl.Result{}, nil +} + +func (r *Reconciler) controlPlaneStablePreflightCheck(controlPlane *unstructured.Unstructured) (preflightCheckErrorMessage, error) { + cpKlogRef := klog.KRef(controlPlane.GetNamespace(), controlPlane.GetName()) + + // Check that the control plane is not provisioning. 
+ isProvisioning, err := contract.ControlPlane().IsProvisioning(controlPlane) + if err != nil { + return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to check if ControlPlane %s is provisioning", clusterv1.MachineSetPreflightCheckControlPlaneIsStable, cpKlogRef) + } + if isProvisioning { + return pointer.String(fmt.Sprintf("ControlPlane %s is provisioning (%q preflight failed)", cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil + } + + // Check that the control plane is not upgrading. + isUpgrading, err := contract.ControlPlane().IsUpgrading(controlPlane) + if err != nil { + return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to check if the ControlPlane %s is upgrading", clusterv1.MachineSetPreflightCheckControlPlaneIsStable, cpKlogRef) + } + if isUpgrading { + return pointer.String(fmt.Sprintf("ControlPlane %s is upgrading (%q preflight failed)", cpKlogRef, clusterv1.MachineSetPreflightCheckControlPlaneIsStable)), nil + } + + return nil, nil +} + +func (r *Reconciler) kubernetesVersionPreflightCheck(cpSemver, msSemver semver.Version) preflightCheckErrorMessage { + // Check the Kubernetes version skew policy. + // => MS minor version cannot be greater than the Control Plane minor version. + // => MS minor version cannot be older than 2 minor versions of Control Plane. + // Kubernetes skew policy: https://kubernetes.io/releases/version-skew-policy/#kubelet + if msSemver.Minor > cpSemver.Minor { + return pointer.String(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to the kubernetes version skew policy as MachineSet version is higher than ControlPlane version (%q preflight failed)", msSemver.String(), cpSemver.String(), clusterv1.MachineSetPreflightCheckKubernetesVersionSkew)) + } + if msSemver.Minor < cpSemver.Minor-2 { + return pointer.String(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to the kubernetes version skew policy as MachineSet version is more than 2 minor versions older than the ControlPlane version (%q preflight failed)", msSemver.String(), cpSemver.String(), clusterv1.MachineSetPreflightCheckKubernetesVersionSkew)) + } + + return nil +} + +func (r *Reconciler) kubeadmVersionPreflightCheck(cpSemver, msSemver semver.Version, ms *clusterv1.MachineSet) (preflightCheckErrorMessage, error) { + // If the bootstrap.configRef is nil return early. + if ms.Spec.Template.Spec.Bootstrap.ConfigRef == nil { + return nil, nil + } + + // If using kubeadm bootstrap provider, check the kubeadm version skew policy. + // => MS version should match (major+minor) the Control Plane version. 
+ // kubeadm skew policy: https://kubernetes.io/docs/setup/production-environment/tools/kubeadm/create-cluster-kubeadm/#kubeadm-s-skew-against-kubeadm + bootstrapConfigRef := ms.Spec.Template.Spec.Bootstrap.ConfigRef + groupVersion, err := schema.ParseGroupVersion(bootstrapConfigRef.APIVersion) + if err != nil { + return nil, errors.Wrapf(err, "failed to perform %q preflight check: failed to parse bootstrap configRef APIVersion %s", clusterv1.MachineSetPreflightCheckKubeadmVersionSkew, bootstrapConfigRef.APIVersion) + } + kubeadmBootstrapProviderUsed := bootstrapConfigRef.Kind == "KubeadmConfigTemplate" && + groupVersion.Group == bootstrapv1.GroupVersion.Group + if kubeadmBootstrapProviderUsed { + if cpSemver.Minor != msSemver.Minor { + return pointer.String(fmt.Sprintf("MachineSet version (%s) and ControlPlane version (%s) do not conform to kubeadm version skew policy as kubeadm only supports joining with the same major+minor version as the control plane (%q preflight failed)", msSemver.String(), cpSemver.String(), clusterv1.MachineSetPreflightCheckKubeadmVersionSkew)), nil + } + } + return nil, nil +} + +func skippedPreflightChecks(ms *clusterv1.MachineSet) sets.Set[clusterv1.MachineSetPreflightCheck] { + skipped := sets.Set[clusterv1.MachineSetPreflightCheck]{} + if ms == nil { + return skipped + } + skip := ms.Annotations[clusterv1.MachineSetSkipPreflightChecksAnnotation] + if skip == "" { + return skipped + } + skippedList := strings.Split(skip, ",") + for i := range skippedList { + skipped.Insert(clusterv1.MachineSetPreflightCheck(strings.TrimSpace(skippedList[i]))) + } + return skipped +} diff --git a/internal/controllers/machineset/machineset_preflight_test.go b/internal/controllers/machineset/machineset_preflight_test.go new file mode 100644 index 000000000000..45adfb9d5f78 --- /dev/null +++ b/internal/controllers/machineset/machineset_preflight_test.go @@ -0,0 +1,548 @@ +/* +Copyright 2023 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package machineset + +import ( + "testing" + + . "github.com/onsi/gomega" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + utilfeature "k8s.io/component-base/featuregate/testing" + "k8s.io/utils/pointer" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1" + bootstrapv1 "sigs.k8s.io/cluster-api/bootstrap/kubeadm/api/v1beta1" + "sigs.k8s.io/cluster-api/feature" + "sigs.k8s.io/cluster-api/internal/contract" + "sigs.k8s.io/cluster-api/internal/test/builder" +) + +func TestMachineSetReconciler_runPreflightChecks(t *testing.T) { + ns := "ns1" + + controlPlaneWithNoVersion := builder.ControlPlane(ns, "cp1").Build() + + controlPlaneWithInvalidVersion := builder.ControlPlane(ns, "cp1"). + WithVersion("v1.25.6.0").Build() + + controlPlaneProvisioning := builder.ControlPlane(ns, "cp1"). 
+ WithVersion("v1.25.6").Build() + + controlPlaneUpgrading := builder.ControlPlane(ns, "cp1"). + WithVersion("v1.26.2"). + WithStatusFields(map[string]interface{}{ + "status.version": "v1.25.2", + }). + Build() + + controlPlaneStable := builder.ControlPlane(ns, "cp1"). + WithVersion("v1.26.2"). + WithStatusFields(map[string]interface{}{ + "status.version": "v1.26.2", + }). + Build() + + t.Run("should run preflight checks if the feature gate is enabled", func(t *testing.T) { + defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, true)() + + tests := []struct { + name string + cluster *clusterv1.Cluster + controlPlane *unstructured.Unstructured + machineSet *clusterv1.MachineSet + wantPass bool + wantErr bool + }{ + { + name: "should pass if cluster has no control plane", + cluster: &clusterv1.Cluster{}, + machineSet: &clusterv1.MachineSet{}, + wantPass: true, + }, + { + name: "should pass if the control plane version is not defined", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneWithNoVersion), + }, + }, + controlPlane: controlPlaneWithNoVersion, + machineSet: &clusterv1.MachineSet{}, + wantPass: true, + }, + { + name: "should error if the control plane version is invalid", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneWithInvalidVersion), + }, + }, + controlPlane: controlPlaneWithInvalidVersion, + machineSet: &clusterv1.MachineSet{}, + wantErr: true, + }, + { + name: "should pass if all preflight checks are skipped", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading), + }, + }, + controlPlane: controlPlaneUpgrading, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Annotations: map[string]string{ + clusterv1.MachineSetSkipPreflightChecksAnnotation: string(clusterv1.MachineSetPreflightCheckAll), + }, + }, + }, + wantPass: true, + }, + { + name: "control plane preflight check: should fail if the control plane is provisioning", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneProvisioning), + }, + }, + controlPlane: controlPlaneProvisioning, + machineSet: &clusterv1.MachineSet{}, + wantPass: false, + }, + { + name: "control plane preflight check: should fail if the control plane is upgrading", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading), + }, + }, + controlPlane: controlPlaneUpgrading, + machineSet: &clusterv1.MachineSet{}, + wantPass: false, + }, + { + name: "control plane preflight check: should pass if the control plane is upgrading but the preflight check is skipped", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading), + }, + }, + controlPlane: controlPlaneUpgrading, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Annotations: map[string]string{ + clusterv1.MachineSetSkipPreflightChecksAnnotation: 
string(clusterv1.MachineSetPreflightCheckControlPlaneIsStable), + }, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.26.2"), + Bootstrap: clusterv1.Bootstrap{ConfigRef: &corev1.ObjectReference{Kind: "KubeadmConfigTemplate"}}, + }, + }, + }, + }, + wantPass: true, + }, + { + name: "control plane preflight check: should pass if the control plane is stable", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{}, + wantPass: true, + }, + { + name: "should pass if the machine set version is not defined", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{}, + }, + wantPass: true, + }, + { + name: "should error if the machine set version is invalid", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.27.0.0"), + }, + }, + }, + }, + wantErr: true, + }, + { + name: "kubernetes version preflight check: should fail if the machine set minor version is greater than control plane minor version", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.27.0"), + }, + }, + }, + }, + wantPass: false, + }, + { + name: "kubernetes version preflight check: should fail if the machine set minor version is 2 older than control plane minor version", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.23.0"), + }, + }, + }, + }, + wantPass: false, + }, + { + name: "kubernetes version preflight check: should pass if the machine set minor version is greater than control plane minor version but the preflight check is skipped", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Annotations: map[string]string{ 
+ clusterv1.MachineSetSkipPreflightChecksAnnotation: string(clusterv1.MachineSetPreflightCheckKubernetesVersionSkew), + }, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.27.0"), + }, + }, + }, + }, + wantPass: true, + }, + { + name: "kubernetes version preflight check: should pass if the machine set minor version and control plane version conform to kubernetes version skew policy", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.24.0"), + }, + }, + }, + }, + wantPass: true, + }, + { + name: "kubeadm version preflight check: should fail if the machine set version is not equal (major+minor) to control plane version when using kubeadm bootstrap provider", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.25.5"), + Bootstrap: clusterv1.Bootstrap{ConfigRef: &corev1.ObjectReference{ + APIVersion: bootstrapv1.GroupVersion.String(), + Kind: "KubeadmConfigTemplate", + }}, + }, + }, + }, + }, + wantPass: false, + }, + { + name: "kubeadm version preflight check: should pass if the machine set is not using kubeadm bootstrap provider", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.25.0"), + }, + }, + }, + }, + wantPass: true, + }, + { + name: "kubeadm version preflight check: should pass if the machine set version and control plane version do not conform to kubeadm version skew policy but the preflight check is skipped", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Annotations: map[string]string{ + clusterv1.MachineSetSkipPreflightChecksAnnotation: "foobar," + string(clusterv1.MachineSetPreflightCheckKubeadmVersionSkew), + }, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.25.0"), + Bootstrap: clusterv1.Bootstrap{ConfigRef: &corev1.ObjectReference{ + APIVersion: bootstrapv1.GroupVersion.String(), + Kind: "KubeadmConfigTemplate", + }}, + }, + }, + }, + }, + wantPass: true, + }, + { + name: "kubeadm version preflight check: should pass if the machine set version and 
control plane version conform to kubeadm version skew when using kubeadm bootstrap provider", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.26.2"), + Bootstrap: clusterv1.Bootstrap{ConfigRef: &corev1.ObjectReference{ + APIVersion: bootstrapv1.GroupVersion.String(), + Kind: "KubeadmConfigTemplate", + }}, + }, + }, + }, + }, + wantPass: true, + }, + { + name: "kubeadm version preflight check: should error if the bootstrap ref APIVersion is invalid", + cluster: &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneStable), + }, + }, + controlPlane: controlPlaneStable, + machineSet: &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.26.2"), + Bootstrap: clusterv1.Bootstrap{ConfigRef: &corev1.ObjectReference{ + APIVersion: "bootstrap.cluster.x-k8s.io/v1beta1/invalid", + Kind: "KubeadmConfigTemplate", + }}, + }, + }, + }, + }, + wantErr: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + g := NewWithT(t) + objs := []client.Object{} + if tt.controlPlane != nil { + objs = append(objs, tt.controlPlane) + } + fakeClient := fake.NewClientBuilder().WithObjects(objs...).Build() + r := &Reconciler{Client: fakeClient} + result, err := r.runPreflightChecks(ctx, tt.cluster, tt.machineSet, "") + if tt.wantErr { + g.Expect(err).To(HaveOccurred()) + } else { + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(result.IsZero()).To(Equal(tt.wantPass)) + } + }) + } + }) + + t.Run("should not run the preflight checks if the feature gate is disabled", func(t *testing.T) { + defer utilfeature.SetFeatureGateDuringTest(t, feature.Gates, feature.MachineSetPreflightChecks, false)() + + g := NewWithT(t) + cluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.ClusterSpec{ + ControlPlaneRef: contract.ObjToRef(controlPlaneUpgrading), + }, + } + controlPlane := controlPlaneUpgrading + machineSet := &clusterv1.MachineSet{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + }, + Spec: clusterv1.MachineSetSpec{ + Template: clusterv1.MachineTemplateSpec{ + Spec: clusterv1.MachineSpec{ + Version: pointer.String("v1.26.0"), + Bootstrap: clusterv1.Bootstrap{ConfigRef: &corev1.ObjectReference{ + APIVersion: bootstrapv1.GroupVersion.String(), + Kind: "KubeadmConfigTemplate", + }}, + }, + }, + }, + } + fakeClient := fake.NewClientBuilder().WithObjects(controlPlane).Build() + r := &Reconciler{Client: fakeClient} + result, err := r.runPreflightChecks(ctx, cluster, machineSet, "") + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(result.IsZero()).To(BeTrue()) + }) +} diff --git a/test/e2e/config/docker.yaml b/test/e2e/config/docker.yaml index c66c62bbfbfd..ee26ea088ebe 100644 --- a/test/e2e/config/docker.yaml +++ b/test/e2e/config/docker.yaml @@ -281,6 +281,7 @@ variables: EXP_MACHINE_POOL: "true" CLUSTER_TOPOLOGY: "true" EXP_RUNTIME_SDK: "true" + EXP_MACHINE_SET_PREFLIGHT_CHECKS: "true" intervals: 
default/wait-controllers: ["3m", "10s"]
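
As a worked illustration of how the three checks added by this patch interact (using the stable control plane version from the tests above, v1.26.2; the MachineSet versions below are illustrative, not an exhaustive matrix):

- a MachineSet at v1.26.x passes both the KubernetesVersionSkew and the KubeadmVersionSkew checks;
- a MachineSet at v1.24.x or v1.25.x passes KubernetesVersionSkew (at most two minor versions older) but fails KubeadmVersionSkew if its bootstrap configRef points to a KubeadmConfigTemplate;
- a MachineSet at v1.27.x (newer than the control plane) or v1.23.x (more than two minor versions older) fails KubernetesVersionSkew;
- ControlPlaneIsStable fails independently of versions while the control plane is provisioning (no status version yet) or upgrading (spec and status versions differ).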
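
A minimal usage sketch for the skip annotation parsed by skippedPreflightChecks; the object name and namespace are hypothetical, and the check names are the MachineSetPreflightCheck constants defined in common_types.go:

```yaml
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineSet
metadata:
  name: ms-0             # hypothetical name
  namespace: default     # hypothetical namespace
  annotations:
    # Comma-separated list; whitespace around items is trimmed, and "All" skips every check.
    machineset.cluster.x-k8s.io/skip-preflight-checks: "ControlPlaneIsStable,KubeadmVersionSkew"
spec:
  # ... rest of the MachineSet spec unchanged ...
```

Unknown items (such as the "foobar" entry exercised in the tests) simply end up in the skipped set without matching any known check.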
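
The new behavior is gated behind feature.MachineSetPreflightChecks (toggled in the tests above), and the e2e config enables it through the EXP_MACHINE_SET_PREFLIGHT_CHECKS variable. Below is a sketch of setting the same variable for a management cluster via the clusterctl configuration file; the file location is the usual clusterctl default and an assumption here, and the exact flag plumbing is expected to live in config/manager/manager.yaml rather than in this note:

```yaml
# ~/.cluster-api/clusterctl.yaml (assumed default clusterctl config location)
# clusterctl substitutes this variable into the core provider manifest, which is
# expected to map it onto the MachineSetPreflightChecks feature gate of the manager.
EXP_MACHINE_SET_PREFLIGHT_CHECKS: "true"
```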