From a64cd41fa7cd1151058f18d65d430c2a6d0f354e Mon Sep 17 00:00:00 2001 From: Guillermo Gaston Date: Sat, 20 Jan 2024 22:02:44 +0000 Subject: [PATCH] Watch external objects for machine before deleting This fixes a race condition in the machine controller when the controller is restarted after a machine has been marked for deletion but before the infra machine or bootstrap config are deleted. In that case, the controller doesn't have watches on those external objects, so the reconciliation is not triggered once they are deleted. This means that Machines stay in Deleting state for the resync period, the default is 10 minutes. --- .../controllers/machine/machine_controller.go | 24 ++- .../machine/machine_controller_phases.go | 56 +++-- .../machine/machine_controller_test.go | 196 +++++++++++++++++- 3 files changed, 241 insertions(+), 35 deletions(-) diff --git a/internal/controllers/machine/machine_controller.go b/internal/controllers/machine/machine_controller.go index 31663a5de504..47c5b4002785 100644 --- a/internal/controllers/machine/machine_controller.go +++ b/internal/controllers/machine/machine_controller.go @@ -333,7 +333,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu if err != nil { switch err { case errNoControlPlaneNodes, errLastControlPlaneNode, errNilNodeRef, errClusterIsBeingDeleted, errControlPlaneIsBeingDeleted: - var nodeName = "" + nodeName := "" if m.Status.NodeRef != nil { nodeName = m.Status.NodeRef.Name } @@ -427,7 +427,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu return ctrl.Result{}, errors.Wrap(err, "failed to patch Machine") } - infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, m) + infrastructureDeleted, err := r.reconcileDeleteInfrastructure(ctx, cluster, m) if err != nil { return ctrl.Result{}, err } @@ -436,7 +436,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu return ctrl.Result{}, nil } - 
bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, m) + bootstrapDeleted, err := r.reconcileDeleteBootstrap(ctx, cluster, m) if err != nil { return ctrl.Result{}, err } @@ -723,8 +723,8 @@ func (r *Reconciler) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, return nil } -func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1.Machine) (bool, error) { - obj, err := r.reconcileDeleteExternal(ctx, m, m.Spec.Bootstrap.ConfigRef) +func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) { + obj, err := r.reconcileDeleteExternal(ctx, cluster, m, m.Spec.Bootstrap.ConfigRef) if err != nil { return false, err } @@ -743,8 +743,8 @@ func (r *Reconciler) reconcileDeleteBootstrap(ctx context.Context, m *clusterv1. return false, nil } -func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clusterv1.Machine) (bool, error) { - obj, err := r.reconcileDeleteExternal(ctx, m, &m.Spec.InfrastructureRef) +func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine) (bool, error) { + obj, err := r.reconcileDeleteExternal(ctx, cluster, m, &m.Spec.InfrastructureRef) if err != nil { return false, err } @@ -764,7 +764,7 @@ func (r *Reconciler) reconcileDeleteInfrastructure(ctx context.Context, m *clust } // reconcileDeleteExternal tries to delete external references. 
-func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) { +func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (*unstructured.Unstructured, error) { if ref == nil { return nil, nil } @@ -777,6 +777,14 @@ func (r *Reconciler) reconcileDeleteExternal(ctx context.Context, m *clusterv1.M } if obj != nil { + // ensureExternalOwnershipAndWatch ensures that we set the object's OwnerReferences correctly and watch the object. + // The machine delete logic depends on reconciling the machine when the external objects are deleted. + // This avoids a race condition where the machine is deleted before the external objects are ever reconciled + // by this controller. + if _, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref); err != nil { + return nil, err + } + // Issue a delete request. if err := r.Client.Delete(ctx, obj); err != nil && !apierrors.IsNotFound(err) { return obj, errors.Wrapf(err, diff --git a/internal/controllers/machine/machine_controller_phases.go b/internal/controllers/machine/machine_controller_phases.go index 5e34f8904444..8c197b4c1fe6 100644 --- a/internal/controllers/machine/machine_controller_phases.go +++ b/internal/controllers/machine/machine_controller_phases.go @@ -43,9 +43,7 @@ import ( "sigs.k8s.io/cluster-api/util/patch" ) -var ( - externalReadyWait = 30 * time.Second -) +var externalReadyWait = 30 * time.Second func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) { originalPhase := m.Status.Phase @@ -89,12 +87,44 @@ func (r *Reconciler) reconcilePhase(_ context.Context, m *clusterv1.Machine) { // reconcileExternal handles generic unstructured objects referenced by a Machine. 
func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) { - log := ctrl.LoggerFrom(ctx) - if err := utilconversion.UpdateReferenceAPIContract(ctx, r.Client, ref); err != nil { return external.ReconcileOutput{}, err } + result, err := r.ensureExternalOwnershipAndWatch(ctx, cluster, m, ref) + if err != nil { + return external.ReconcileOutput{}, err + } + if result.RequeueAfter > 0 || result.Paused { + return result, nil + } + + obj := result.Result + + // Set failure reason and message, if any. + failureReason, failureMessage, err := external.FailuresFrom(obj) + if err != nil { + return external.ReconcileOutput{}, err + } + if failureReason != "" { + machineStatusError := capierrors.MachineStatusError(failureReason) + m.Status.FailureReason = &machineStatusError + } + if failureMessage != "" { + m.Status.FailureMessage = pointer.String( + fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s", + obj.GroupVersionKind(), obj.GetName(), failureMessage), + ) + } + + return external.ReconcileOutput{Result: obj}, nil +} + +// ensureExternalOwnershipAndWatch ensures that only the Machine owns the external object, +// adds a watch to the external object if one does not already exist and adds the necessary labels. +func (r *Reconciler) ensureExternalOwnershipAndWatch(ctx context.Context, cluster *clusterv1.Cluster, m *clusterv1.Machine, ref *corev1.ObjectReference) (external.ReconcileOutput, error) { + log := ctrl.LoggerFrom(ctx) + obj, err := external.Get(ctx, r.UnstructuredCachingClient, ref, m.Namespace) if err != nil { if apierrors.IsNotFound(errors.Cause(err)) { @@ -146,22 +176,6 @@ func (r *Reconciler) reconcileExternal(ctx context.Context, cluster *clusterv1.C return external.ReconcileOutput{}, err } - // Set failure reason and message, if any. 
- failureReason, failureMessage, err := external.FailuresFrom(obj) - if err != nil { - return external.ReconcileOutput{}, err - } - if failureReason != "" { - machineStatusError := capierrors.MachineStatusError(failureReason) - m.Status.FailureReason = &machineStatusError - } - if failureMessage != "" { - m.Status.FailureMessage = pointer.String( - fmt.Sprintf("Failure detected from referenced resource %v with name %q: %s", - obj.GroupVersionKind(), obj.GetName(), failureMessage), - ) - } - return external.ReconcileOutput{Result: obj}, nil } diff --git a/internal/controllers/machine/machine_controller_test.go b/internal/controllers/machine/machine_controller_test.go index 62ecfcba8245..01a128cdc8f6 100644 --- a/internal/controllers/machine/machine_controller_test.go +++ b/internal/controllers/machine/machine_controller_test.go @@ -35,6 +35,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -150,7 +151,8 @@ func TestWatches(t *testing.T) { Kind: "GenericBootstrapConfig", Name: "bootstrap-config-machinereconcile", }, - }}, + }, + }, } g.Expect(env.Create(ctx, machine)).To(Succeed()) @@ -183,6 +185,185 @@ func TestWatches(t *testing.T) { }, timeout).Should(BeTrue()) } +func TestWatchesDelete(t *testing.T) { + g := NewWithT(t) + ns, err := env.CreateNamespace(ctx, "test-machine-watches-delete") + g.Expect(err).ToNot(HaveOccurred()) + + infraMachine := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "kind": "GenericInfrastructureMachine", + "apiVersion": "infrastructure.cluster.x-k8s.io/v1beta1", + "metadata": map[string]interface{}{ + "name": "infra-config1", + "namespace": ns.Name, + }, + "spec": map[string]interface{}{ + "providerID": "test://id-1", + }, + "status": map[string]interface{}{ + 
"ready": true, + "addresses": []interface{}{ + map[string]interface{}{ + "type": "InternalIP", + "address": "10.0.0.1", + }, + }, + }, + }, + } + infraMachineFinalizer := "test.infrastructure.cluster.x-k8s.io" + controllerutil.AddFinalizer(infraMachine, infraMachineFinalizer) + + defaultBootstrap := &unstructured.Unstructured{ + Object: map[string]interface{}{ + "kind": "GenericBootstrapConfig", + "apiVersion": "bootstrap.cluster.x-k8s.io/v1beta1", + "metadata": map[string]interface{}{ + "name": "bootstrap-config-machinereconcile", + "namespace": ns.Name, + }, + "spec": map[string]interface{}{}, + "status": map[string]interface{}{}, + }, + } + bootstrapFinalizer := "test.bootstrap.cluster.x-k8s.io" + controllerutil.AddFinalizer(defaultBootstrap, bootstrapFinalizer) + + testCluster := &clusterv1.Cluster{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "machine-reconcile-", + Namespace: ns.Name, + }, + Spec: clusterv1.ClusterSpec{ + // we create the cluster in paused state so we don't reconcile + // the machine immediately after creation. + // This avoids going through reconcileExternal, which adds watches + // for the provider machine and the bootstrap config objects. 
+ Paused: true, + }, + } + + g.Expect(env.Create(ctx, testCluster)).To(Succeed()) + g.Expect(env.CreateKubeconfigSecret(ctx, testCluster)).To(Succeed()) + g.Expect(env.Create(ctx, defaultBootstrap)).To(Succeed()) + g.Expect(env.Create(ctx, infraMachine)).To(Succeed()) + + defer func(do ...client.Object) { + g.Expect(env.Cleanup(ctx, do...)).To(Succeed()) + }(ns, testCluster, defaultBootstrap) + + // Patch infra machine ready + patchHelper, err := patch.NewHelper(infraMachine, env) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(unstructured.SetNestedField(infraMachine.Object, true, "status", "ready")).To(Succeed()) + g.Expect(patchHelper.Patch(ctx, infraMachine, patch.WithStatusObservedGeneration{})).To(Succeed()) + + // Patch bootstrap ready + patchHelper, err = patch.NewHelper(defaultBootstrap, env) + g.Expect(err).ShouldNot(HaveOccurred()) + g.Expect(unstructured.SetNestedField(defaultBootstrap.Object, true, "status", "ready")).To(Succeed()) + g.Expect(unstructured.SetNestedField(defaultBootstrap.Object, "secretData", "status", "dataSecretName")).To(Succeed()) + g.Expect(patchHelper.Patch(ctx, defaultBootstrap, patch.WithStatusObservedGeneration{})).To(Succeed()) + + machine := &clusterv1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + GenerateName: "machine-created-", + Namespace: ns.Name, + Labels: map[string]string{ + clusterv1.MachineControlPlaneLabel: "", + }, + }, + Spec: clusterv1.MachineSpec{ + ClusterName: testCluster.Name, + InfrastructureRef: corev1.ObjectReference{ + APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1", + Kind: "GenericInfrastructureMachine", + Name: "infra-config1", + }, + Bootstrap: clusterv1.Bootstrap{ + ConfigRef: &corev1.ObjectReference{ + APIVersion: "bootstrap.cluster.x-k8s.io/v1beta1", + Kind: "GenericBootstrapConfig", + Name: "bootstrap-config-machinereconcile", + }, + }, + }, + } + // We create the machine with a finalizer so the machine is not deleted immediately. 
+ controllerutil.AddFinalizer(machine, clusterv1.MachineFinalizer) + + g.Expect(env.Create(ctx, machine)).To(Succeed()) + defer func() { + g.Expect(env.Cleanup(ctx, machine)).To(Succeed()) + }() + + // We mark the machine for deletion + g.Expect(env.Delete(ctx, machine)).To(Succeed()) + + // We unpause the cluster so the machine can be reconciled. + testCluster.Spec.Paused = false + g.Expect(env.Update(ctx, testCluster)).To(Succeed()) + + // Wait for reconciliation to happen. + // The first reconciliation should add the cluster name label. + key := client.ObjectKey{Name: machine.Name, Namespace: machine.Namespace} + g.Eventually(func() bool { + if err := env.Get(ctx, key, machine); err != nil { + return false + } + return machine.Labels[clusterv1.ClusterNameLabel] == testCluster.Name + }, timeout).Should(BeTrue()) + + // Deleting the machine should mark the infra machine for deletion + infraMachineKey := client.ObjectKey{Name: infraMachine.GetName(), Namespace: infraMachine.GetNamespace()} + g.Eventually(func() bool { + if err := env.Get(ctx, infraMachineKey, infraMachine); err != nil { + return false + } + return infraMachine.GetDeletionTimestamp() != nil + }, timeout).Should(BeTrue(), "infra machine should be marked for deletion") + + // We wait a bit and remove the finalizer, simulating the infra machine controller. 
+ time.Sleep(2 * time.Second) + infraMachine.SetFinalizers([]string{}) + g.Expect(env.Update(ctx, infraMachine)).To(Succeed()) + + // This should delete the infra machine + g.Eventually(func() bool { + err := env.Get(ctx, infraMachineKey, infraMachine) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue(), "infra machine should be deleted") + + // If the watch on infra machine works, deleting of the infra machine will trigger another + // reconcile, which will mark the bootstrap config for deletion + bootstrapKey := client.ObjectKey{Name: defaultBootstrap.GetName(), Namespace: defaultBootstrap.GetNamespace()} + g.Eventually(func() bool { + if err := env.Get(ctx, bootstrapKey, defaultBootstrap); err != nil { + return false + } + return defaultBootstrap.GetDeletionTimestamp() != nil + }, timeout).Should(BeTrue(), "bootstrap config should be marked for deletion") + + // We wait a bit and remove the finalizer, simulating the bootstrap config controller. + time.Sleep(2 * time.Second) + defaultBootstrap.SetFinalizers([]string{}) + g.Expect(env.Update(ctx, defaultBootstrap)).To(Succeed()) + + // This should delete the bootstrap config. 
+ g.Eventually(func() bool { + err := env.Get(ctx, bootstrapKey, defaultBootstrap) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue(), "bootstrap config should be deleted") + + // If the watch on bootstrap config works, the deleting of the bootstrap config will trigger another + // reconcile, which will remove the finalizer and delete the machine + g.Eventually(func() bool { + err := env.Get(ctx, key, machine) + return apierrors.IsNotFound(err) + }, timeout).Should(BeTrue(), "machine should be deleted") +} + func TestMachine_Reconcile(t *testing.T) { g := NewWithT(t) @@ -250,7 +431,8 @@ func TestMachine_Reconcile(t *testing.T) { Kind: "GenericBootstrapConfig", Name: "bootstrap-config-machinereconcile", }, - }}, + }, + }, Status: clusterv1.MachineStatus{ NodeRef: &corev1.ObjectReference{ Name: "test", @@ -1076,13 +1258,13 @@ func TestReconcileDeleteExternal(t *testing.T) { UnstructuredCachingClient: c, } - obj, err := r.reconcileDeleteExternal(ctx, machine, machine.Spec.Bootstrap.ConfigRef) - g.Expect(obj).To(BeComparableTo(tc.expected)) + obj, err := r.reconcileDeleteExternal(ctx, testCluster, machine, machine.Spec.Bootstrap.ConfigRef) if tc.expectError { g.Expect(err).To(HaveOccurred()) } else { g.Expect(err).ToNot(HaveOccurred()) } + g.Expect(obj).To(BeComparableTo(tc.expected)) }) } } @@ -1889,7 +2071,8 @@ func TestNodeToMachine(t *testing.T) { Kind: "GenericBootstrapConfig", Name: "bootstrap-config-machinereconcile", }, - }}, + }, + }, } g.Expect(env.Create(ctx, expectedMachine)).To(Succeed()) @@ -1928,7 +2111,8 @@ func TestNodeToMachine(t *testing.T) { Kind: "GenericBootstrapConfig", Name: "bootstrap-config-machinereconcile", }, - }}, + }, + }, } g.Expect(env.Create(ctx, randomMachine)).To(Succeed())