Skip to content

Commit

Permalink
Do not update KCP.Status and MS.Status when unable to get workload cl…
Browse files Browse the repository at this point in the history
…uster
  • Loading branch information
jessehu committed Mar 28, 2024
1 parent f077af8 commit d32de47
Show file tree
Hide file tree
Showing 4 changed files with 91 additions and 6 deletions.
8 changes: 8 additions & 0 deletions controlplane/kubeadm/internal/controllers/fakes_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,14 @@ func (f *fakeManagementCluster) GetMachinePoolsForCluster(c context.Context, clu
return f.MachinePools, nil
}

type fakeManagementClusterWithError struct {
fakeManagementCluster
}

func (f *fakeManagementClusterWithError) GetWorkloadCluster(_ context.Context, _ client.ObjectKey) (internal.WorkloadCluster, error) {
return nil, errors.New("failed to get workload cluster")
}

type fakeWorkloadCluster struct {
*internal.Workload
Status internal.ClusterStatus
Expand Down
6 changes: 3 additions & 3 deletions controlplane/kubeadm/internal/controllers/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@ func (r *KubeadmControlPlaneReconciler) updateStatus(ctx context.Context, contro
replicas := int32(len(controlPlane.Machines))
desiredReplicas := *controlPlane.KCP.Spec.Replicas

// set basic data that does not require interacting with the workload cluster
// Set basic data that does not require interacting with the workload cluster.
controlPlane.KCP.Status.Replicas = replicas
controlPlane.KCP.Status.ReadyReplicas = 0
controlPlane.KCP.Status.UnavailableReplicas = replicas
// Set UnavailableReplicas to `replicas` when KCP is newly created, otherwise keep it unchanged. So as to avoid updating it when unable to get the workload cluster.
controlPlane.KCP.Status.UnavailableReplicas = replicas - controlPlane.KCP.Status.ReadyReplicas

// Return early if the deletion timestamp is set, because we don't want to try to connect to the workload cluster
// and we don't want to report resize condition (because it is set to deleting into reconcile delete).
Expand Down
73 changes: 73 additions & 0 deletions controlplane/kubeadm/internal/controllers/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -332,6 +332,79 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing
g.Expect(kcp.Status.Ready).To(BeTrue())
}

func TestKubeadmControlPlaneReconciler_updateStatusCannotGetWorkloadClusterStatus(t *testing.T) {
g := NewWithT(t)

cluster := &clusterv1.Cluster{
ObjectMeta: metav1.ObjectMeta{
Name: "foo",
Namespace: metav1.NamespaceDefault,
},
}

kcp := &controlplanev1.KubeadmControlPlane{
TypeMeta: metav1.TypeMeta{
Kind: "KubeadmControlPlane",
APIVersion: controlplanev1.GroupVersion.String(),
},
ObjectMeta: metav1.ObjectMeta{
Namespace: cluster.Namespace,
Name: "foo",
},
Spec: controlplanev1.KubeadmControlPlaneSpec{
Version: "v1.16.6",
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{
InfrastructureRef: corev1.ObjectReference{
APIVersion: "test/v1alpha1",
Kind: "UnknownInfraMachine",
Name: "foo",
},
},
},
Status: controlplanev1.KubeadmControlPlaneStatus{
Ready: true,
Replicas: 3,
ReadyReplicas: 3,
UpdatedReplicas: 3,
UnavailableReplicas: 0,
},
}
kcp.Default()
_, err := kcp.ValidateCreate()
g.Expect(err).ToNot(HaveOccurred())

machines := map[string]*clusterv1.Machine{}
objs := []client.Object{cluster.DeepCopy(), kcp.DeepCopy()}
for i := 0; i < 3; i++ {
name := fmt.Sprintf("test-%d", i)
m, n := createMachineNodePair(name, cluster, kcp, true)
objs = append(objs, n, m)
machines[m.Name] = m
}

fakeClient := newFakeClient(objs...)

r := &KubeadmControlPlaneReconciler{
Client: fakeClient,
managementCluster: &fakeManagementClusterWithError{},
recorder: record.NewFakeRecorder(32),
}

controlPlane := &internal.ControlPlane{
KCP: kcp,
Cluster: cluster,
Machines: machines,
}
controlPlane.InjectTestManagementCluster(r.managementCluster)

// When updateStatus() returns a non-nil error which is unable to get workload cluster, the original kcp.Status should not be updated.
g.Expect(r.updateStatus(ctx, controlPlane)).To(HaveOccurred())
g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(3))
g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(3))
g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0))
g.Expect(kcp.Status.Ready).To(BeTrue())
}

func TestKubeadmControlPlaneReconciler_machinesCreatedIsIsTrueEvenWhenTheNodesAreNotReady(t *testing.T) {
g := NewWithT(t)

Expand Down
10 changes: 7 additions & 3 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,10 @@ func (r *Reconciler) reconcile(ctx context.Context, cluster *clusterv1.Cluster,

// Always updates status as machines come up or die.
if err := r.updateStatus(ctx, cluster, machineSet, filteredMachines); err != nil {
if errors.Is(err, remote.ErrClusterLocked) {
log.V(5).Info("Requeuing because another worker has the lock on the ClusterCacheTracker")
return ctrl.Result{RequeueAfter: time.Minute}, nil
}
return ctrl.Result{}, errors.Wrapf(kerrors.NewAggregate([]error{err, syncErr}), "failed to update MachineSet's Status")
}

Expand Down Expand Up @@ -842,7 +846,8 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool {
}

// updateStatus updates the Status field for the MachineSet
// It checks for the current state of the replicas and updates the Status of the MachineSet.
// It checks for the current state of the replicas and updates the Status field of the MachineSet.
// When unable to retrieve the Node status, it returns error and won't update the Status field of the MachineSet.
func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error {
log := ctrl.LoggerFrom(ctx)
newStatus := ms.Status.DeepCopy()
Expand Down Expand Up @@ -880,8 +885,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste

node, err := r.getMachineNode(ctx, cluster, machine)
if err != nil && machine.GetDeletionTimestamp().IsZero() {
log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node))
continue
return errors.Wrapf(err, "Unable to retrieve the status of Node %s", klog.KObj(node))
}

if noderefutil.IsNodeReady(node) {
Expand Down

0 comments on commit d32de47

Please sign in to comment.