-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
🐛 Do not update KCP and MS status when unable to get workload cluster #10229
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -331,6 +331,80 @@ func TestKubeadmControlPlaneReconciler_updateStatusMachinesReadyMixed(t *testing | |
g.Expect(kcp.Status.Ready).To(BeTrue()) | ||
} | ||
|
||
func TestKubeadmControlPlaneReconciler_updateStatusCannotGetWorkloadClusterStatus(t *testing.T) { | ||
g := NewWithT(t) | ||
|
||
cluster := &clusterv1.Cluster{ | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Name: "foo", | ||
Namespace: metav1.NamespaceDefault, | ||
}, | ||
} | ||
|
||
kcp := &controlplanev1.KubeadmControlPlane{ | ||
TypeMeta: metav1.TypeMeta{ | ||
Kind: "KubeadmControlPlane", | ||
APIVersion: controlplanev1.GroupVersion.String(), | ||
}, | ||
ObjectMeta: metav1.ObjectMeta{ | ||
Namespace: cluster.Namespace, | ||
Name: "foo", | ||
}, | ||
Spec: controlplanev1.KubeadmControlPlaneSpec{ | ||
Version: "v1.16.6", | ||
MachineTemplate: controlplanev1.KubeadmControlPlaneMachineTemplate{ | ||
InfrastructureRef: corev1.ObjectReference{ | ||
APIVersion: "test/v1alpha1", | ||
Kind: "UnknownInfraMachine", | ||
Name: "foo", | ||
}, | ||
}, | ||
}, | ||
Status: controlplanev1.KubeadmControlPlaneStatus{ | ||
Ready: true, | ||
Replicas: 3, | ||
ReadyReplicas: 3, | ||
UpdatedReplicas: 3, | ||
UnavailableReplicas: 0, | ||
}, | ||
} | ||
webhook := &controlplanev1webhooks.KubeadmControlPlane{} | ||
g.Expect(webhook.Default(ctx, kcp)).To(Succeed()) | ||
_, err := webhook.ValidateCreate(ctx, kcp) | ||
g.Expect(err).ToNot(HaveOccurred()) | ||
|
||
machines := map[string]*clusterv1.Machine{} | ||
objs := []client.Object{cluster.DeepCopy(), kcp.DeepCopy()} | ||
for i := 0; i < 3; i++ { | ||
name := fmt.Sprintf("test-%d", i) | ||
m, n := createMachineNodePair(name, cluster, kcp, true) | ||
objs = append(objs, n, m) | ||
machines[m.Name] = m | ||
} | ||
|
||
fakeClient := newFakeClient(objs...) | ||
|
||
r := &KubeadmControlPlaneReconciler{ | ||
Client: fakeClient, | ||
managementCluster: &fakeManagementClusterWithGetWorkloadClusterError{}, | ||
recorder: record.NewFakeRecorder(32), | ||
} | ||
|
||
controlPlane := &internal.ControlPlane{ | ||
KCP: kcp, | ||
Cluster: cluster, | ||
Machines: machines, | ||
} | ||
controlPlane.InjectTestManagementCluster(r.managementCluster) | ||
|
||
// When updateStatus() returns a non-nil error(e.g. unable to get workload cluster), the original kcp.Status should not be updated. | ||
g.Expect(r.updateStatus(ctx, controlPlane)).To(HaveOccurred()) | ||
g.Expect(kcp.Status.Replicas).To(BeEquivalentTo(3)) | ||
g.Expect(kcp.Status.ReadyReplicas).To(BeEquivalentTo(3)) | ||
g.Expect(kcp.Status.UnavailableReplicas).To(BeEquivalentTo(0)) | ||
g.Expect(kcp.Status.Ready).To(BeTrue()) | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's add an additional call to updateStatus here:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sure. Will add the case after the discussion is resolved: #10229 (comment) |
||
|
||
func TestKubeadmControlPlaneReconciler_machinesCreatedIsIsTrueEvenWhenTheNodesAreNotReady(t *testing.T) { | ||
g := NewWithT(t) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -844,7 +844,8 @@ func (r *Reconciler) shouldAdopt(ms *clusterv1.MachineSet) bool { | |
} | ||
|
||
// updateStatus updates the Status field for the MachineSet | ||
// It checks for the current state of the replicas and updates the Status of the MachineSet. | ||
// It checks for the current state of the replicas and updates the Status field of the MachineSet. | ||
// When unable to retrieve the Node status, it returns error and won't update the Status field of the MachineSet. | ||
func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluster, ms *clusterv1.MachineSet, filteredMachines []*clusterv1.Machine) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here a proposed solution to the ErrClusterLocked issue: (that Fabrizio was referring to) // Retry getting a remote client.
// Note: This is to ensure that we don't run into errors here just
// because multiple reconcilers try to create the client at the same time
// (for the case where the workload cluster is reachable).
var canCommunicateWithWorkloadCluster bool
var remoteClient client.Client
err = retry.OnError(wait.Backoff{
Steps: 5,
Duration: 200 * time.Millisecond,
Factor: 1.0,
}, func(err error) bool {
// Retry as long as we get remote.ErrClusterLocked errors.
return errors.Is(err, remote.ErrClusterLocked)
}, func() error {
remoteClient, err = r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
if err != nil {
return err
}
canCommunicateWithWorkloadCluster = true
return nil
})
if err != nil {
log.Error(err, "Unable to retrieve status of Nodes: failed to create remote client")
}
for _, machine := range filteredMachines {
log := log.WithValues("Machine", klog.KObj(machine))
if templateLabel.Matches(labels.Set(machine.Labels)) {
fullyLabeledReplicasCount++
}
if machine.Status.NodeRef == nil {
log.V(4).Info("Waiting for the machine controller to set status.nodeRef on the Machine")
continue
}
if !canCommunicateWithWorkloadCluster {
// Skip the rest of the for loop if we can't communicate with the workload cluster.
continue
}
node, err := r.getMachineNode(ctx, remoteClient, machine)
if err != nil && machine.GetDeletionTimestamp().IsZero() {
log.Error(err, "Unable to retrieve Node status", "Node", klog.KObj(node))
continue
}
if noderefutil.IsNodeReady(node) {
readyReplicasCount++
if noderefutil.IsNodeAvailable(node, ms.Spec.MinReadySeconds, metav1.Now()) {
availableReplicasCount++
}
} else if machine.GetDeletionTimestamp().IsZero() {
log.V(4).Info("Waiting for the Kubernetes node on the machine to report ready state")
}
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The idea is that when the workload cluster is reachable, we only get ErrClusterLocked for a very short amount of time (the time it takes to create a client). For this case it is good enough to simply retry creating the client. We will fallback to the previous behavior only if the workload cluster is actually not reachable. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Cool!I'm not quite sure if it's worthy to add this bulk of code to resolve the temporary inconsistency of some status replicas and conditions fields caused by ErrClusterLocked error. The simple change in current PR can solve this problem while there are inconsistency but acceptable and won't cause issues per my thinking (maybe I missed some cases). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @sbueringer and I are in total agreement, and we are trying to help in re-focusing the PR to the original issue. We also took some additional steps to help in finding a way forward by proposing an alternative solution. With regards to the current PR, I already tried to explain my concern and I will be happy to add more if you have any doubts (and I already answered one). But given the concern above, I'm personally -1 to merge the PR in the current state. Instead, we both think the change proposed by @sbueringer solves the original issues, but ultimately it is up to you to accept it or not There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @fabriziopandini for the details. Here are my two cents:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One more thing: not sure if There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Our main concern is basically that if we return an error here as soon as we hit ErrClusterLocked we don't update the status at all anymore. This should be okay in the happy path, which is that we can actually communicate with the workload cluster and it's just a matter of waiting until another goroutine successfully created the client so we can use it. The problem we see is if the workload cluster is actually not reachable. Because in that case we will just continuously return the error forever. In this case we "freeze" the values of status: FullyLabeledReplicas, ReadyReplicas, AvailableReplicas, ObservedGeneration and a few conditions. The concern Fabrizio raised (and I didn't have on my radar before) is that if we freeze the status indefinitely in these cases (or rather until the workload cluster is reachable) this can be pretty confusing for users. So for this case we should actually have a more extensive solution which also covers signalling to users that we can't communicate with the workload cluster anymore. What we were trying to suggest was a mitigation for the issue for the happy path, where we actually can talk to the workload cluster, but replicas are flipping only because of the way we create the client initially for a cluster.
I'm not really sure about this one. Basically we introduced the current "TryLock" over a "Lock" to make sure we don't block all reconcile for a cluster when a workload cluster is not reachable. The "Lock" we had before lead to really problematic performance issues when some workload clusters (with a lot of Machines) were unreachable. |
||
log := ctrl.LoggerFrom(ctx) | ||
newStatus := ms.Status.DeepCopy() | ||
|
@@ -882,8 +883,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, cluster *clusterv1.Cluste | |
|
||
node, err := r.getMachineNode(ctx, cluster, machine) | ||
if err != nil && machine.GetDeletionTimestamp().IsZero() { | ||
log.Error(err, "Unable to retrieve Node status", "node", klog.KObj(node)) | ||
continue | ||
return errors.Wrapf(err, "unable to retrieve the status of Node %s", klog.KObj(node)) | ||
sbueringer marked this conversation as resolved.
Show resolved
Hide resolved
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the issue
But if I'm not wrong with the current implementation we are going to freeze the replica count indefinetly, which could be confusing or eventually also wrong, given that the number of replicas/machine might change in the meantime. Frankly speaking, in order to properly fix this issue I think that we first need to figure out how to get/store the "last seen" information for each node. Given that info, we can decide if it is ok to flip the replica status or if to wait (but I don't have a concrete idea on how to do so, I need some time to research into it) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important point that I discussed with Fabrizio. We're not only freezing the replica fields, we would also freeze other status fields like ObservedGeneration and conditions. |
||
} | ||
|
||
if noderefutil.IsNodeReady(node) { | ||
|
@@ -956,6 +956,9 @@ func (r *Reconciler) getMachineNode(ctx context.Context, cluster *clusterv1.Clus | |
} | ||
node := &corev1.Node{} | ||
if err := remoteClient.Get(ctx, client.ObjectKey{Name: machine.Status.NodeRef.Name}, node); err != nil { | ||
if apierrors.IsNotFound(err) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure why are we ignoring not found. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just for clarification. The idea behind this one was to signal to the calling function that the node doesn't exist (vs. that it's just an error). Because not finding the Node is also a valuable information (which basically comes down to that the node is not ready) This was added to preserve the previous behavior one level above (where we only logged the error before but still considered a not found node a node that is not ready) But yeah it becomes pretty had to understand |
||
return nil, nil | ||
} | ||
return nil, errors.Wrapf(err, "error retrieving node %s for machine %s/%s", machine.Status.NodeRef.Name, machine.Namespace, machine.Name) | ||
} | ||
return node, nil | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Note: This change avoids setting controlPlane.KCP.Status.ReadyReplicas from none 0 to 0 when GetWorkloadCluster(ctx) hits ErrClusterLocked or other error.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW:https://github.com/kubernetes-sigs/cluster-api/blob/release-1.5/controlplane/kubeadm/internal/controllers/status.go#L93 uses replicas rather than desiredReplicas to calculate UnavailableReplicas. Not sure if it's a bug or on purpose. cc @fabriziopandini please help take a look.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it would be odd to have two different ways to calculate the unavailable replicas within the same status update function.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hi @neolit123, actually it uses the same way to calculate the unavailable replicas. The line 46 is only for covering the case that KCP.Status.ReadyReplicas is 0 since the default value of UnavailableReplicas is 0 when KCP is newly created. If ReadyReplicas is not 0, UnavailableReplicas should equal to
replicas - controlPlane.KCP.Status.ReadyReplicas
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
hi @neolit123 @fabriziopandini could you please help take some time to move this forward? We need this patch to fix the issue hit in our project based on CAPI.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sbueringer Got your point. If doing it this way, we may also need to move
controlPlane.KCP.Status.Replicas = replicas
to before line 96controlPlane.KCP.Status.ReadyReplicas = status.ReadyNodes
, otherwise only Status.Replicas increases or decreases by 1 and other Replicas fields unchanged (e.g. UpdatedReplicas, ReadyReplicas, UnavailableReplicas etc. which seems unreasonable). Even if it's moved, the conditions are updated while no Replicas fields are updated (which is unreasonable but maybe acceptedable since it will be corrected in the coming reconciles soon ?).If updating both Status.Replicas and Status.UnavailableReplicas before getting ClusterStatus like this PR currently behaviors is not expected:
The root cause is line 93
controlPlane.KCP.Status.UnavailableReplicas = replicas - status.ReadyNodes
(and line 47 as well) should becontrolPlane.KCP.Status.UnavailableReplicas = desiredReplicas- status.ReadyNodes
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point. Let's do the following:
I think it makes sense to update the conditions in 2 as they are all based on information from the mgmt cluster.
Overall we're just missing a way to signal that we can't communicate with the wl cluster anymore, but we can follow-up on that
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@sbueringer Got it. I pushed a commit to use
Status.UnavailableReplicas = desiredReplicas- status.ReadyNodes
which should solve the problems we discussed above. Please take a look if it is. If not, I will make the changes as you described, but not sure why we need to move down the update of status updatedReplicas because it doesn't depend on the workload cluste status:controlPlane.KCP.Status.UpdatedReplicas = int32(len(controlPlane.UpToDateMachines()))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The idea would have been to not update any status replica fields if we can't communicate with the workload cluster to not get them into an inconsistent state with each other. I'll think a bit about it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yup, I agree that the idea way would have been to not update any status replica fields and conditions fields if we can't communicate with the workload cluster.