Skip to content

Commit

Permalink
Refine v1beta2 ScalingUp conditions
Browse files Browse the repository at this point in the history
  • Loading branch information
fabriziopandini committed Nov 18, 2024
1 parent b12ad96 commit 7aa7b8d
Show file tree
Hide file tree
Showing 8 changed files with 169 additions and 123 deletions.
27 changes: 15 additions & 12 deletions controlplane/kubeadm/internal/controllers/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ func setScalingUpCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
if currentReplicas >= desiredReplicas {
var message string
if missingReferencesMessage != "" {
message = fmt.Sprintf("Scaling up would be blocked %s", missingReferencesMessage)
message = fmt.Sprintf("Scaling up would be blocked because %s", missingReferencesMessage)
}
v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Condition,
Expand All @@ -241,29 +241,32 @@ func setScalingUpCondition(_ context.Context, kcp *controlplanev1.KubeadmControl
return
}

message := fmt.Sprintf("Scaling up from %d to %d replicas", currentReplicas, desiredReplicas)
if missingReferencesMessage != "" {
message = fmt.Sprintf("%s is blocked %s", message, missingReferencesMessage)
}
messages := []string{message}

additionalMessages := []string{}
if preflightChecks.HasDeletingMachine {
messages = append(messages, "waiting for Machine being deleted")
additionalMessages = append(additionalMessages, "* waiting for a control plane Machine to complete deletion")
}

if preflightChecks.ControlPlaneComponentsNotHealthy {
messages = append(messages, "waiting for control plane components to be healthy")
additionalMessages = append(additionalMessages, "* waiting for control plane components to be healthy")
}

if preflightChecks.EtcdClusterNotHealthy {
messages = append(messages, "waiting for etcd cluster to be healthy")
additionalMessages = append(additionalMessages, "* waiting for etcd cluster to be healthy")
}
if missingReferencesMessage != "" {
additionalMessages = append(additionalMessages, fmt.Sprintf("* %s", missingReferencesMessage))
}

message := fmt.Sprintf("Scaling up from %d to %d replicas", currentReplicas, desiredReplicas)
if len(additionalMessages) > 0 {
message = fmt.Sprintf("%s is blocked because:\n%s", message, strings.Join(additionalMessages, "\n"))
}

v1beta2conditions.Set(kcp, metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Reason,
Message: strings.Join(messages, "; "),
Message: message,
})
}

Expand Down Expand Up @@ -399,7 +402,7 @@ func setMachinesUpToDateCondition(ctx context.Context, kcp *controlplanev1.Kubea

func calculateMissingReferencesMessage(kcp *controlplanev1.KubeadmControlPlane, infraMachineTemplateNotFound bool) string {
if infraMachineTemplateNotFound {
return fmt.Sprintf("because %s does not exist", kcp.Spec.MachineTemplate.InfrastructureRef.Kind)
return fmt.Sprintf("%s does not exist", kcp.Spec.MachineTemplate.InfrastructureRef.Kind)
}
return ""
}
Expand Down
20 changes: 12 additions & 8 deletions controlplane/kubeadm/internal/controllers/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -233,10 +233,11 @@ func Test_setScalingUpCondition(t *testing.T) {
InfraMachineTemplateIsNotFound: true,
},
expectCondition: metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Reason,
Message: "Scaling up from 3 to 5 replicas is blocked because AWSTemplate does not exist",
Type: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Reason,
Message: "Scaling up from 3 to 5 replicas is blocked because:\n" +
"* AWSTemplate does not exist",
},
},
{
Expand All @@ -258,10 +259,13 @@ func Test_setScalingUpCondition(t *testing.T) {
},
},
expectCondition: metav1.Condition{
Type: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Reason,
Message: "Scaling up from 3 to 5 replicas; waiting for Machine being deleted; waiting for control plane components to be healthy; waiting for etcd cluster to be healthy",
Type: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: controlplanev1.KubeadmControlPlaneScalingUpV1Beta2Reason,
Message: "Scaling up from 3 to 5 replicas is blocked because:\n" +
"* waiting for a control plane Machine to complete deletion\n" +
"* waiting for control plane components to be healthy\n" +
"* waiting for etcd cluster to be healthy",
},
},
}
Expand Down
52 changes: 32 additions & 20 deletions internal/controllers/machineset/machineset_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -272,7 +272,7 @@ type scope struct {
infrastructureObjectNotFound bool
getAndAdoptMachinesForMachineSetSucceeded bool
owningMachineDeployment *clusterv1.MachineDeployment
scaleUpPreflightCheckErrMessage string
scaleUpPreflightCheckErrMessages []string
reconciliationTime time.Time
}

Expand Down Expand Up @@ -673,15 +673,19 @@ func (r *Reconciler) syncReplicas(ctx context.Context, s *scope) (ctrl.Result, e
}
}

result, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up")
if err != nil || !result.IsZero() {
preflightCheckErrMessages, err := r.runPreflightChecks(ctx, cluster, ms, "Scale up")
if err != nil || len(preflightCheckErrMessages) > 0 {
if err != nil {
// If the error is not nil use that as the message for the condition.
preflightCheckErrMessage = err.Error()
// If err is not nil use that as the preflightCheckErrMessage
preflightCheckErrMessages = append(preflightCheckErrMessages, err.Error())
}
s.scaleUpPreflightCheckErrMessage = preflightCheckErrMessage
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, preflightCheckErrMessage)
return result, err

s.scaleUpPreflightCheckErrMessages = preflightCheckErrMessages
conditions.MarkFalse(ms, clusterv1.MachinesCreatedCondition, clusterv1.PreflightCheckFailedReason, clusterv1.ConditionSeverityError, strings.Join(preflightCheckErrMessages, "; "))
if err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil
}

var (
Expand Down Expand Up @@ -1418,31 +1422,39 @@ func (r *Reconciler) reconcileUnhealthyMachines(ctx context.Context, s *scope) (
}

// Run preflight checks.
preflightChecksResult, preflightCheckErrMessage, err := r.runPreflightChecks(ctx, cluster, ms, "Machine remediation")
if err != nil {
// If err is not nil use that as the preflightCheckErrMessage
preflightCheckErrMessage = err.Error()
}
preflightCheckErrMessages, err := r.runPreflightChecks(ctx, cluster, ms, "Machine remediation")
if err != nil || len(preflightCheckErrMessages) > 0 {
if err != nil {
// If err is not nil use that as the preflightCheckErrMessage
preflightCheckErrMessages = append(preflightCheckErrMessages, err.Error())
}

listMessages := make([]string, len(preflightCheckErrMessages))
for i, msg := range preflightCheckErrMessages {
listMessages[i] = fmt.Sprintf("* %s", msg)
}

preflightChecksFailed := err != nil || !preflightChecksResult.IsZero()
if preflightChecksFailed {
// PreflightChecks did not pass. Update the MachineOwnerRemediated condition on the unhealthy Machines with
// WaitingForRemediationReason reason.
if err := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
if patchErr := patchMachineConditions(ctx, r.Client, machinesToRemediate, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
Message: preflightCheckErrMessage,
Message: strings.Join(listMessages, "\n"),
}, &clusterv1.Condition{
Type: clusterv1.MachineOwnerRemediatedCondition,
Status: corev1.ConditionFalse,
Reason: clusterv1.WaitingForRemediationReason,
Severity: clusterv1.ConditionSeverityWarning,
Message: preflightCheckErrMessage,
}); err != nil {
Message: strings.Join(preflightCheckErrMessages, "; "),
}); patchErr != nil {
return ctrl.Result{}, kerrors.NewAggregate([]error{patchErr, err})
}

if err != nil {
return ctrl.Result{}, err
}
return preflightChecksResult, nil
return ctrl.Result{RequeueAfter: preflightFailedRequeueAfter}, nil
}

// PreflightChecks passed, so it is safe to remediate unhealthy machines by deleting them.
Expand Down
19 changes: 11 additions & 8 deletions internal/controllers/machineset/machineset_controller_status.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ package machineset
import (
"context"
"fmt"
"slices"
"sort"
"strings"
"time"
Expand Down Expand Up @@ -48,7 +47,7 @@ func (r *Reconciler) updateStatus(ctx context.Context, s *scope) {
// Conditions

// Update the ScalingUp and ScalingDown condition.
setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded, s.scaleUpPreflightCheckErrMessage)
setScalingUpCondition(ctx, s.machineSet, s.machines, s.bootstrapObjectNotFound, s.infrastructureObjectNotFound, s.getAndAdoptMachinesForMachineSetSucceeded, s.scaleUpPreflightCheckErrMessages)
setScalingDownCondition(ctx, s.machineSet, s.machines, s.getAndAdoptMachinesForMachineSetSucceeded)

// MachinesReady condition: aggregate the Machine's Ready condition.
Expand Down Expand Up @@ -93,7 +92,7 @@ func setReplicas(_ context.Context, ms *clusterv1.MachineSet, machines []*cluste
ms.Status.V1Beta2.UpToDateReplicas = ptr.To(upToDateReplicas)
}

func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool, scaleUpPreflightCheckErrMessage string) {
func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines []*clusterv1.Machine, bootstrapObjectNotFound, infrastructureObjectNotFound, getAndAdoptMachinesForMachineSetSucceeded bool, scaleUpPreflightCheckErrMessages []string) {
// If we got unexpected errors in listing the machines (this should never happen), surface them.
if !getAndAdoptMachinesForMachineSetSucceeded {
v1beta2conditions.Set(ms, metav1.Condition{
Expand Down Expand Up @@ -140,11 +139,15 @@ func setScalingUpCondition(_ context.Context, ms *clusterv1.MachineSet, machines

// Scaling up.
message := fmt.Sprintf("Scaling up from %d to %d replicas", currentReplicas, desiredReplicas)
if missingReferencesMessage != "" || scaleUpPreflightCheckErrMessage != "" {
blockMessages := slices.DeleteFunc([]string{missingReferencesMessage, scaleUpPreflightCheckErrMessage}, func(s string) bool {
return s == ""
})
message += fmt.Sprintf(" is blocked because %s", strings.Join(blockMessages, " and "))
if missingReferencesMessage != "" || len(scaleUpPreflightCheckErrMessages) > 0 {
listMessages := make([]string, len(scaleUpPreflightCheckErrMessages))
for i, msg := range scaleUpPreflightCheckErrMessages {
listMessages[i] = fmt.Sprintf("* %s", msg)
}
if missingReferencesMessage != "" {
listMessages = append(listMessages, fmt.Sprintf("* %s", missingReferencesMessage))
}
message += fmt.Sprintf(" is blocked because:\n%s", strings.Join(listMessages, "\n"))
}
v1beta2conditions.Set(ms, metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,7 @@ func Test_setScalingUpCondition(t *testing.T) {
bootstrapObjectNotFound bool
infrastructureObjectNotFound bool
getAndAdoptMachinesForMachineSetSucceeded bool
scaleUpPreflightCheckErrMessage string
scaleUpPreflightCheckErrMessages []string
expectCondition metav1.Condition
}{
{
Expand Down Expand Up @@ -281,10 +281,11 @@ func Test_setScalingUpCondition(t *testing.T) {
infrastructureObjectNotFound: false,
getAndAdoptMachinesForMachineSetSucceeded: true,
expectCondition: metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetScalingUpV1Beta2Reason,
Message: "Scaling up from 0 to 3 replicas is blocked because KubeadmBootstrapTemplate does not exist",
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetScalingUpV1Beta2Reason,
Message: "Scaling up from 0 to 3 replicas is blocked because:\n" +
"* KubeadmBootstrapTemplate does not exist",
},
},
{
Expand All @@ -294,10 +295,11 @@ func Test_setScalingUpCondition(t *testing.T) {
infrastructureObjectNotFound: true,
getAndAdoptMachinesForMachineSetSucceeded: true,
expectCondition: metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetScalingUpV1Beta2Reason,
Message: "Scaling up from 0 to 3 replicas is blocked because DockerMachineTemplate does not exist",
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetScalingUpV1Beta2Reason,
Message: "Scaling up from 0 to 3 replicas is blocked because:\n" +
"* DockerMachineTemplate does not exist",
},
},
{
Expand All @@ -308,17 +310,14 @@ func Test_setScalingUpCondition(t *testing.T) {
getAndAdoptMachinesForMachineSetSucceeded: true,
// This preflight check error can happen when a MachineSet is scaling up while the control plane
// already has a newer Kubernetes version.
scaleUpPreflightCheckErrMessage: "MachineSet version (1.25.5) and ControlPlane version (1.26.2) " +
"do not conform to kubeadm version skew policy as kubeadm only supports joining with the same " +
"major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)",
scaleUpPreflightCheckErrMessages: []string{"MachineSet version (1.25.5) and ControlPlane version (1.26.2) do not conform to kubeadm version skew policy as kubeadm only supports joining with the same major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)"},
expectCondition: metav1.Condition{
Type: clusterv1.MachineSetScalingUpV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineSetScalingUpV1Beta2Reason,
Message: "Scaling up from 0 to 3 replicas is blocked because KubeadmBootstrapTemplate and DockerMachineTemplate " +
"do not exist and MachineSet version (1.25.5) and ControlPlane version (1.26.2) " +
"do not conform to kubeadm version skew policy as kubeadm only supports joining with the same " +
"major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)",
Message: "Scaling up from 0 to 3 replicas is blocked because:\n" +
"* MachineSet version (1.25.5) and ControlPlane version (1.26.2) do not conform to kubeadm version skew policy as kubeadm only supports joining with the same major+minor version as the control plane (\"KubeadmVersionSkew\" preflight check failed)\n" +
"* KubeadmBootstrapTemplate and DockerMachineTemplate do not exist",
},
},
{
Expand All @@ -339,7 +338,7 @@ func Test_setScalingUpCondition(t *testing.T) {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded, tt.scaleUpPreflightCheckErrMessage)
setScalingUpCondition(ctx, tt.ms, tt.machines, tt.bootstrapObjectNotFound, tt.infrastructureObjectNotFound, tt.getAndAdoptMachinesForMachineSetSucceeded, tt.scaleUpPreflightCheckErrMessages)

condition := v1beta2conditions.Get(tt.ms, clusterv1.MachineSetScalingUpV1Beta2Condition)
g.Expect(condition).ToNot(BeNil())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1731,7 +1731,7 @@ func TestMachineSetReconciler_reconcileUnhealthyMachines(t *testing.T) {
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineSetMachineRemediationDeferredV1Beta2Reason,
Message: "GenericControlPlane default/cp1 is upgrading (\"ControlPlaneIsStable\" preflight check failed)",
Message: "* GenericControlPlane default/cp1 is upgrading (\"ControlPlaneIsStable\" preflight check failed)",
}, v1beta2conditions.IgnoreLastTransitionTime(true)))

// Verify the healthy machine is not deleted and does not have the OwnerRemediated condition.
Expand Down
Loading

0 comments on commit 7aa7b8d

Please sign in to comment.