Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Progressing condition on machineDeployment for nil progressDeadlineSeconds #762

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions pkg/apis/machine/v1alpha1/machinedeployment_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ type MachineDeploymentSpec struct {
// process failed MachineDeployments and a condition with a ProgressDeadlineExceeded
// reason will be surfaced in the MachineDeployment status. Note that progress will
// not be estimated during the time a MachineDeployment is paused. This is not set
// by default.
// by default, which is treated as infinite deadline.
// +optional
ProgressDeadlineSeconds *int32 `json:"progressDeadlineSeconds,omitempty"`
}
Expand Down Expand Up @@ -219,8 +219,8 @@ const (

// Progressing means the MachineDeployment is progressing. Progress for a MachineDeployment is
// considered when a new machine set is created or adopted, and when new machines scale
// up or old machines scale down. Progress is not estimated for paused MachineDeployments or
// when progressDeadlineSeconds is not specified.
// up or old machines scale down. Progress is not estimated for paused MachineDeployments. It is also updated
// if progressDeadlineSeconds is not specified (treated as an infinite deadline), in which case it will never be updated to "false".
MachineDeploymentProgressing MachineDeploymentConditionType = "Progressing"

// ReplicaFailure is added in a MachineDeployment when one of its machines fails to be created
Expand Down
13 changes: 5 additions & 8 deletions pkg/controller/deployment_progress.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,13 @@ import (
func (dc *controller) syncRolloutStatus(ctx context.Context, allISs []*v1alpha1.MachineSet, newIS *v1alpha1.MachineSet, d *v1alpha1.MachineDeployment) error {
newStatus := calculateDeploymentStatus(allISs, newIS, d)

// If there is no progressDeadlineSeconds set, remove any Progressing condition.
if d.Spec.ProgressDeadlineSeconds == nil {
RemoveMachineDeploymentCondition(&newStatus, v1alpha1.MachineDeploymentProgressing)
}

// If there is only one machine set that is active then that means we are not running
// a new rollout and this is a resync where we don't need to estimate any progress.
// In such a case, we should simply not estimate any progress for this deployment.
currentCond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
isCompleteDeployment := newStatus.Replicas == newStatus.UpdatedReplicas && currentCond != nil && currentCond.Reason == NewISAvailableReason
// Check for progress only if there is a progress deadline set and the latest rollout
// hasn't completed yet.
if d.Spec.ProgressDeadlineSeconds != nil && !isCompleteDeployment {
// Check for progress only if the latest rollout hasn't completed yet.
if !isCompleteDeployment {
switch {
case MachineDeploymentComplete(d, &newStatus):
// Update the deployment conditions with a message for the new machine set that
Expand Down Expand Up @@ -87,6 +81,8 @@ func (dc *controller) syncRolloutStatus(ctx context.Context, allISs []*v1alpha1.
}
SetMachineDeploymentCondition(&newStatus, *condition)

// The case of marking the `Progressing` condition reason as `ProgressDeadlineExceeded` only when a
// non-infinite progressDeadline is set is already handled inside `MachineDeploymentTimedOut()`.
case MachineDeploymentTimedOut(d, &newStatus):
// Update the deployment with a timeout condition. If the condition already exists,
// we ignore this update.
Expand Down Expand Up @@ -161,6 +157,7 @@ func (dc *controller) getReplicaFailures(allISs []*v1alpha1.MachineSet, newIS *v
// requeueStuckDeployment checks whether the provided deployment needs to be synced for a progress
// check. It returns the time after the deployment will be requeued for the progress check, 0 if it
// will be requeued now, or -1 if it does not need to be requeued.
// This method just makes sure that, as soon as the `progressDeadlineSeconds` timeout happens, we update the `Progressing` condition.
func (dc *controller) requeueStuckMachineDeployment(d *v1alpha1.MachineDeployment, newStatus v1alpha1.MachineDeploymentStatus) time.Duration {
currentCond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
// Can't estimate progress if there is no deadline in the spec or progressing condition in the current status.
Expand Down
67 changes: 33 additions & 34 deletions pkg/controller/deployment_sync.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,9 +79,6 @@ func (dc *controller) sync(ctx context.Context, d *v1alpha1.MachineDeployment, i
// These conditions are needed so that we won't accidentally report lack of progress for resumed deployments
// that were paused for longer than progressDeadlineSeconds.
func (dc *controller) checkPausedConditions(ctx context.Context, d *v1alpha1.MachineDeployment) error {
if d.Spec.ProgressDeadlineSeconds == nil {
return nil
}
cond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
if cond != nil && cond.Reason == TimedOutReason {
// If we have reported lack of progress, do not overwrite it with a paused condition.
Expand Down Expand Up @@ -265,11 +262,8 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD

// Should use the revision in existingNewRS's annotation, since it set by before
needsUpdate := SetMachineDeploymentRevision(d, isCopy.Annotations[RevisionAnnotation])
// If no other Progressing condition has been recorded and we need to estimate the progress
// of this deployment then it is likely that old users started caring about progress. In that
// case we need to take into account the first time we noticed their new machine set.
cond := GetMachineDeploymentCondition(d.Status, v1alpha1.MachineDeploymentProgressing)
if d.Spec.ProgressDeadlineSeconds != nil && cond == nil {
if cond == nil {
msg := fmt.Sprintf("Found new machine set %q", isCopy.Name)
condition := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionTrue, FoundNewISReason, msg)
SetMachineDeploymentCondition(&d.Status, *condition)
Expand Down Expand Up @@ -340,7 +334,7 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD
createdIS, err := dc.controlMachineClient.MachineSets(newIS.Namespace).Create(ctx, &newIS, metav1.CreateOptions{})
switch {
// We may end up hitting this due to a slow cache or a fast resync of the Deployment.
// Fetch a copy of the ReplicaSet. If its machineTemplateSpec is semantically deep equal
// Fetch a copy of the ReplicaSet. If its owner is our deployment and its machineTemplateSpec is semantically deep equal
// with the machineTemplateSpec of the Deployment, then that is our new ReplicaSet. Otherwise,
// this is a hash collision and we need to increment the collisionCount field in the
// status of the Deployment and try the creation again.
Expand All @@ -350,37 +344,42 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD
if isErr != nil {
return nil, isErr
}
isEqual := EqualIgnoreHash(&d.Spec.Template, &is.Spec.Template)

// brought these changes from the current latest k/k deployment code (https://github.com/kubernetes/kubernetes/blob/0e19bbb91644885a6db38a77ea3d697730269802/pkg/controller/deployment/sync.go#L240-L243)
controllerRef := metav1.GetControllerOf(is)
if controllerRef != nil && controllerRef.UID == d.UID && EqualIgnoreHash(&d.Spec.Template, &is.Spec.Template) {
// Pass through the matching ReplicaSet as the new ReplicaSet.
createdIS = is
err = nil
break
}

// Matching ReplicaSet is not equal - increment the collisionCount in the DeploymentStatus
// and requeue the Deployment.
if !isEqual {
if d.Status.CollisionCount == nil {
d.Status.CollisionCount = new(int32)
}
preCollisionCount := *d.Status.CollisionCount
*d.Status.CollisionCount++
// Update the collisionCount for the Deployment and let it requeue by returning the original
// error.
_, dErr := dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
if dErr == nil {
klog.V(2).Infof("Found a hash collision for machine deployment %q - bumping collisionCount (%d->%d) to resolve it", d.Name, preCollisionCount, *d.Status.CollisionCount)
}
return nil, err
if d.Status.CollisionCount == nil {
d.Status.CollisionCount = new(int32)
}
preCollisionCount := *d.Status.CollisionCount
*d.Status.CollisionCount++
// Update the collisionCount for the Deployment and let it requeue by returning the original
// error.
_, dErr := dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
if dErr == nil {
klog.V(2).Infof("Found a hash collision for machine deployment %q - bumping collisionCount (%d->%d) to resolve it", d.Name, preCollisionCount, *d.Status.CollisionCount)
}
// Pass through the matching ReplicaSet as the new ReplicaSet.
createdIS = is
err = nil
return nil, err
// brought this case from the current latest k/k deployment code (https://github.com/kubernetes/kubernetes/blob/0e19bbb91644885a6db38a77ea3d697730269802/pkg/controller/deployment/sync.go#L260-L262)
case errors.HasStatusCause(err, v1.NamespaceTerminatingCause):
// if the namespace is terminating, all subsequent creates will fail and we can safely do nothing
return nil, err
case err != nil:
msg := fmt.Sprintf("Failed to create new machine set %q: %v", newIS.Name, err)
if d.Spec.ProgressDeadlineSeconds != nil {
cond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionFalse, FailedISCreateReason, msg)
SetMachineDeploymentCondition(&d.Status, *cond)
// We don't really care about this error at this point, since we have a bigger issue to report.
// TODO: Identify which errors are permanent and switch DeploymentIsFailed to take into account
// these reasons as well. Related issue: https://github.com/kubernetes/kubernetes/issues/18568
_, _ = dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
}
cond := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionFalse, FailedISCreateReason, msg)
SetMachineDeploymentCondition(&d.Status, *cond)
// We don't really care about this error at this point, since we have a bigger issue to report.
// TODO: Identify which errors are permanent and switch DeploymentIsFailed to take into account
// these reasons as well. Related issue: https://github.com/kubernetes/kubernetes/issues/18568
_, _ = dc.controlMachineClient.MachineDeployments(d.Namespace).UpdateStatus(ctx, d, metav1.UpdateOptions{})
dc.recorder.Eventf(d, v1.EventTypeWarning, FailedISCreateReason, msg)
return nil, err
}
Expand All @@ -389,7 +388,7 @@ func (dc *controller) getNewMachineSet(ctx context.Context, d *v1alpha1.MachineD
}

needsUpdate := SetMachineDeploymentRevision(d, newRevision)
if !alreadyExists && d.Spec.ProgressDeadlineSeconds != nil {
if !alreadyExists {
msg := fmt.Sprintf("Created new machine set %q", createdIS.Name)
condition := NewMachineDeploymentCondition(v1alpha1.MachineDeploymentProgressing, v1alpha1.ConditionTrue, NewMachineSetReason, msg)
SetMachineDeploymentCondition(&d.Status, *condition)
Expand Down
2 changes: 2 additions & 0 deletions pkg/controller/machine_safety.go
Original file line number Diff line number Diff line change
Expand Up @@ -358,6 +358,8 @@ func (c *controller) checkAndFreezeORUnfreezeMachineSets(ctx context.Context) er
lowerThreshold := higherThreshold - c.safetyOptions.SafetyDown

machineDeployments := c.getMachineDeploymentsForMachineSet(machineSet)
// if we have a parent machineDeployment, then we use different higherThreshold and lowerThreshold values,
// keeping the rolling update scenario in mind, as we don't want to freeze during a normal rolling update.
if len(machineDeployments) >= 1 {
machineDeployment := machineDeployments[0]
if machineDeployment != nil {
Expand Down