Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🐛 Delete out of date machines with unhealthy control plane component conditions when rolling out KCP #10196

Merged
merged 14 commits into from
Apr 11, 2024
Merged
Show file tree
Hide file tree
Changes from 13 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 16 additions & 8 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,24 +238,32 @@ func (c *ControlPlane) IsEtcdManaged() bool {
return c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration == nil || c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration.Etcd.External == nil
}

// UnhealthyMachines returns the list of control plane machines marked as unhealthy by MHC.
func (c *ControlPlane) UnhealthyMachines() collections.Machines {
// UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents returns all unhealthy control plane machines that
// do not have a Kubernetes node, or have unhealthy control plane components.
//
// It differs from UnhealthyMachinesByHealthCheck which checks `MachineHealthCheck` conditions.
func (c *ControlPlane) UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(machines collections.Machines) collections.Machines {
return machines.Filter(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(c.IsEtcdManaged()))
}

// UnhealthyMachinesByMachineHealthCheck returns the list of control plane machines marked as unhealthy by Machine Health Check.
func (c *ControlPlane) UnhealthyMachinesByMachineHealthCheck() collections.Machines {
return c.Machines.Filter(collections.HasUnhealthyCondition)
}

// HealthyMachines returns the list of control plane machines not marked as unhealthy by MHC.
func (c *ControlPlane) HealthyMachines() collections.Machines {
// HealthyMachinesByMachineHealthCheck returns the list of control plane machines not marked as unhealthy by Machine Health Check.
func (c *ControlPlane) HealthyMachinesByMachineHealthCheck() collections.Machines {
return c.Machines.Filter(collections.Not(collections.HasUnhealthyCondition))
}

// HasUnhealthyMachine returns true if any machine in the control plane is marked as unhealthy by MHC.
func (c *ControlPlane) HasUnhealthyMachine() bool {
return len(c.UnhealthyMachines()) > 0
// HasUnhealthyMachineByMachineHealthCheck returns true if any machine in the control plane is marked as unhealthy by Machine Health Check.
func (c *ControlPlane) HasUnhealthyMachineByMachineHealthCheck() bool {
return len(c.UnhealthyMachinesByMachineHealthCheck()) > 0
}

// HasHealthyMachineStillProvisioning returns true if any healthy machine in the control plane is still in the process of being provisioned.
func (c *ControlPlane) HasHealthyMachineStillProvisioning() bool {
return len(c.HealthyMachines().Filter(collections.Not(collections.HasNode()))) > 0
return len(c.HealthyMachinesByMachineHealthCheck().Filter(collections.Not(collections.HasNode()))) > 0
}

// PatchMachines patches all the machines conditions.
Expand Down
4 changes: 2 additions & 2 deletions controlplane/kubeadm/internal/control_plane_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ func TestHasUnhealthyMachine(t *testing.T) {
}

g := NewWithT(t)
g.Expect(c.HasUnhealthyMachine()).To(BeTrue())
g.Expect(c.HasUnhealthyMachineByMachineHealthCheck()).To(BeTrue())
})

t.Run("No unhealthy machine to be remediated by KCP", func(t *testing.T) {
Expand All @@ -101,7 +101,7 @@ func TestHasUnhealthyMachine(t *testing.T) {
}

g := NewWithT(t)
g.Expect(c.HasUnhealthyMachine()).To(BeFalse())
g.Expect(c.HasUnhealthyMachineByMachineHealthCheck()).To(BeFalse())
})
}

Expand Down
6 changes: 3 additions & 3 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
// Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1)
// if the underlying machine is now back to healthy / not deleting.
errList := []error{}
healthyMachines := controlPlane.HealthyMachines()
healthyMachines := controlPlane.HealthyMachinesByMachineHealthCheck()
for _, m := range healthyMachines {
if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) &&
conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) &&
Expand All @@ -74,7 +74,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C

// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
unhealthyMachines := controlPlane.UnhealthyMachines()
unhealthyMachines := controlPlane.UnhealthyMachinesByMachineHealthCheck()

// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
if len(unhealthyMachines) == 0 {
Expand Down Expand Up @@ -192,7 +192,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C

// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
if controlPlane.IsEtcdManaged() {
etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
etcdLeaderCandidate := controlPlane.HealthyMachinesByMachineHealthCheck().Newest()
if etcdLeaderCandidate == nil {
log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
Expand Down
6 changes: 6 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1881,6 +1881,12 @@ func withUnhealthyEtcdMember() machineOption {
}
}

func withUnhealthyAPIServerPod() machineOption {
return func(machine *clusterv1.Machine) {
conditions.MarkFalse(machine, controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.ControlPlaneComponentsUnhealthyReason, clusterv1.ConditionSeverityError, "")
}
}

func withNodeRef(ref string) machineOption {
return func(machine *clusterv1.Machine) {
machine.Status.NodeRef = &corev1.ObjectReference{
Expand Down
2 changes: 2 additions & 0 deletions controlplane/kubeadm/internal/controllers/scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.Contr
machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines)
case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0:
machines = controlPlane.MachineWithDeleteAnnotation(machines)
case controlPlane.UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0:
machines = controlPlane.UnhealthyMachinesWithMissingNodeOrUnhealthyControlPlaneComponents(outdatedMachines)
case outdatedMachines.Len() > 0:
machines = outdatedMachines
}
Expand Down
46 changes: 37 additions & 9 deletions controlplane/kubeadm/internal/controllers/scale_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,17 +368,26 @@ func TestSelectMachineForScaleDown(t *testing.T) {
Spec: controlplanev1.KubeadmControlPlaneSpec{},
}
startDate := time.Date(2000, 1, 1, 1, 0, 0, 0, time.UTC)
m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)))
m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)))
m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)))
m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)))
m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)))
m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)))
m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"))
m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"))
m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)), machineOpt(withNodeRef("machine-1")))
m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)), machineOpt(withNodeRef("machine-2")))
m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)), machineOpt(withNodeRef("machine-3")))
m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)), machineOpt(withNodeRef("machine-4")))
m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), machineOpt(withNodeRef("machine-5")))
m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)), machineOpt(withNodeRef("machine-6")))
m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)),
withAnnotation("cluster.x-k8s.io/delete-machine"), machineOpt(withNodeRef("machine-7")))
m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)),
withAnnotation("cluster.x-k8s.io/delete-machine"), machineOpt(withNodeRef("machine-8")))
m9 := machine("machine-9", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)),
machineOpt(withNodeRef("machine-9")))
m10 := machine("machine-10", withFailureDomain("two"), withTimestamp(startDate.Add(-4*time.Hour)),
machineOpt(withNodeRef("machine-10")), machineOpt(withUnhealthyAPIServerPod()))
m11 := machine("machine-11", withFailureDomain("two"), withTimestamp(startDate.Add(-3*time.Hour)),
machineOpt(withNodeRef("machine-11")), machineOpt(withUnhealthyEtcdMember()))

mc3 := collections.FromMachines(m1, m2, m3, m4, m5)
mc6 := collections.FromMachines(m6, m7, m8)
mc9 := collections.FromMachines(m9, m10, m11)
fd := clusterv1.FailureDomains{
"one": failureDomain(true),
"two": failureDomain(true),
Expand All @@ -389,6 +398,11 @@ func TestSelectMachineForScaleDown(t *testing.T) {
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
Machines: mc3,
}
needsUpgradeControlPlane1 := &internal.ControlPlane{
KCP: &kcp,
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
Machines: mc9,
}
upToDateControlPlane := &internal.ControlPlane{
KCP: &kcp,
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
Expand Down Expand Up @@ -452,12 +466,26 @@ func TestSelectMachineForScaleDown(t *testing.T) {
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-7"}},
},
{
name: "when there is an up to date machine with delete annotation, while there are any outdated machines without annotatio that still exist, it returns oldest marked machine first",
name: "when there is an up to date machine with delete annotation, while there are any outdated machines without annotation that still exist, it returns oldest marked machine first",
cp: upToDateControlPlane,
outDatedMachines: collections.FromMachines(m5, m3, m8, m7, m6, m1, m2),
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-8"}},
},
{
name: "when there are machines needing upgrade, it returns the single unhealthy machine with MachineAPIServerPodHealthyCondition set to False",
cp: needsUpgradeControlPlane1,
outDatedMachines: collections.FromMachines(m9, m10),
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}},
},
{
name: "when there are machines needing upgrade, it returns the oldest unhealthy machine with MachineEtcdMemberHealthyCondition set to False",
cp: needsUpgradeControlPlane1,
outDatedMachines: collections.FromMachines(m9, m10, m11),
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}},
},
}

for _, tc := range testCases {
Expand Down
3 changes: 2 additions & 1 deletion docs/proposals/20191017-kubeadm-based-control-plane.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,8 +401,9 @@ spec:
- See [Preflight checks](#preflight-checks) below.
- Scale down operations removes the oldest machine in the failure domain that has the most control-plane machines on it.
- Allow scaling down of KCP with the possibility of marking specific control plane machine(s) to be deleted with delete annotation key. The presence of the annotation will affect the rollout strategy in a way that, it implements the following prioritization logic in descending order, while selecting machines for scale down:
- outdatedMachines with the delete annotation
- outdated machines with the delete annotation
- machines with the delete annotation
- outdated machines with unhealthy control plane component pods or missing nodes
- outdated machines
- all machines

Expand Down
35 changes: 35 additions & 0 deletions util/collections/machine_filters.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,41 @@ func HasUnhealthyCondition(machine *clusterv1.Machine) bool {
return conditions.IsFalse(machine, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition)
}

// HasMissingNodeOrUnhealthyControlPlaneComponents returns a filter to find all unhealthy control plane machines that
// does not have a Kubernetes node or have any of the following control plane component conditions set to False:
// APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy & EtcdMemberHealthy (if using managed etcd).
// It is different from the HasUnhealthyCondition func which checks MachineHealthCheck conditions.
func HasMissingNodeOrUnhealthyControlPlaneComponents(isEtcdManaged bool) Func {
controlPlaneMachineHealthConditions := []clusterv1.ConditionType{
controlplanev1.MachineAPIServerPodHealthyCondition,
controlplanev1.MachineControllerManagerPodHealthyCondition,
controlplanev1.MachineSchedulerPodHealthyCondition,
}
if isEtcdManaged {
controlPlaneMachineHealthConditions = append(controlPlaneMachineHealthConditions,
controlplanev1.MachineEtcdPodHealthyCondition,
controlplanev1.MachineEtcdMemberHealthyCondition,
)
}
return func(machine *clusterv1.Machine) bool {
if machine == nil {
return false
}
if !HasNode()(machine) {
sbueringer marked this conversation as resolved.
Show resolved Hide resolved
return true
}
for _, condition := range controlPlaneMachineHealthConditions {
// Do not return true when the condition is not set or is set to Unknown because
// it means a transient state and can not be considered as unhealthy.
// preflightCheckCondition() can cover these two cases and skip the scaling up/down.
if conditions.IsFalse(machine, condition) {
return true
}
}
return false
}
}

// IsReady returns a filter to find all machines with the ReadyCondition equals to True.
func IsReady() Func {
return func(machine *clusterv1.Machine) bool {
Expand Down
96 changes: 95 additions & 1 deletion util/collections/machine_filters_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ func TestWithVersion(t *testing.T) {
})
}

func TestHealtyAPIServer(t *testing.T) {
func TestHealthyAPIServer(t *testing.T) {
t.Run("nil machine returns false", func(t *testing.T) {
g := NewWithT(t)
g.Expect(collections.HealthyAPIServer()(nil)).To(BeFalse())
Expand Down Expand Up @@ -454,6 +454,100 @@ func TestHasNode(t *testing.T) {
})
}

func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) {
t.Run("nil machine returns false", func(t *testing.T) {
g := NewWithT(t)
g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(nil)).To(BeFalse())
})

t.Run("machine without node returns true", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue())
})

t.Run("machine with all healthy controlPlane component conditions returns false when the Etcd is not managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
}
g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse())
})

t.Run("machine with unhealthy 'APIServerPodHealthy' condition returns true when the Etcd is not managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.FalseCondition(controlplanev1.MachineAPIServerPodHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
}
g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue())
})

t.Run("machine with unhealthy etcd component conditions returns false when Etcd is not managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.FalseCondition(controlplanev1.MachineEtcdPodHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
}
g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse())
})

t.Run("machine with unhealthy etcd conditions returns true when Etcd is managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.FalseCondition(controlplanev1.MachineEtcdPodHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
}
g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(true)(machine)).To(BeTrue())
})

t.Run("machine with all healthy controlPlane and the Etcd component conditions returns false when Etcd is managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
}
g.Expect(collections.HasMissingNodeOrUnhealthyControlPlaneComponents(true)(machine)).To(BeFalse())
})
}

func testControlPlaneMachine(name string) *clusterv1.Machine {
owned := true
ownedRef := []metav1.OwnerReference{
Expand Down
Loading