Skip to content

Commit

Permalink
Merge pull request #10422 from smartxworks/huangwei/cherry-pick-10196…
Browse files Browse the repository at this point in the history
…-to-release-1.6

[release-1.6] 🐛 Delete out of date machines with unhealthy control plane component conditions when rolling out KCP
  • Loading branch information
k8s-ci-robot authored Apr 11, 2024
2 parents e20e649 + d0530a6 commit 5001599
Show file tree
Hide file tree
Showing 9 changed files with 197 additions and 22 deletions.
21 changes: 14 additions & 7 deletions controlplane/kubeadm/internal/control_plane.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,19 +238,26 @@ func (c *ControlPlane) IsEtcdManaged() bool {
return c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration == nil || c.KCP.Spec.KubeadmConfigSpec.ClusterConfiguration.Etcd.External == nil
}

// UnhealthyMachines returns the list of control plane machines marked as unhealthy by MHC.
func (c *ControlPlane) UnhealthyMachines() collections.Machines {
// UnhealthyMachinesWithUnhealthyControlPlaneComponents returns all unhealthy control plane machines that
// have unhealthy control plane components.
// It differs from UnhealthyMachinesByHealthCheck which checks `MachineHealthCheck` conditions.
func (c *ControlPlane) UnhealthyMachinesWithUnhealthyControlPlaneComponents(machines collections.Machines) collections.Machines {
return machines.Filter(collections.HasUnhealthyControlPlaneComponents(c.IsEtcdManaged()))
}

// UnhealthyMachinesByMachineHealthCheck returns the list of control plane machines marked as unhealthy by Machine Health Check.
func (c *ControlPlane) UnhealthyMachinesByMachineHealthCheck() collections.Machines {
return c.Machines.Filter(collections.HasUnhealthyCondition)
}

// HealthyMachines returns the list of control plane machines not marked as unhealthy by MHC.
func (c *ControlPlane) HealthyMachines() collections.Machines {
// HealthyMachinesByMachineHealthCheck returns the list of control plane machines not marked as unhealthy by Machine Health Check.
func (c *ControlPlane) HealthyMachinesByMachineHealthCheck() collections.Machines {
return c.Machines.Filter(collections.Not(collections.HasUnhealthyCondition))
}

// HasUnhealthyMachine returns true if any machine in the control plane is marked as unhealthy by MHC.
func (c *ControlPlane) HasUnhealthyMachine() bool {
return len(c.UnhealthyMachines()) > 0
// HasUnhealthyMachineByMachineHealthCheck returns true if any machine in the control plane is marked as unhealthy by Machine Health Check.
func (c *ControlPlane) HasUnhealthyMachineByMachineHealthCheck() bool {
return len(c.UnhealthyMachinesByMachineHealthCheck()) > 0
}

// PatchMachines patches all the machines conditions.
Expand Down
2 changes: 1 addition & 1 deletion controlplane/kubeadm/internal/control_plane_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ func TestHasUnhealthyMachine(t *testing.T) {
}

g := NewWithT(t)
g.Expect(c.HasUnhealthyMachine()).To(BeTrue())
g.Expect(c.HasUnhealthyMachineByMachineHealthCheck()).To(BeTrue())
}

type machineOpt func(*clusterv1.Machine)
Expand Down
6 changes: 3 additions & 3 deletions controlplane/kubeadm/internal/controllers/remediation.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
// Cleanup pending remediation actions not completed for any reasons (e.g. number of current replicas is less or equal to 1)
// if the underlying machine is now back to healthy / not deleting.
errList := []error{}
healthyMachines := controlPlane.HealthyMachines()
healthyMachines := controlPlane.HealthyMachinesByMachineHealthCheck()
for _, m := range healthyMachines {
if conditions.IsTrue(m, clusterv1.MachineHealthCheckSucceededCondition) &&
conditions.IsFalse(m, clusterv1.MachineOwnerRemediatedCondition) &&
Expand All @@ -73,7 +73,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C

// Gets all machines that have `MachineHealthCheckSucceeded=False` (indicating a problem was detected on the machine)
// and `MachineOwnerRemediated` present, indicating that this controller is responsible for performing remediation.
unhealthyMachines := controlPlane.UnhealthyMachines()
unhealthyMachines := controlPlane.UnhealthyMachinesByMachineHealthCheck()

// If there are no unhealthy machines, return so KCP can proceed with other operations (ctrl.Result nil).
if len(unhealthyMachines) == 0 {
Expand Down Expand Up @@ -183,7 +183,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C

// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
if controlPlane.IsEtcdManaged() {
etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
etcdLeaderCandidate := controlPlane.HealthyMachinesByMachineHealthCheck().Newest()
if etcdLeaderCandidate == nil {
log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
Expand Down
6 changes: 6 additions & 0 deletions controlplane/kubeadm/internal/controllers/remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1680,6 +1680,12 @@ func withUnhealthyEtcdMember() machineOption {
}
}

func withUnhealthyAPIServerPod() machineOption {
return func(machine *clusterv1.Machine) {
conditions.MarkFalse(machine, controlplanev1.MachineAPIServerPodHealthyCondition, controlplanev1.ControlPlaneComponentsUnhealthyReason, clusterv1.ConditionSeverityError, "")
}
}

func withNodeRef(ref string) machineOption {
return func(machine *clusterv1.Machine) {
machine.Status.NodeRef = &corev1.ObjectReference{
Expand Down
2 changes: 2 additions & 0 deletions controlplane/kubeadm/internal/controllers/scale.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,8 @@ func selectMachineForScaleDown(controlPlane *internal.ControlPlane, outdatedMach
machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines)
case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0:
machines = controlPlane.MachineWithDeleteAnnotation(machines)
case controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0:
machines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines)
case outdatedMachines.Len() > 0:
machines = outdatedMachines
}
Expand Down
46 changes: 37 additions & 9 deletions controlplane/kubeadm/internal/controllers/scale_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,17 +368,26 @@ func TestSelectMachineForScaleDown(t *testing.T) {
Spec: controlplanev1.KubeadmControlPlaneSpec{},
}
startDate := time.Date(2000, 1, 1, 1, 0, 0, 0, time.UTC)
m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)))
m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)))
m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)))
m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)))
m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)))
m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)))
m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"))
m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)), withAnnotation("cluster.x-k8s.io/delete-machine"))
m1 := machine("machine-1", withFailureDomain("one"), withTimestamp(startDate.Add(time.Hour)), machineOpt(withNodeRef("machine-1")))
m2 := machine("machine-2", withFailureDomain("one"), withTimestamp(startDate.Add(-3*time.Hour)), machineOpt(withNodeRef("machine-2")))
m3 := machine("machine-3", withFailureDomain("one"), withTimestamp(startDate.Add(-4*time.Hour)), machineOpt(withNodeRef("machine-3")))
m4 := machine("machine-4", withFailureDomain("two"), withTimestamp(startDate.Add(-time.Hour)), machineOpt(withNodeRef("machine-4")))
m5 := machine("machine-5", withFailureDomain("two"), withTimestamp(startDate.Add(-2*time.Hour)), machineOpt(withNodeRef("machine-5")))
m6 := machine("machine-6", withFailureDomain("two"), withTimestamp(startDate.Add(-7*time.Hour)), machineOpt(withNodeRef("machine-6")))
m7 := machine("machine-7", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)),
withAnnotation("cluster.x-k8s.io/delete-machine"), machineOpt(withNodeRef("machine-7")))
m8 := machine("machine-8", withFailureDomain("two"), withTimestamp(startDate.Add(-6*time.Hour)),
withAnnotation("cluster.x-k8s.io/delete-machine"), machineOpt(withNodeRef("machine-8")))
m9 := machine("machine-9", withFailureDomain("two"), withTimestamp(startDate.Add(-5*time.Hour)),
machineOpt(withNodeRef("machine-9")))
m10 := machine("machine-10", withFailureDomain("two"), withTimestamp(startDate.Add(-4*time.Hour)),
machineOpt(withNodeRef("machine-10")), machineOpt(withUnhealthyAPIServerPod()))
m11 := machine("machine-11", withFailureDomain("two"), withTimestamp(startDate.Add(-3*time.Hour)),
machineOpt(withNodeRef("machine-11")), machineOpt(withUnhealthyEtcdMember()))

mc3 := collections.FromMachines(m1, m2, m3, m4, m5)
mc6 := collections.FromMachines(m6, m7, m8)
mc9 := collections.FromMachines(m9, m10, m11)
fd := clusterv1.FailureDomains{
"one": failureDomain(true),
"two": failureDomain(true),
Expand All @@ -389,6 +398,11 @@ func TestSelectMachineForScaleDown(t *testing.T) {
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
Machines: mc3,
}
needsUpgradeControlPlane1 := &internal.ControlPlane{
KCP: &kcp,
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
Machines: mc9,
}
upToDateControlPlane := &internal.ControlPlane{
KCP: &kcp,
Cluster: &clusterv1.Cluster{Status: clusterv1.ClusterStatus{FailureDomains: fd}},
Expand Down Expand Up @@ -452,12 +466,26 @@ func TestSelectMachineForScaleDown(t *testing.T) {
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-7"}},
},
{
name: "when there is an up to date machine with delete annotation, while there are any outdated machines without annotatio that still exist, it returns oldest marked machine first",
name: "when there is an up to date machine with delete annotation, while there are any outdated machines without annotation that still exist, it returns oldest marked machine first",
cp: upToDateControlPlane,
outDatedMachines: collections.FromMachines(m5, m3, m8, m7, m6, m1, m2),
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-8"}},
},
{
name: "when there are machines needing upgrade, it returns the single unhealthy machine with MachineAPIServerPodHealthyCondition set to False",
cp: needsUpgradeControlPlane1,
outDatedMachines: collections.FromMachines(m9, m10),
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}},
},
{
name: "when there are machines needing upgrade, it returns the oldest unhealthy machine with MachineEtcdMemberHealthyCondition set to False",
cp: needsUpgradeControlPlane1,
outDatedMachines: collections.FromMachines(m9, m10, m11),
expectErr: false,
expectedMachine: clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "machine-10"}},
},
}

for _, tc := range testCases {
Expand Down
3 changes: 2 additions & 1 deletion docs/proposals/20191017-kubeadm-based-control-plane.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,8 +401,9 @@ spec:
- See [Preflight checks](#preflight-checks) below.
- Scale down operations removes the oldest machine in the failure domain that has the most control-plane machines on it.
- Allow scaling down of KCP with the possibility of marking specific control plane machine(s) to be deleted with delete annotation key. The presence of the annotation will affect the rollout strategy in a way that, it implements the following prioritization logic in descending order, while selecting machines for scale down:
- outdatedMachines with the delete annotation
- outdated machines with the delete annotation
- machines with the delete annotation
- outdated machines with unhealthy control plane component pods
- outdated machines
- all machines

Expand Down
36 changes: 36 additions & 0 deletions util/collections/machine_filters.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,42 @@ func HasUnhealthyCondition(machine *clusterv1.Machine) bool {
return conditions.IsFalse(machine, clusterv1.MachineHealthCheckSucceededCondition) && conditions.IsFalse(machine, clusterv1.MachineOwnerRemediatedCondition)
}

// HasUnhealthyControlPlaneComponents returns a filter to find all unhealthy control plane machines that
// have any of the following control plane component conditions set to False:
// APIServerPodHealthy, ControllerManagerPodHealthy, SchedulerPodHealthy, EtcdPodHealthy & EtcdMemberHealthy (if using managed etcd).
// It is different from the HasUnhealthyCondition func which checks MachineHealthCheck conditions.
func HasUnhealthyControlPlaneComponents(isEtcdManaged bool) Func {
controlPlaneMachineHealthConditions := []clusterv1.ConditionType{
controlplanev1.MachineAPIServerPodHealthyCondition,
controlplanev1.MachineControllerManagerPodHealthyCondition,
controlplanev1.MachineSchedulerPodHealthyCondition,
}
if isEtcdManaged {
controlPlaneMachineHealthConditions = append(controlPlaneMachineHealthConditions,
controlplanev1.MachineEtcdPodHealthyCondition,
controlplanev1.MachineEtcdMemberHealthyCondition,
)
}
return func(machine *clusterv1.Machine) bool {
if machine == nil {
return false
}

// The machine without a node could be in failure status due to the kubelet config error, or still provisioning components (including etcd).
// So do not treat it as unhealthy.

for _, condition := range controlPlaneMachineHealthConditions {
// Do not return true when the condition is not set or is set to Unknown because
// it means a transient state and can not be considered as unhealthy.
// preflightCheckCondition() can cover these two cases and skip the scaling up/down.
if conditions.IsFalse(machine, condition) {
return true
}
}
return false
}
}

// IsReady returns a filter to find all machines with the ReadyCondition equals to True.
func IsReady() Func {
return func(machine *clusterv1.Machine) bool {
Expand Down
97 changes: 96 additions & 1 deletion util/collections/machine_filters_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"time"

. "github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/pointer"
"sigs.k8s.io/controller-runtime/pkg/client/fake"
Expand Down Expand Up @@ -377,7 +378,7 @@ func TestWithVersion(t *testing.T) {
})
}

func TestHealtyAPIServer(t *testing.T) {
func TestHealthyAPIServer(t *testing.T) {
t.Run("nil machine returns false", func(t *testing.T) {
g := NewWithT(t)
g.Expect(collections.HealthyAPIServer()(nil)).To(BeFalse())
Expand Down Expand Up @@ -432,6 +433,100 @@ func TestGetFilteredMachinesForCluster(t *testing.T) {
g.Expect(machines).To(HaveLen(1))
}

func TestHasUnhealthyControlPlaneComponentCondition(t *testing.T) {
t.Run("nil machine returns false", func(t *testing.T) {
g := NewWithT(t)
g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(nil)).To(BeFalse())
})

t.Run("machine without node returns false", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse())
})

t.Run("machine with all healthy controlPlane component conditions returns false when the Etcd is not managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
}
g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse())
})

t.Run("machine with unhealthy 'APIServerPodHealthy' condition returns true when the Etcd is not managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.FalseCondition(controlplanev1.MachineAPIServerPodHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
}
g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeTrue())
})

t.Run("machine with unhealthy etcd component conditions returns false when Etcd is not managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.FalseCondition(controlplanev1.MachineEtcdPodHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
}
g.Expect(collections.HasUnhealthyControlPlaneComponents(false)(machine)).To(BeFalse())
})

t.Run("machine with unhealthy etcd conditions returns true when Etcd is managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.FalseCondition(controlplanev1.MachineEtcdPodHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
*conditions.FalseCondition(controlplanev1.MachineEtcdMemberHealthyCondition, "",
clusterv1.ConditionSeverityWarning, ""),
}
g.Expect(collections.HasUnhealthyControlPlaneComponents(true)(machine)).To(BeTrue())
})

t.Run("machine with all healthy controlPlane and the Etcd component conditions returns false when Etcd is managed", func(t *testing.T) {
g := NewWithT(t)
machine := &clusterv1.Machine{}
machine.Status.NodeRef = &corev1.ObjectReference{
Name: "node1",
}
machine.Status.Conditions = clusterv1.Conditions{
*conditions.TrueCondition(controlplanev1.MachineAPIServerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineControllerManagerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineSchedulerPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineEtcdPodHealthyCondition),
*conditions.TrueCondition(controlplanev1.MachineEtcdMemberHealthyCondition),
}
g.Expect(collections.HasUnhealthyControlPlaneComponents(true)(machine)).To(BeFalse())
})
}

func testControlPlaneMachine(name string) *clusterv1.Machine {
owned := true
ownedRef := []metav1.OwnerReference{
Expand Down

0 comments on commit 5001599

Please sign in to comment.