From a1ebe620fe7757c46b4dfcbf76532390650bb7c9 Mon Sep 17 00:00:00 2001 From: fabriziopandini Date: Tue, 15 Oct 2024 11:07:08 +0200 Subject: [PATCH] Update MHC with v1Beta2 status --- api/v1beta1/machine_types.go | 54 +- api/v1beta1/machinehealthcheck_types.go | 21 + api/v1beta1/v1beta2_condition_consts.go | 10 - .../machinehealthcheck_controller.go | 41 + .../machinehealthcheck_controller_test.go | 763 +++++++++--------- .../machinehealthcheck_targets.go | 35 + .../machinehealthcheck_targets_test.go | 122 ++- util/conditions/v1beta2/getter.go | 32 + util/conditions/v1beta2/setter.go | 16 + 9 files changed, 673 insertions(+), 421 deletions(-) diff --git a/api/v1beta1/machine_types.go b/api/v1beta1/machine_types.go index 858be9dec0c2..34d15695f163 100644 --- a/api/v1beta1/machine_types.go +++ b/api/v1beta1/machine_types.go @@ -200,22 +200,70 @@ const ( // Note: this could happen when creating the machine. However, this state should be treated as an error if it lasts indefinitely. MachineNodeDoesNotExistV1Beta2Reason = ObjectDoesNotExistV1Beta2Reason - // MachineNodeDeletedV1Beta2Reason surfaces when the node hosted on the machine has been deleted. + // MachineNodeDeletedV1Beta2Reason surfaces when the node hosted on the machine has been deleted. // Note: controllers can't identify if the Node was deleted by the controller itself, e.g. // during the deletion workflow, or by a users. MachineNodeDeletedV1Beta2Reason = ObjectDeletedV1Beta2Reason ) -// Machine's HealthCheckSucceeded and OwnerRemediated conditions and corresponding reasons that will be used in v1Beta2 API version. -// Note: HealthCheckSucceeded and OwnerRemediated condition are set by the MachineHealthCheck controller. +// Machine's HealthCheckSucceeded condition and corresponding reasons that will be used in v1Beta2 API version. +// Note: HealthCheckSucceeded condition ia set by the MachineHealthCheck controller. const ( // MachineHealthCheckSucceededV1Beta2Condition is true if MHC instances targeting this machine report the Machine // is healthy according to the definition of healthy present in the spec of the MachineHealthCheck object. MachineHealthCheckSucceededV1Beta2Condition = "HealthCheckSucceeded" + // MachineHealthCheckPassedV1Beta2Reason surfaces when a machine passes all the health checks defined by a MachineHealthCheck object. + MachineHealthCheckPassedV1Beta2Reason = "HealthCheckPassed" + + // MachineHealthCheckUnhealthyNodeV1Beta2Reason surfaces when the node hosted on the machine do not pass the health checks + // defined by a MachineHealthCheck object. + MachineHealthCheckUnhealthyNodeV1Beta2Reason = "UnhealthyNode" + + // MachineHealthCheckNodeStartupTimeoutV1Beta2Reason surfaces when the node hosted on the machine does not appear within + // he timeout defined by a MachineHealthCheck object. + MachineHealthCheckNodeStartupTimeoutV1Beta2Reason = "NodeStartupTimeout" + + // MachineHealthCheckNodeDeletedV1Beta2Reason surfaces when a MachineHealthCheck detect that the node hosted on the + // machine has been deleted while the Machine is still running. + MachineHealthCheckNodeDeletedV1Beta2Reason = "NodeDeleted" + + // MachineHealthCheckManuallyRemediatedV1Beta2Reason surfaces a MachineHealthCheck detects a machine manually remediated + // via the remediate-machine annotation. + MachineHealthCheckManuallyRemediatedV1Beta2Reason = "ManuallyRemediated" +) + +// Machine's OwnerRemediated conditions and corresponding reasons that will be used in v1Beta2 API version. +// Note: OwnerRemediated condition is created by the MachineHealthCheck controller, and then Set by the Machine's owner controller. +const ( // MachineOwnerRemediatedV1Beta2Condition is only present if MHC instances targeting this machine // determine that the controller owning this machine should perform remediation. MachineOwnerRemediatedV1Beta2Condition = "OwnerRemediated" + + // MachineOwnerRemediatedWaitingForOwnerV1Beta2Reason surfaces the machine is waiting for the owner controller + // to start remediation. + MachineOwnerRemediatedWaitingForOwnerV1Beta2Reason = "WaitingForOwner" +) + +// Machine's ExternallyRemediated conditions and corresponding reasons that will be used in v1Beta2 API version. +// Note: ExternallyRemediated condition is created by the MachineHealthCheck controller, and then Set by the external +// remediation controller. +const ( + // MachineExternallyRemediatedV1Beta2Condition is only present if MHC instances targeting this machine + // determine that an external controller should perform remediation. + MachineExternallyRemediatedV1Beta2Condition = "ExternallyRemediated" + + // MachineExternallyRemediatedWaitingForExternalRemediationV1Beta2Reason surfaces the machine is waiting for the + // external remediation controller to start remediation. + MachineExternallyRemediatedWaitingForExternalRemediationV1Beta2Reason = "WaitingForExternalRemediation" + + // MachineOwnerRemediateExternalRemediationTemplateNotFoundV1Beta2Reason surfaces that the MachineHealthCheck cannot + // find the template for a external remediation request. + MachineOwnerRemediateExternalRemediationTemplateNotFoundV1Beta2Reason = "ExternalRemediationTemplateNotFound" + + // MachineOwnerRemediateExternalRemediationRequestCreationFailedV1Beta2Reason surfaces that the MachineHealthCheck cannot + // create a request for the external remediation controller. + MachineOwnerRemediateExternalRemediationRequestCreationFailedV1Beta2Reason = "ExternalRemediationRequestCreationFailed" ) // Machine's Deleting condition and corresponding reasons that will be used in v1Beta2 API version. diff --git a/api/v1beta1/machinehealthcheck_types.go b/api/v1beta1/machinehealthcheck_types.go index e43d27854273..27614b351de0 100644 --- a/api/v1beta1/machinehealthcheck_types.go +++ b/api/v1beta1/machinehealthcheck_types.go @@ -24,6 +24,27 @@ import ( "k8s.io/apimachinery/pkg/util/intstr" ) +// MachineHealthCheck's RemediationAllowed condition and corresponding reasons that will be used in v1Beta2 API version. +const ( + // MachineHealthCheckRemediationAllowedV1Beta2Condition surfaces whether the MachineHealthCheck is + // allowed to remediate any Machines or whether it is blocked from remediating any further. + MachineHealthCheckRemediationAllowedV1Beta2Condition = "RemediationAllowed" + + // MachineHealthCheckTooManyUnhealthyV1Beta2Reason is the reason used when too many Machines are unhealthy and + // the MachineHealthCheck is blocked from making any further remediation. + MachineHealthCheckTooManyUnhealthyV1Beta2Reason = "TooManyUnhealthy" + + // MachineHealthCheckRemediationAllowedV1Beta2Reason is the reason used when the number of unhealthy machine + // is within the limits defined by the MachineHealthCheck, and thus remediation is allowed. + MachineHealthCheckRemediationAllowedV1Beta2Reason = "RemediationAllowed" +) + +// MachineHealthCheck's Paused condition and corresponding reasons that will be used in v1Beta2 API version. +const ( + // MachineHealthCheckPausedV1Beta2Condition is true if this MachineHealthCheck or the Cluster it belongs to are paused. + MachineHealthCheckPausedV1Beta2Condition = PausedV1Beta2Condition +) + var ( // DefaultNodeStartupTimeout is the time allowed for a node to start up. // Can be made longer as part of spec if required for particular provider. diff --git a/api/v1beta1/v1beta2_condition_consts.go b/api/v1beta1/v1beta2_condition_consts.go index d84a1cc2a0aa..c6f72d41a482 100644 --- a/api/v1beta1/v1beta2_condition_consts.go +++ b/api/v1beta1/v1beta2_condition_consts.go @@ -225,16 +225,6 @@ const ( ClusterPausedV1Beta2Condition = PausedV1Beta2Condition ) -// Conditions that will be used for the MachineHealthCheck object in v1Beta2 API version. -const ( - // MachineHealthCheckRemediationAllowedV1Beta2Condition surfaces whether the MachineHealthCheck is - // allowed to remediate any Machines or whether it is blocked from remediating any further. - MachineHealthCheckRemediationAllowedV1Beta2Condition = "RemediationAllowed" - - // MachineHealthCheckPausedV1Beta2Condition is true if this MachineHealthCheck or the Cluster it belongs to are paused. - MachineHealthCheckPausedV1Beta2Condition = PausedV1Beta2Condition -) - // Conditions that will be used for the ClusterClass object in v1Beta2 API version. const ( // ClusterClassVariablesReadyV1Beta2Condition is true if the ClusterClass variables, including both inline and external diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go index 568e494dc174..5e5b2113617d 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller.go @@ -50,6 +50,7 @@ import ( "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" + v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" "sigs.k8s.io/cluster-api/util/patch" "sigs.k8s.io/cluster-api/util/predicates" ) @@ -279,6 +280,13 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster Message: message, }) + v1beta2conditions.Set(m, metav1.Condition{ + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckTooManyUnhealthyV1Beta2Reason, + Message: message, + }) + // If there are no unhealthy target, skip publishing the `RemediationRestricted` event to avoid misleading. if len(unhealthy) != 0 { r.recorder.Event( @@ -321,6 +329,12 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster m.Status.RemediationsAllowed = remediationCount conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition) + v1beta2conditions.Set(m, metav1.Condition{ + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }) + errList := r.patchUnhealthyTargets(ctx, logger, unhealthy, cluster, m) errList = append(errList, r.patchHealthyTargets(ctx, logger, healthy, m)...) @@ -399,6 +413,13 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace) if err != nil { conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailableCondition, clusterv1.ExternalRemediationTemplateNotFoundReason, clusterv1.ConditionSeverityError, err.Error()) + + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediateExternalRemediationTemplateNotFoundV1Beta2Reason, + Message: fmt.Sprintf("error retrieving remediation template %s %s", m.Spec.RemediationTemplate.Kind, klog.KRef(t.Machine.Namespace, m.Spec.RemediationTemplate.Name)), + }) errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName)) return errList } @@ -428,9 +449,21 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg // Create the external clone. if err := r.Client.Create(ctx, to); err != nil { conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailableCondition, clusterv1.ExternalRemediationRequestCreationFailedReason, clusterv1.ConditionSeverityError, err.Error()) + + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediateExternalRemediationRequestCreationFailedV1Beta2Reason, + }) errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName)) return errList } + + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineExternallyRemediatedWaitingForExternalRemediationV1Beta2Reason, + }) } else { logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message) // NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if missing or to trigger another remediation if the previous one is completed; @@ -438,6 +471,14 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) { conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "") } + + if ownerRemediatedCondition := v1beta2conditions.Get(t.Machine, clusterv1.MachineOwnerRemediatedV1Beta2Condition); ownerRemediatedCondition == nil || ownerRemediatedCondition.Status == metav1.ConditionTrue { + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineOwnerRemediatedWaitingForOwnerV1Beta2Reason, + }) + } } } diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go index 3a1dbeeb152f..21a6a9bfde16 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_controller_test.go @@ -49,6 +49,7 @@ import ( "sigs.k8s.io/cluster-api/internal/webhooks" "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/conditions" + v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" "sigs.k8s.io/cluster-api/util/patch" ) @@ -240,6 +241,15 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) }) @@ -293,7 +303,18 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + + assertMachinesNotHealthy(g, mhc, 0) }) t.Run("it doesn't mark anything unhealthy when all Machines are healthy", func(t *testing.T) { @@ -341,7 +362,18 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + + assertMachinesNotHealthy(g, mhc, 0) }) t.Run("it marks unhealthy machines for remediation when there is one unhealthy Machine", func(t *testing.T) { @@ -398,7 +430,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 1) }) t.Run("it marks unhealthy machines for remediation when there a Machine has a failure reason", func(t *testing.T) { @@ -456,7 +500,57 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + + g.Eventually(func() (unhealthy int) { + machines := &clusterv1.MachineList{} + err := env.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + }) + if err != nil { + return -1 + } + + for i := range machines.Items { + if !conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { + continue + } + + unhealthy++ + } + return + }).Should(Equal(1)) + + g.Eventually(func() (ownerRemediated int) { + machines := &clusterv1.MachineList{} + err := env.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + }) + if err != nil { + return -1 + } + + for i := range machines.Items { + if !conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { + continue + } + if !conditions.Has(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { + continue + } + + ownerRemediated++ + } + return + }).Should(Equal(1)) }) t.Run("it marks unhealthy machines for remediation when there a Machine has a failure message", func(t *testing.T) { @@ -514,7 +608,57 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + + g.Eventually(func() (unhealthy int) { + machines := &clusterv1.MachineList{} + err := env.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + }) + if err != nil { + return -1 + } + + for i := range machines.Items { + if !conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { + continue + } + + unhealthy++ + } + return + }).Should(Equal(1)) + + g.Eventually(func() (ownerRemediated int) { + machines := &clusterv1.MachineList{} + err := env.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + }) + if err != nil { + return -1 + } + + for i := range machines.Items { + if !conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { + continue + } + if !conditions.Has(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { + continue + } + + ownerRemediated++ + } + return + }).Should(Equal(1)) }) t.Run("it marks unhealthy machines for remediation when the unhealthy Machines exceed MaxUnhealthy", func(t *testing.T) { @@ -576,43 +720,20 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Message: "Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: 3, unhealthy: 2, maxUnhealthy: 40%)", }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckTooManyUnhealthyV1Beta2Reason, + Message: "Remediation is not allowed, the number of not started or unhealthy machines exceeds maxUnhealthy (total: 3, unhealthy: 2, maxUnhealthy: 40%)", + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(2)) - - // Calculate how many Machines have been remediated. - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { - remediated++ - } - } - return - }).Should(Equal(0)) + assertMachinesNotHealthy(g, mhc, 2) + assertMachinesOwnerRemediated(g, mhc, 0) }) t.Run("it marks unhealthy machines for remediation when number of unhealthy machines is within unhealthyRange", func(t *testing.T) { @@ -671,7 +792,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 1) }) t.Run("it marks unhealthy machines for remediation when the unhealthy Machines is not within UnhealthyRange", func(t *testing.T) { @@ -733,43 +866,20 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Message: "Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: 3, unhealthy: 2, unhealthyRange: [3-5])", }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckTooManyUnhealthyV1Beta2Reason, + Message: "Remediation is not allowed, the number of not started or unhealthy machines does not fall within the range (total: 3, unhealthy: 2, unhealthyRange: [3-5])", + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(2)) - - // Calculate how many Machines have been remediated. - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { - remediated++ - } - } - return - }).Should(Equal(0)) + assertMachinesNotHealthy(g, mhc, 2) + assertMachinesOwnerRemediated(g, mhc, 0) }) t.Run("when a Machine has no Node ref for less than the NodeStartupTimeout", func(t *testing.T) { @@ -835,43 +945,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(0)) - - // Calculate how many Machines have been remediated. - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { - remediated++ - } - } - return - }).Should(Equal(0)) + assertMachinesNotHealthy(g, mhc, 0) + assertMachinesOwnerRemediated(g, mhc, 0) }) t.Run("when a Machine has no Node ref for longer than the NodeStartupTimeout", func(t *testing.T) { @@ -932,44 +1018,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - fmt.Printf("error retrieving list: %v", err) - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }, timeout, 100*time.Millisecond).Should(Equal(1)) - - // Calculate how many Machines have been remediated. - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { - remediated++ - } - } - return - }, timeout, 100*time.Millisecond).Should(Equal(1)) + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 1) }) t.Run("when a Machine's Node has gone away", func(t *testing.T) { @@ -1027,43 +1088,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(1)) - - // Calculate how many Machines have been remediated. - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { - remediated++ - } - } - return - }, timeout, 100*time.Millisecond).Should(Equal(1)) + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 1) }) t.Run("Machine's Node without conditions", func(t *testing.T) { @@ -1113,43 +1150,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(0)) - - // Calculate how many Machines have been remediated. - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsTrue(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { - remediated++ - } - } - return - }).Should(Equal(0)) + assertMachinesNotHealthy(g, mhc, 0) + assertMachinesOwnerRemediated(g, mhc, 0) }) t.Run("should react when a Node transitions to unhealthy", func(t *testing.T) { @@ -1197,8 +1210,20 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + assertMachinesNotHealthy(g, mhc, 0) + assertMachinesOwnerRemediated(g, mhc, 0) + // Transition the node to unhealthy. node := nodes[0] nodePatch := client.MergeFrom(node.DeepCopy()) @@ -1229,43 +1254,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(1)) - - // Calculate how many Machines have been marked for remediation - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { - remediated++ - } - } - return - }).Should(Equal(1)) + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 1) }) t.Run("when in a MachineSet, unhealthy machines should be deleted", func(t *testing.T) { @@ -1367,43 +1368,20 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { } g.Expect(env.Patch(ctx, machineSet, machineSetPatch)).To(Succeed()) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }, timeout, 100*time.Millisecond).Should(Equal(1)) + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 1) - // Calculate how many Machines should be remediated. var unhealthyMachine *clusterv1.Machine - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } + machines := &clusterv1.MachineList{} + g.Expect(env.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + })).To(Succeed()) - for i := range machines.Items { - if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { - unhealthyMachine = machines.Items[i].DeepCopy() - remediated++ - } + for i := range machines.Items { + if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { + unhealthyMachine = machines.Items[i].DeepCopy() } - return - }, timeout, 100*time.Millisecond).Should(Equal(1)) + } // Unpause the MachineSet reconciler. machineSetPatch = client.MergeFrom(machineSet.DeepCopy()) @@ -1464,6 +1442,15 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) // Pause the machine @@ -1504,43 +1491,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(1)) - - // Calculate how many Machines have been remediated. - g.Eventually(func() (remediated int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.Get(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) != nil { - remediated++ - } - } - return - }).Should(Equal(0)) + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 0) }) t.Run("When remediationTemplate is set and node transitions to unhealthy, new Remediation Request should be created", func(t *testing.T) { @@ -1616,8 +1579,20 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + assertMachinesNotHealthy(g, mhc, 0) + assertMachinesOwnerRemediated(g, mhc, 0) + // Transition the node to unhealthy. node := nodes[0] nodePatch := client.MergeFrom(node.DeepCopy()) @@ -1649,25 +1624,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(1)) + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 0) ref := corev1.ObjectReference{ APIVersion: builder.RemediationGroupVersion.String(), @@ -1764,8 +1733,20 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) + assertMachinesNotHealthy(g, mhc, 0) + assertMachinesOwnerRemediated(g, mhc, 0) + // Transition the node to unhealthy. node := nodes[0] nodePatch := client.MergeFrom(node.DeepCopy()) @@ -1797,25 +1778,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(1)) + assertMachinesNotHealthy(g, mhc, 1) + assertMachinesOwnerRemediated(g, mhc, 0) // Transition the node back to healthy. node = nodes[0] @@ -1848,25 +1823,19 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { Status: corev1.ConditionTrue, }, }, + V1Beta2: &clusterv1.MachineHealthCheckV1Beta2Status{ + Conditions: []metav1.Condition{ + { + Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason, + }, + }, + }, })) - // Calculate how many Machines have health check succeeded = false. - g.Eventually(func() (unhealthy int) { - machines := &clusterv1.MachineList{} - err := env.List(ctx, machines, client.MatchingLabels{ - "selector": mhc.Spec.Selector.MatchLabels["selector"], - }) - if err != nil { - return -1 - } - - for i := range machines.Items { - if conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { - unhealthy++ - } - } - return - }).Should(Equal(0)) + assertMachinesNotHealthy(g, mhc, 0) + assertMachinesOwnerRemediated(g, mhc, 0) ref := corev1.ObjectReference{ APIVersion: builder.RemediationGroupVersion.String(), @@ -1889,6 +1858,62 @@ func TestMachineHealthCheck_Reconcile(t *testing.T) { }) } +func assertMachinesNotHealthy(g *WithT, mhc *clusterv1.MachineHealthCheck, expectNotHealthy int) { + g.Eventually(func() (unhealthy int) { + machines := &clusterv1.MachineList{} + err := env.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + }) + if err != nil { + return -1 + } + + for i := range machines.Items { + if !conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { + continue + } + if c := v1beta2conditions.Get(&machines.Items[i], clusterv1.MachineHealthCheckSucceededV1Beta2Condition); c == nil || c.Status != metav1.ConditionFalse { + continue + } + + unhealthy++ + } + return + }).Should(Equal(expectNotHealthy)) +} + +func assertMachinesOwnerRemediated(g *WithT, mhc *clusterv1.MachineHealthCheck, expectOwnerRemediated int) { + // Calculate how many Machines have health check succeeded = false. + g.Eventually(func() (ownerRemediated int) { + machines := &clusterv1.MachineList{} + err := env.List(ctx, machines, client.MatchingLabels{ + "selector": mhc.Spec.Selector.MatchLabels["selector"], + }) + if err != nil { + return -1 + } + + for i := range machines.Items { + if !conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededCondition) { + continue + } + if !conditions.Has(&machines.Items[i], clusterv1.MachineOwnerRemediatedCondition) { + continue + } + + if !v1beta2conditions.IsFalse(&machines.Items[i], clusterv1.MachineHealthCheckSucceededV1Beta2Condition) { + continue + } + if !v1beta2conditions.Has(&machines.Items[i], clusterv1.MachineOwnerRemediatedV1Beta2Condition) { + continue + } + + ownerRemediated++ + } + return + }).Should(Equal(expectOwnerRemediated)) +} + func TestClusterToMachineHealthCheck(t *testing.T) { fakeClient := fake.NewClientBuilder().Build() @@ -2659,7 +2684,14 @@ func TestPatchTargets(t *testing.T) { mhc := newMachineHealthCheckWithLabels("mhc", namespace, clusterName, labels) machine1 := newTestMachine("machine1", namespace, clusterName, "nodeName", labels) machine1.ResourceVersion = "999" + conditions.MarkTrue(machine1, clusterv1.MachineHealthCheckSucceededCondition) + v1beta2conditions.Set(machine1, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckPassedV1Beta2Reason, + }) + machine2 := machine1.DeepCopy() machine2.Name = "machine2" @@ -2701,6 +2733,7 @@ func TestPatchTargets(t *testing.T) { g.Expect(r.patchUnhealthyTargets(context.TODO(), logr.New(log.NullLogSink{}), []healthCheckTarget{target1, target3}, defaultCluster, mhc)).ToNot(BeEmpty()) g.Expect(cl.Get(ctx, client.ObjectKey{Name: machine2.Name, Namespace: machine2.Namespace}, machine2)).ToNot(HaveOccurred()) g.Expect(conditions.Get(machine2, clusterv1.MachineOwnerRemediatedCondition).Status).To(Equal(corev1.ConditionFalse)) + g.Expect(v1beta2conditions.Get(machine2, clusterv1.MachineOwnerRemediatedV1Beta2Condition).Status).To(Equal(metav1.ConditionFalse)) // Target with wrong patch helper will fail but the other one will be patched. g.Expect(r.patchHealthyTargets(context.TODO(), logr.New(log.NullLogSink{}), []healthCheckTarget{target1, target3}, mhc)).ToNot(BeEmpty()) diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go index 5544291360dd..58a6ce4a14c0 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets.go @@ -34,6 +34,7 @@ import ( "sigs.k8s.io/cluster-api/util" "sigs.k8s.io/cluster-api/util/annotations" "sigs.k8s.io/cluster-api/util/conditions" + v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2" "sigs.k8s.io/cluster-api/util/patch" ) @@ -97,6 +98,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi if annotations.HasRemediateMachine(t.Machine) { conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.HasRemediateMachineAnnotationReason, clusterv1.ConditionSeverityWarning, "Marked for remediation via remediate-machine annotation") logger.V(3).Info("Target is marked for remediation via remediate-machine annotation") + + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckManuallyRemediatedV1Beta2Reason, + Message: "Marked for remediation via remediate-machine annotation", + }) return true, time.Duration(0) } @@ -116,6 +124,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi if t.nodeMissing { logger.V(3).Info("Target is unhealthy: node is missing") conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeNotFoundReason, clusterv1.ConditionSeverityWarning, "") + + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckNodeDeletedV1Beta2Reason, + Message: fmt.Sprintf("Node %s has been deleted", t.Machine.Status.NodeRef.Name), + }) return true, time.Duration(0) } @@ -170,6 +185,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi if comparisonTime.Add(timeoutForMachineToHaveNode.Duration).Before(now) { conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.NodeStartupTimeoutReason, clusterv1.ConditionSeverityWarning, "Node failed to report startup in %s", timeoutDuration) logger.V(3).Info("Target is unhealthy: machine has no node", "duration", timeoutDuration) + + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckNodeStartupTimeoutV1Beta2Reason, + Message: fmt.Sprintf("Node failed to report startup in %s", timeoutDuration), + }) return true, time.Duration(0) } @@ -194,6 +216,13 @@ func (t *healthCheckTarget) needsRemediation(logger logr.Logger, timeoutForMachi if nodeCondition.LastTransitionTime.Add(c.Timeout.Duration).Before(now) { conditions.MarkFalse(t.Machine, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.UnhealthyNodeConditionReason, clusterv1.ConditionSeverityWarning, "Condition %s on node is reporting status %s for more than %s", c.Type, c.Status, c.Timeout.Duration.String()) logger.V(3).Info("Target is unhealthy: condition is in state longer than allowed timeout", "condition", c.Type, "state", c.Status, "timeout", c.Timeout.Duration.String()) + + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineHealthCheckUnhealthyNodeV1Beta2Reason, + Message: fmt.Sprintf("Condition %s on node is reporting status %s for more than %s", c.Type, c.Status, c.Timeout.Duration.String()), + }) return true, time.Duration(0) } @@ -327,6 +356,12 @@ func (r *Reconciler) healthCheckTargets(targets []healthCheckTarget, logger logr if t.Machine.DeletionTimestamp.IsZero() && t.Node != nil { conditions.MarkTrue(t.Machine, clusterv1.MachineHealthCheckSucceededCondition) + + v1beta2conditions.Set(t.Machine, metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionTrue, + Reason: clusterv1.MachineHealthCheckPassedV1Beta2Reason, + }) healthy = append(healthy, t) } } diff --git a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go index a44b338d2e33..4db4a8cc092f 100644 --- a/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go +++ b/internal/controllers/machinehealthcheck/machinehealthcheck_targets_test.go @@ -17,6 +17,7 @@ limitations under the License. package machinehealthcheck import ( + "fmt" "testing" "time" @@ -269,6 +270,7 @@ func TestHealthCheckTargets(t *testing.T) { Node: nil, } nodeNotYetStartedTarget1200sCondition := newFailedHealthCheckCondition(clusterv1.NodeStartupTimeoutReason, "Node failed to report startup in %s", timeoutForMachineToHaveNode) + nodeNotYetStartedTarget1200sV1Beta2Condition := newFailedHealthCheckV1Beta2Condition(clusterv1.MachineHealthCheckNodeStartupTimeoutV1Beta2Reason, "Node failed to report startup in %s", timeoutForMachineToHaveNode) testMachineCreated400s := testMachine.DeepCopy() nowMinus400s := metav1.NewTime(time.Now().Add(-400 * time.Second)) @@ -290,6 +292,7 @@ func TestHealthCheckTargets(t *testing.T) { nodeMissing: true, } nodeGoneAwayCondition := newFailedHealthCheckCondition(clusterv1.NodeNotFoundReason, "") + nodeGoneAwayV1Beta2Condition := newFailedHealthCheckV1Beta2Condition(clusterv1.MachineHealthCheckNodeDeletedV1Beta2Reason, "Node %s has been deleted", testMachine.Status.NodeRef.Name) // Create a test MHC without conditions testMHCEmptyConditions := &clusterv1.MachineHealthCheck{ @@ -369,6 +372,7 @@ func TestHealthCheckTargets(t *testing.T) { nodeMissing: false, } nodeUnknown400Condition := newFailedHealthCheckCondition(clusterv1.UnhealthyNodeConditionReason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions) + nodeUnknown400V1Beta2Condition := newFailedHealthCheckV1Beta2Condition(clusterv1.MachineHealthCheckUnhealthyNodeV1Beta2Reason, "Condition Ready on node is reporting status Unknown for more than %s", timeoutForUnhealthyConditions) // Target for when a node is healthy testNodeHealthy := newTestNode("node1") @@ -416,15 +420,17 @@ func TestHealthCheckTargets(t *testing.T) { Node: nil, } machineAnnotationRemediationCondition := newFailedHealthCheckCondition(clusterv1.HasRemediateMachineAnnotationReason, annotationRemediationMsg) + machineAnnotationRemediationV1Beta2Condition := newFailedHealthCheckV1Beta2Condition(clusterv1.MachineHealthCheckManuallyRemediatedV1Beta2Reason, annotationRemediationMsg) testCases := []struct { - desc string - targets []healthCheckTarget - timeoutForMachineToHaveNode *time.Duration - expectedHealthy []healthCheckTarget - expectedNeedsRemediation []healthCheckTarget - expectedNeedsRemediationCondition []clusterv1.Condition - expectedNextCheckTimes []time.Duration + desc string + targets []healthCheckTarget + timeoutForMachineToHaveNode *time.Duration + expectedHealthy []healthCheckTarget + expectedNeedsRemediation []healthCheckTarget + expectedNeedsRemediationCondition []clusterv1.Condition + expectedNeedsRemediationV1Beta2Condition []metav1.Condition + expectedNextCheckTimes []time.Duration }{ { desc: "when the node has not yet started for shorter than the timeout", @@ -441,20 +447,22 @@ func TestHealthCheckTargets(t *testing.T) { expectedNextCheckTimes: []time.Duration{timeoutForMachineToHaveNode - 50*time.Second}, }, { - desc: "when the node has not yet started for longer than the timeout", - targets: []healthCheckTarget{nodeNotYetStartedTarget1200s}, - expectedHealthy: []healthCheckTarget{}, - expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s}, - expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition}, - expectedNextCheckTimes: []time.Duration{}, + desc: "when the node has not yet started for longer than the timeout", + targets: []healthCheckTarget{nodeNotYetStartedTarget1200s}, + expectedHealthy: []healthCheckTarget{}, + expectedNeedsRemediation: []healthCheckTarget{nodeNotYetStartedTarget1200s}, + expectedNeedsRemediationCondition: []clusterv1.Condition{nodeNotYetStartedTarget1200sCondition}, + expectedNeedsRemediationV1Beta2Condition: []metav1.Condition{nodeNotYetStartedTarget1200sV1Beta2Condition}, + expectedNextCheckTimes: []time.Duration{}, }, { - desc: "when the node has gone away", - targets: []healthCheckTarget{nodeGoneAway}, - expectedHealthy: []healthCheckTarget{}, - expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway}, - expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition}, - expectedNextCheckTimes: []time.Duration{}, + desc: "when the node has gone away", + targets: []healthCheckTarget{nodeGoneAway}, + expectedHealthy: []healthCheckTarget{}, + expectedNeedsRemediation: []healthCheckTarget{nodeGoneAway}, + expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition}, + expectedNeedsRemediationV1Beta2Condition: []metav1.Condition{nodeGoneAwayV1Beta2Condition}, + expectedNextCheckTimes: []time.Duration{}, }, { desc: "when the node has been in an unknown state for shorter than the timeout", @@ -464,12 +472,13 @@ func TestHealthCheckTargets(t *testing.T) { expectedNextCheckTimes: []time.Duration{100 * time.Second}, }, { - desc: "when the node has been in an unknown state for longer than the timeout", - targets: []healthCheckTarget{nodeUnknown400}, - expectedHealthy: []healthCheckTarget{}, - expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400}, - expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition}, - expectedNextCheckTimes: []time.Duration{}, + desc: "when the node has been in an unknown state for longer than the timeout", + targets: []healthCheckTarget{nodeUnknown400}, + expectedHealthy: []healthCheckTarget{}, + expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400}, + expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition}, + expectedNeedsRemediationV1Beta2Condition: []metav1.Condition{nodeUnknown400V1Beta2Condition}, + expectedNextCheckTimes: []time.Duration{}, }, { desc: "when the node is healthy", @@ -479,12 +488,13 @@ func TestHealthCheckTargets(t *testing.T) { expectedNextCheckTimes: []time.Duration{}, }, { - desc: "with a mix of healthy and unhealthy nodes", - targets: []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy}, - expectedHealthy: []healthCheckTarget{nodeHealthy}, - expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400}, - expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition}, - expectedNextCheckTimes: []time.Duration{200 * time.Second, 100 * time.Second}, + desc: "with a mix of healthy and unhealthy nodes", + targets: []healthCheckTarget{nodeUnknown100, nodeUnknown200, nodeUnknown400, nodeHealthy}, + expectedHealthy: []healthCheckTarget{nodeHealthy}, + expectedNeedsRemediation: []healthCheckTarget{nodeUnknown400}, + expectedNeedsRemediationCondition: []clusterv1.Condition{nodeUnknown400Condition}, + expectedNeedsRemediationV1Beta2Condition: []metav1.Condition{nodeUnknown400V1Beta2Condition}, + expectedNextCheckTimes: []time.Duration{200 * time.Second, 100 * time.Second}, }, { desc: "when the node has not started for a long time but the startup timeout is disabled", @@ -511,20 +521,22 @@ func TestHealthCheckTargets(t *testing.T) { expectedNextCheckTimes: []time.Duration{}, }, { - desc: "when the machine is manually marked for remediation", - targets: []healthCheckTarget{machineAnnotationRemediation}, - expectedHealthy: []healthCheckTarget{}, - expectedNeedsRemediation: []healthCheckTarget{machineAnnotationRemediation}, - expectedNeedsRemediationCondition: []clusterv1.Condition{machineAnnotationRemediationCondition}, - expectedNextCheckTimes: []time.Duration{}, + desc: "when the machine is manually marked for remediation", + targets: []healthCheckTarget{machineAnnotationRemediation}, + expectedHealthy: []healthCheckTarget{}, + expectedNeedsRemediation: []healthCheckTarget{machineAnnotationRemediation}, + expectedNeedsRemediationCondition: []clusterv1.Condition{machineAnnotationRemediationCondition}, + expectedNeedsRemediationV1Beta2Condition: []metav1.Condition{machineAnnotationRemediationV1Beta2Condition}, + expectedNextCheckTimes: []time.Duration{}, }, { - desc: "health check with empty unhealthy conditions and missing node", - targets: []healthCheckTarget{nodeGoneAwayEmptyConditions}, - expectedHealthy: []healthCheckTarget{}, - expectedNeedsRemediation: []healthCheckTarget{nodeGoneAwayEmptyConditions}, - expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition}, - expectedNextCheckTimes: []time.Duration{}, + desc: "health check with empty unhealthy conditions and missing node", + targets: []healthCheckTarget{nodeGoneAwayEmptyConditions}, + expectedHealthy: []healthCheckTarget{}, + expectedNeedsRemediation: []healthCheckTarget{nodeGoneAwayEmptyConditions}, + expectedNeedsRemediationCondition: []clusterv1.Condition{nodeGoneAwayCondition}, + expectedNeedsRemediationV1Beta2Condition: []metav1.Condition{nodeGoneAwayV1Beta2Condition}, + expectedNextCheckTimes: []time.Duration{}, }, { desc: "health check with empty unhealthy conditions and node", @@ -574,6 +586,16 @@ func TestHealthCheckTargets(t *testing.T) { return out } + removeLastTransitionTimesV1Beta2 := func(in []metav1.Condition) []metav1.Condition { + out := []metav1.Condition{} + for _, c := range in { + withoutTime := c.DeepCopy() + withoutTime.LastTransitionTime = metav1.Time{} + out = append(out, *withoutTime) + } + return out + } + gs.Expect(healthy).To(ConsistOf(tc.expectedHealthy)) gs.Expect(unhealthy).To(ConsistOf(tc.expectedNeedsRemediation)) gs.Expect(nextCheckTimes).To(WithTransform(roundDurations, ConsistOf(tc.expectedNextCheckTimes))) @@ -582,6 +604,11 @@ func TestHealthCheckTargets(t *testing.T) { conditionsMatcher := WithTransform(removeLastTransitionTimes, ContainElements(expectedMachineCondition)) gs.Expect(actualConditions).To(conditionsMatcher) } + for i, expectedMachineCondition := range tc.expectedNeedsRemediationV1Beta2Condition { + actualConditions := unhealthy[i].Machine.GetV1Beta2Conditions() + conditionsMatcher := WithTransform(removeLastTransitionTimesV1Beta2, ContainElements(expectedMachineCondition)) + gs.Expect(actualConditions).To(conditionsMatcher) + } }) } } @@ -655,3 +682,12 @@ func newTestUnhealthyNode(name string, condition corev1.NodeConditionType, statu func newFailedHealthCheckCondition(reason string, messageFormat string, messageArgs ...interface{}) clusterv1.Condition { return *conditions.FalseCondition(clusterv1.MachineHealthCheckSucceededCondition, reason, clusterv1.ConditionSeverityWarning, messageFormat, messageArgs...) } + +func newFailedHealthCheckV1Beta2Condition(reason string, messageFormat string, messageArgs ...interface{}) metav1.Condition { + return metav1.Condition{ + Type: clusterv1.MachineHealthCheckSucceededV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: reason, + Message: fmt.Sprintf(messageFormat, messageArgs...), + } +} diff --git a/util/conditions/v1beta2/getter.go b/util/conditions/v1beta2/getter.go index 3d8364fde850..90bf0d61ee8c 100644 --- a/util/conditions/v1beta2/getter.go +++ b/util/conditions/v1beta2/getter.go @@ -55,6 +55,38 @@ func Get(sourceObj Getter, sourceConditionType string) *metav1.Condition { return meta.FindStatusCondition(sourceObj.GetV1Beta2Conditions(), sourceConditionType) } +// Has returns true if a condition with the given type exists. +func Has(from Getter, conditionType string) bool { + return Get(from, conditionType) != nil +} + +// IsTrue is true if the condition with the given type is True, otherwise it returns false +// if the condition is not True or if the condition does not exist (is nil). +func IsTrue(from Getter, conditionType string) bool { + if c := Get(from, conditionType); c != nil { + return c.Status == metav1.ConditionTrue + } + return false +} + +// IsFalse is true if the condition with the given type is False, otherwise it returns false +// if the condition is not False or if the condition does not exist (is nil). +func IsFalse(from Getter, conditionType string) bool { + if c := Get(from, conditionType); c != nil { + return c.Status == metav1.ConditionFalse + } + return false +} + +// IsUnknown is true if the condition with the given type is Unknown or if the condition +// does not exist (is nil). +func IsUnknown(from Getter, conditionType string) bool { + if c := Get(from, conditionType); c != nil { + return c.Status == metav1.ConditionUnknown + } + return true +} + // UnstructuredGet returns a condition from an Unstructured object. // // UnstructuredGet supports retrieving conditions from objects at different stages of the transition from diff --git a/util/conditions/v1beta2/setter.go b/util/conditions/v1beta2/setter.go index 9a6be20a3a90..c41b98b12332 100644 --- a/util/conditions/v1beta2/setter.go +++ b/util/conditions/v1beta2/setter.go @@ -97,3 +97,19 @@ func Set(targetObj Setter, condition metav1.Condition, opts ...SetOption) { targetObj.SetV1Beta2Conditions(conditions) } + +// Delete deletes the condition with the given type. +func Delete(to Setter, conditionType string) { + if to == nil { + return + } + + conditions := to.GetV1Beta2Conditions() + newConditions := make([]metav1.Condition, 0, len(conditions)-1) + for _, condition := range conditions { + if condition.Type != conditionType { + newConditions = append(newConditions, condition) + } + } + to.SetV1Beta2Conditions(newConditions) +}