Skip to content

Commit

Permalink
Update MHC with v1Beta2 status
Browse files Browse the repository at this point in the history
  • Loading branch information
fabriziopandini committed Oct 15, 2024
1 parent 2beebbd commit a1ebe62
Show file tree
Hide file tree
Showing 9 changed files with 673 additions and 421 deletions.
54 changes: 51 additions & 3 deletions api/v1beta1/machine_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,22 +200,70 @@ const (
// Note: this could happen when creating the machine. However, this state should be treated as an error if it lasts indefinitely.
MachineNodeDoesNotExistV1Beta2Reason = ObjectDoesNotExistV1Beta2Reason

// MachineNodeDeletedV1Beta2Reason surfaces when the node hosted on the machine has been deleted.
// MachineNodeDeletedV1Beta2Reason surfaces when the node hosted on the machine has been deleted.
// Note: controllers can't identify if the Node was deleted by the controller itself, e.g.
// during the deletion workflow, or by a user.
MachineNodeDeletedV1Beta2Reason = ObjectDeletedV1Beta2Reason
)

// Machine's HealthCheckSucceeded and OwnerRemediated conditions and corresponding reasons that will be used in v1Beta2 API version.
// Note: HealthCheckSucceeded and OwnerRemediated condition are set by the MachineHealthCheck controller.
// Machine's HealthCheckSucceeded condition and corresponding reasons that will be used in v1Beta2 API version.
// Note: the HealthCheckSucceeded condition is set by the MachineHealthCheck controller.
const (
// MachineHealthCheckSucceededV1Beta2Condition is true if MHC instances targeting this machine report the Machine
// is healthy according to the definition of healthy present in the spec of the MachineHealthCheck object.
MachineHealthCheckSucceededV1Beta2Condition = "HealthCheckSucceeded"

// MachineHealthCheckPassedV1Beta2Reason surfaces when a machine passes all the health checks defined by a MachineHealthCheck object.
MachineHealthCheckPassedV1Beta2Reason = "HealthCheckPassed"

// MachineHealthCheckUnhealthyNodeV1Beta2Reason surfaces when the node hosted on the machine does not pass the health checks
// defined by a MachineHealthCheck object.
MachineHealthCheckUnhealthyNodeV1Beta2Reason = "UnhealthyNode"

// MachineHealthCheckNodeStartupTimeoutV1Beta2Reason surfaces when the node hosted on the machine does not appear within
// the timeout defined by a MachineHealthCheck object.
MachineHealthCheckNodeStartupTimeoutV1Beta2Reason = "NodeStartupTimeout"

// MachineHealthCheckNodeDeletedV1Beta2Reason surfaces when a MachineHealthCheck detects that the node hosted on the
// machine has been deleted while the Machine is still running.
MachineHealthCheckNodeDeletedV1Beta2Reason = "NodeDeleted"

// MachineHealthCheckManuallyRemediatedV1Beta2Reason surfaces when a MachineHealthCheck detects a machine that was
// manually remediated via the remediate-machine annotation.
MachineHealthCheckManuallyRemediatedV1Beta2Reason = "ManuallyRemediated"
)

// Machine's OwnerRemediated condition and corresponding reasons that will be used in v1Beta2 API version.
// Note: the OwnerRemediated condition is created by the MachineHealthCheck controller, and then set by the Machine's owner controller.
const (
// MachineOwnerRemediatedV1Beta2Condition is only present if MHC instances targeting this machine
// determine that the controller owning this machine should perform remediation.
MachineOwnerRemediatedV1Beta2Condition = "OwnerRemediated"

// MachineOwnerRemediatedWaitingForOwnerV1Beta2Reason surfaces when the machine is waiting for the owner controller
// to start remediation.
MachineOwnerRemediatedWaitingForOwnerV1Beta2Reason = "WaitingForOwner"
)

// Machine's ExternallyRemediated condition and corresponding reasons that will be used in v1Beta2 API version.
// Note: the ExternallyRemediated condition is created by the MachineHealthCheck controller, and then set by the external
// remediation controller.
const (
// MachineExternallyRemediatedV1Beta2Condition is only present if MHC instances targeting this machine
// determine that an external controller should perform remediation.
MachineExternallyRemediatedV1Beta2Condition = "ExternallyRemediated"

// MachineExternallyRemediatedWaitingForExternalRemediationV1Beta2Reason surfaces when the machine is waiting for the
// external remediation controller to start remediation.
MachineExternallyRemediatedWaitingForExternalRemediationV1Beta2Reason = "WaitingForExternalRemediation"

// MachineOwnerRemediateExternalRemediationTemplateNotFoundV1Beta2Reason surfaces that the MachineHealthCheck cannot
// find the template for an external remediation request.
// NOTE(review): this reason is declared in the ExternallyRemediated group but carries the MachineOwnerRemediate
// prefix; the exported name is left unchanged here — confirm whether a rename is intended.
MachineOwnerRemediateExternalRemediationTemplateNotFoundV1Beta2Reason = "ExternalRemediationTemplateNotFound"

// MachineOwnerRemediateExternalRemediationRequestCreationFailedV1Beta2Reason surfaces that the MachineHealthCheck cannot
// create a request for the external remediation controller.
// NOTE(review): same naming remark as above — the MachineOwnerRemediate prefix does not match the
// ExternallyRemediated group this reason is declared in.
MachineOwnerRemediateExternalRemediationRequestCreationFailedV1Beta2Reason = "ExternalRemediationRequestCreationFailed"
)

// Machine's Deleting condition and corresponding reasons that will be used in v1Beta2 API version.
Expand Down
21 changes: 21 additions & 0 deletions api/v1beta1/machinehealthcheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,27 @@ import (
"k8s.io/apimachinery/pkg/util/intstr"
)

// MachineHealthCheck's RemediationAllowed condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// MachineHealthCheckRemediationAllowedV1Beta2Condition surfaces whether the MachineHealthCheck is
// allowed to remediate any Machines or whether it is blocked from remediating any further.
MachineHealthCheckRemediationAllowedV1Beta2Condition = "RemediationAllowed"

// MachineHealthCheckTooManyUnhealthyV1Beta2Reason is the reason used when too many Machines are unhealthy and
// the MachineHealthCheck is blocked from making any further remediation.
MachineHealthCheckTooManyUnhealthyV1Beta2Reason = "TooManyUnhealthy"

// MachineHealthCheckRemediationAllowedV1Beta2Reason is the reason used when the number of unhealthy machines
// is within the limits defined by the MachineHealthCheck, and thus remediation is allowed.
MachineHealthCheckRemediationAllowedV1Beta2Reason = "RemediationAllowed"
)

// MachineHealthCheck's Paused condition and corresponding reasons that will be used in v1Beta2 API version.
const (
// MachineHealthCheckPausedV1Beta2Condition is true if this MachineHealthCheck or the Cluster it belongs to is paused.
// It is an alias of PausedV1Beta2Condition.
MachineHealthCheckPausedV1Beta2Condition = PausedV1Beta2Condition
)

var (
// DefaultNodeStartupTimeout is the time allowed for a node to start up.
// Can be made longer as part of spec if required for particular provider.
Expand Down
10 changes: 0 additions & 10 deletions api/v1beta1/v1beta2_condition_consts.go
Original file line number Diff line number Diff line change
Expand Up @@ -225,16 +225,6 @@ const (
ClusterPausedV1Beta2Condition = PausedV1Beta2Condition
)

// Conditions that will be used for the MachineHealthCheck object in v1Beta2 API version.
const (
// MachineHealthCheckRemediationAllowedV1Beta2Condition surfaces whether the MachineHealthCheck is
// allowed to remediate any Machines or whether it is blocked from remediating any further.
MachineHealthCheckRemediationAllowedV1Beta2Condition = "RemediationAllowed"

// MachineHealthCheckPausedV1Beta2Condition is true if this MachineHealthCheck or the Cluster it belongs to is paused.
MachineHealthCheckPausedV1Beta2Condition = PausedV1Beta2Condition
)

// Conditions that will be used for the ClusterClass object in v1Beta2 API version.
const (
// ClusterClassVariablesReadyV1Beta2Condition is true if the ClusterClass variables, including both inline and external
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ import (
"sigs.k8s.io/cluster-api/util"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/conditions"
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
"sigs.k8s.io/cluster-api/util/patch"
"sigs.k8s.io/cluster-api/util/predicates"
)
Expand Down Expand Up @@ -279,6 +280,13 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster
Message: message,
})

v1beta2conditions.Set(m, metav1.Condition{
Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineHealthCheckTooManyUnhealthyV1Beta2Reason,
Message: message,
})

// If there are no unhealthy targets, skip publishing the `RemediationRestricted` event to avoid misleading users.
if len(unhealthy) != 0 {
r.recorder.Event(
Expand Down Expand Up @@ -321,6 +329,12 @@ func (r *Reconciler) reconcile(ctx context.Context, logger logr.Logger, cluster
m.Status.RemediationsAllowed = remediationCount
conditions.MarkTrue(m, clusterv1.RemediationAllowedCondition)

v1beta2conditions.Set(m, metav1.Condition{
Type: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.MachineHealthCheckRemediationAllowedV1Beta2Reason,
})

errList := r.patchUnhealthyTargets(ctx, logger, unhealthy, cluster, m)
errList = append(errList, r.patchHealthyTargets(ctx, logger, healthy, m)...)

Expand Down Expand Up @@ -399,6 +413,13 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg
from, err := external.Get(ctx, r.Client, m.Spec.RemediationTemplate, t.Machine.Namespace)
if err != nil {
conditions.MarkFalse(m, clusterv1.ExternalRemediationTemplateAvailableCondition, clusterv1.ExternalRemediationTemplateNotFoundReason, clusterv1.ConditionSeverityError, err.Error())

v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediateExternalRemediationTemplateNotFoundV1Beta2Reason,
Message: fmt.Sprintf("error retrieving remediation template %s %s", m.Spec.RemediationTemplate.Kind, klog.KRef(t.Machine.Namespace, m.Spec.RemediationTemplate.Name)),
})
errList = append(errList, errors.Wrapf(err, "error retrieving remediation template %v %q for machine %q in namespace %q within cluster %q", m.Spec.RemediationTemplate.GroupVersionKind(), m.Spec.RemediationTemplate.Name, t.Machine.Name, t.Machine.Namespace, m.Spec.ClusterName))
return errList
}
Expand Down Expand Up @@ -428,16 +449,36 @@ func (r *Reconciler) patchUnhealthyTargets(ctx context.Context, logger logr.Logg
// Create the external clone.
if err := r.Client.Create(ctx, to); err != nil {
conditions.MarkFalse(m, clusterv1.ExternalRemediationRequestAvailableCondition, clusterv1.ExternalRemediationRequestCreationFailedReason, clusterv1.ConditionSeverityError, err.Error())

v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediateExternalRemediationRequestCreationFailedV1Beta2Reason,
})
errList = append(errList, errors.Wrapf(err, "error creating remediation request for machine %q in namespace %q within cluster %q", t.Machine.Name, t.Machine.Namespace, t.Machine.Spec.ClusterName))
return errList
}

v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineExternallyRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineExternallyRemediatedWaitingForExternalRemediationV1Beta2Reason,
})
} else {
logger.Info("Target has failed health check, marking for remediation", "target", t.string(), "reason", condition.Reason, "message", condition.Message)
// NOTE: MHC is responsible for creating MachineOwnerRemediatedCondition if missing, or for triggering another remediation if the previous one is completed;
// instead, if a remediation is already in progress, the remediation owner is responsible for completing the process and MHC should not overwrite the condition.
if !conditions.Has(t.Machine, clusterv1.MachineOwnerRemediatedCondition) || conditions.IsTrue(t.Machine, clusterv1.MachineOwnerRemediatedCondition) {
conditions.MarkFalse(t.Machine, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
}

if ownerRemediatedCondition := v1beta2conditions.Get(t.Machine, clusterv1.MachineOwnerRemediatedV1Beta2Condition); ownerRemediatedCondition == nil || ownerRemediatedCondition.Status == metav1.ConditionTrue {
v1beta2conditions.Set(t.Machine, metav1.Condition{
Type: clusterv1.MachineOwnerRemediatedV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.MachineOwnerRemediatedWaitingForOwnerV1Beta2Reason,
})
}
}
}

Expand Down
Loading

0 comments on commit a1ebe62

Please sign in to comment.