diff --git a/api/v1beta1/cluster_types.go b/api/v1beta1/cluster_types.go index b573429366fa..bbce8f181311 100644 --- a/api/v1beta1/cluster_types.go +++ b/api/v1beta1/cluster_types.go @@ -45,6 +45,12 @@ const ( // ClusterAvailableV1Beta2Condition is true if the Cluster is not deleted, and RemoteConnectionProbe, InfrastructureReady, // ControlPlaneAvailable, WorkersAvailable, TopologyReconciled (if present) conditions are true. // If conditions are defined in spec.availabilityGates, those conditions must be true as well. + // Note: + // - When summarizing TopologyReconciled, all reasons except TopologyReconcileFailed and ClusterClassNotReconciled will + // be treated as info. This is because even if topology is not fully reconciled, this is an expected temporary state + // and it doesn't impact availability. + // - When summarizing InfrastructureReady, ControlPlaneAvailable, in case the Cluster is deleting, the absence of the + // referenced object won't be considered as an issue. ClusterAvailableV1Beta2Condition = AvailableV1Beta2Condition // ClusterAvailableV1Beta2Reason surfaces when the cluster availability criteria is met. @@ -270,6 +276,7 @@ const ( // Cluster's ControlPlaneMachinesUpToDate condition and corresponding reasons that will be used in v1Beta2 API version. const ( // ClusterControlPlaneMachinesUpToDateV1Beta2Condition surfaces details of control plane machines not up to date, if any. + // Note: New machines are considered 10s after machine creation. This gives time to the machine's owner controller to recognize the new machine and add the UpToDate condition. ClusterControlPlaneMachinesUpToDateV1Beta2Condition = "ControlPlaneMachinesUpToDate" // ClusterControlPlaneMachinesUpToDateV1Beta2Reason surfaces when all the control plane machine's UpToDate conditions are true. @@ -293,6 +300,7 @@ const ( // Cluster's WorkerMachinesUpToDate condition and corresponding reasons that will be used in v1Beta2 API version. 
const ( // ClusterWorkerMachinesUpToDateV1Beta2Condition surfaces details of worker machines not up to date, if any. + // Note: New machines are considered 10s after machine creation. This gives time to the machine's owner controller to recognize the new machine and add the UpToDate condition. ClusterWorkerMachinesUpToDateV1Beta2Condition = "WorkerMachinesUpToDate" // ClusterWorkerMachinesUpToDateV1Beta2Reason surfaces when all the worker machine's UpToDate conditions are true. diff --git a/api/v1beta1/machine_types.go b/api/v1beta1/machine_types.go index 2af03602e4be..fccc584fe5a4 100644 --- a/api/v1beta1/machine_types.go +++ b/api/v1beta1/machine_types.go @@ -104,11 +104,19 @@ const ( ) // Machine's Ready condition and corresponding reasons that will be used in v1Beta2 API version. -// Note: when possible, Ready condition will use reasons from the conditions it summarizes. const ( // MachineReadyV1Beta2Condition is true if the Machine's deletionTimestamp is not set, Machine's BootstrapConfigReady, InfrastructureReady, // NodeHealthy and HealthCheckSucceeded (if present) conditions are true; if other conditions are defined in spec.readinessGates, // these conditions must be true as well. + // Note: + // - When summarizing the Deleting condition: + // - Details about Pods stuck in draining or volumes waiting for detach are dropped, in order to improve readability & reduce flickering + // of the condition that bubbles up to the owning resources/ to the Cluster (it also makes it more likely this condition might be aggregated with + // conditions reported by other machines). + // - If deletion is in progress for more than 15m, this surfaces on the summary condition (hint about a possible stale deletion). + // - If drain is in progress for more than 5 minutes, a summary of what is blocking drain also surfaces in the message. 
+ // - When summarizing BootstrapConfigReady, InfrastructureReady, NodeHealthy, in case the Machine is deleting, the absence of the + // referenced object won't be considered as an issue. MachineReadyV1Beta2Condition = ReadyV1Beta2Condition // MachineReadyV1Beta2Reason surfaces when the machine readiness criteria is met. @@ -420,7 +428,11 @@ type MachineSpec struct { // Another example are external controllers, e.g. responsible to install special software/hardware on the Machines; // they can include the status of those components with a new condition and add this condition to ReadinessGates. // - // NOTE: this field is considered only for computing v1beta2 conditions. + // NOTE: This field is considered only for computing v1beta2 conditions. + // NOTE: In case readinessGates conditions start with the APIServer, ControllerManager, Scheduler prefix, and all those + // readiness gates condition are reporting the same message, when computing the Machine's Ready condition those + // readinessGates will be replaced by a single entry reporting "Control plane components: " + message. + // This helps to improve readability of conditions bubbling up to the Machine's owner resource / to the Cluster). // +optional // +listType=map // +listMapKey=conditionType diff --git a/api/v1beta1/machinedeployment_types.go b/api/v1beta1/machinedeployment_types.go index 90bcaba13a63..2828550efd6e 100644 --- a/api/v1beta1/machinedeployment_types.go +++ b/api/v1beta1/machinedeployment_types.go @@ -128,6 +128,7 @@ const ( // MachineDeployment's MachinesUpToDate condition and corresponding reasons that will be used in v1Beta2 API version. const ( // MachineDeploymentMachinesUpToDateV1Beta2Condition surfaces details of controlled machines not up to date, if any. + // Note: New machines are considered 10s after machine creation. This gives time to the machine's owner controller to recognize the new machine and add the UpToDate condition. 
MachineDeploymentMachinesUpToDateV1Beta2Condition = MachinesUpToDateV1Beta2Condition // MachineDeploymentMachinesUpToDateV1Beta2Reason surfaces when all the controlled machine's UpToDate conditions are true. diff --git a/api/v1beta1/machineset_types.go b/api/v1beta1/machineset_types.go index 648a6413bb31..1abbd906cd45 100644 --- a/api/v1beta1/machineset_types.go +++ b/api/v1beta1/machineset_types.go @@ -90,6 +90,7 @@ type MachineSetSpec struct { // MachineSet's ScalingUp condition and corresponding reasons that will be used in v1Beta2 API version. const ( // MachineSetScalingUpV1Beta2Condition is true if actual replicas < desired replicas. + // Note: In case a MachineSet preflight check is preventing scale up, this will surface in the condition message. MachineSetScalingUpV1Beta2Condition = ScalingUpV1Beta2Condition // MachineSetScalingUpV1Beta2Reason surfaces when actual replicas < desired replicas. @@ -153,6 +154,7 @@ const ( // Note: Reason's could also be derived from the aggregation of machine's MachinesUpToDate conditions. const ( // MachineSetMachinesUpToDateV1Beta2Condition surfaces details of controlled machines not up to date, if any. + // Note: New machines are considered 10s after machine creation. This gives time to the machine's owner controller to recognize the new machine and add the UpToDate condition. MachineSetMachinesUpToDateV1Beta2Condition = MachinesUpToDateV1Beta2Condition // MachineSetMachinesUpToDateV1Beta2Reason surfaces when all the controlled machine's UpToDate conditions are true. 
diff --git a/api/v1beta1/zz_generated.openapi.go b/api/v1beta1/zz_generated.openapi.go index a63208ec3357..3167c70da4e7 100644 --- a/api/v1beta1/zz_generated.openapi.go +++ b/api/v1beta1/zz_generated.openapi.go @@ -3951,7 +3951,7 @@ func schema_sigsk8sio_cluster_api_api_v1beta1_MachineSpec(ref common.ReferenceCa }, }, SchemaProps: spec.SchemaProps{ - Description: "readinessGates specifies additional conditions to include when evaluating Machine Ready condition.\n\nThis field can be used e.g. by Cluster API control plane providers to extend the semantic of the Ready condition for the Machine they control, like the kubeadm control provider adding ReadinessGates for the APIServerPodHealthy, SchedulerPodHealthy conditions, etc.\n\nAnother example are external controllers, e.g. responsible to install special software/hardware on the Machines; they can include the status of those components with a new condition and add this condition to ReadinessGates.\n\nNOTE: this field is considered only for computing v1beta2 conditions.", + Description: "readinessGates specifies additional conditions to include when evaluating Machine Ready condition.\n\nThis field can be used e.g. by Cluster API control plane providers to extend the semantic of the Ready condition for the Machine they control, like the kubeadm control provider adding ReadinessGates for the APIServerPodHealthy, SchedulerPodHealthy conditions, etc.\n\nAnother example are external controllers, e.g. responsible to install special software/hardware on the Machines; they can include the status of those components with a new condition and add this condition to ReadinessGates.\n\nNOTE: This field is considered only for computing v1beta2 conditions. 
NOTE: In case readinessGates conditions start with the APIServer, ControllerManager, Scheduler prefix, and all those readiness gates condition are reporting the same message, when computing the Machine's Ready condition those readinessGates will be replaced by a single entry reporting \"Control plane components: \" + message. This helps to improve readability of conditions bubbling up to the Machine's owner resource / to the Cluster).", Type: []string{"array"}, Items: &spec.SchemaOrArray{ Schema: &spec.Schema{ diff --git a/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml b/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml index 80ef8adff217..4ebcaf5c24c3 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinedeployments.yaml @@ -1508,7 +1508,11 @@ spec: Another example are external controllers, e.g. responsible to install special software/hardware on the Machines; they can include the status of those components with a new condition and add this condition to ReadinessGates. - NOTE: this field is considered only for computing v1beta2 conditions. + NOTE: This field is considered only for computing v1beta2 conditions. + NOTE: In case readinessGates conditions start with the APIServer, ControllerManager, Scheduler prefix, and all those + readiness gates condition are reporting the same message, when computing the Machine's Ready condition those + readinessGates will be replaced by a single entry reporting "Control plane components: " + message. + This helps to improve readability of conditions bubbling up to the Machine's owner resource / to the Cluster). items: description: MachineReadinessGate contains the type of a Machine condition to be used as a readiness gate. 
diff --git a/config/crd/bases/cluster.x-k8s.io_machinepools.yaml b/config/crd/bases/cluster.x-k8s.io_machinepools.yaml index ef4f4fc8e6c9..6c6bdae5252d 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinepools.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinepools.yaml @@ -1271,7 +1271,11 @@ spec: Another example are external controllers, e.g. responsible to install special software/hardware on the Machines; they can include the status of those components with a new condition and add this condition to ReadinessGates. - NOTE: this field is considered only for computing v1beta2 conditions. + NOTE: This field is considered only for computing v1beta2 conditions. + NOTE: In case readinessGates conditions start with the APIServer, ControllerManager, Scheduler prefix, and all those + readiness gates condition are reporting the same message, when computing the Machine's Ready condition those + readinessGates will be replaced by a single entry reporting "Control plane components: " + message. + This helps to improve readability of conditions bubbling up to the Machine's owner resource / to the Cluster). items: description: MachineReadinessGate contains the type of a Machine condition to be used as a readiness gate. diff --git a/config/crd/bases/cluster.x-k8s.io_machines.yaml b/config/crd/bases/cluster.x-k8s.io_machines.yaml index 82a59fe02e50..2f9a1db1e796 100644 --- a/config/crd/bases/cluster.x-k8s.io_machines.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machines.yaml @@ -1037,7 +1037,11 @@ spec: Another example are external controllers, e.g. responsible to install special software/hardware on the Machines; they can include the status of those components with a new condition and add this condition to ReadinessGates. - NOTE: this field is considered only for computing v1beta2 conditions. + NOTE: This field is considered only for computing v1beta2 conditions. 
+ NOTE: In case readinessGates conditions start with the APIServer, ControllerManager, Scheduler prefix, and all those + readiness gates condition are reporting the same message, when computing the Machine's Ready condition those + readinessGates will be replaced by a single entry reporting "Control plane components: " + message. + This helps to improve readability of conditions bubbling up to the Machine's owner resource / to the Cluster). items: description: MachineReadinessGate contains the type of a Machine condition to be used as a readiness gate. diff --git a/config/crd/bases/cluster.x-k8s.io_machinesets.yaml b/config/crd/bases/cluster.x-k8s.io_machinesets.yaml index f565c15262a1..04c517dee597 100644 --- a/config/crd/bases/cluster.x-k8s.io_machinesets.yaml +++ b/config/crd/bases/cluster.x-k8s.io_machinesets.yaml @@ -1246,7 +1246,11 @@ spec: Another example are external controllers, e.g. responsible to install special software/hardware on the Machines; they can include the status of those components with a new condition and add this condition to ReadinessGates. - NOTE: this field is considered only for computing v1beta2 conditions. + NOTE: This field is considered only for computing v1beta2 conditions. + NOTE: In case readinessGates conditions start with the APIServer, ControllerManager, Scheduler prefix, and all those + readiness gates condition are reporting the same message, when computing the Machine's Ready condition those + readinessGates will be replaced by a single entry reporting "Control plane components: " + message. + This helps to improve readability of conditions bubbling up to the Machine's owner resource / to the Cluster). items: description: MachineReadinessGate contains the type of a Machine condition to be used as a readiness gate. 
diff --git a/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go b/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go index b6aec2682458..cbae8525ef95 100644 --- a/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go +++ b/controlplane/kubeadm/api/v1beta1/v1beta2_condition_consts.go @@ -30,6 +30,9 @@ const ( // plane cannot be considered operational (if etcd is not operational on a machine, most likely also API server, // scheduler and controller manager on the same machine will be impacted). // - In case of external etcd, KCP cannot make any assumption on etcd status, so all the etcd checks are skipped. + // + // Please note that when this condition is true, partial unavailability will be surfaced in the condition message, + // but with a 10s delay to ensure flakes do not impact condition stability. KubeadmControlPlaneAvailableV1Beta2Condition = clusterv1.AvailableV1Beta2Condition // KubeadmControlPlaneAvailableInspectionFailedV1Beta2Reason documents a failure when inspecting the status of the @@ -157,6 +160,7 @@ const ( // KubeadmControlPlane's MachinesUpToDate condition and corresponding reasons that will be used in v1Beta2 API version. const ( // KubeadmControlPlaneMachinesUpToDateV1Beta2Condition surfaces details of controlled machines not up to date, if any. + // Note: New machines are considered 10s after machine creation. This gives time to the machine's owner controller to recognize the new machine and add the UpToDate condition. KubeadmControlPlaneMachinesUpToDateV1Beta2Condition = clusterv1.MachinesUpToDateV1Beta2Condition // KubeadmControlPlaneMachinesUpToDateV1Beta2Reason surfaces when all the controlled machine's UpToDate conditions are true. @@ -191,6 +195,7 @@ const ( // KubeadmControlPlane's ScalingUp condition and corresponding reasons that will be used in v1Beta2 API version. const ( // KubeadmControlPlaneScalingUpV1Beta2Condition is true if actual replicas < desired replicas. 
+ // Note: In case a KubeadmControlPlane preflight check is preventing scale up, this will surface in the condition message. KubeadmControlPlaneScalingUpV1Beta2Condition = clusterv1.ScalingUpV1Beta2Condition // KubeadmControlPlaneScalingUpV1Beta2Reason surfaces when actual replicas < desired replicas. @@ -207,6 +212,7 @@ const ( // KubeadmControlPlane's ScalingDown condition and corresponding reasons that will be used in v1Beta2 API version. const ( // KubeadmControlPlaneScalingDownV1Beta2Condition is true if actual replicas > desired replicas. + // Note: In case a KubeadmControlPlane preflight check is preventing scale down, this will surface in the condition message. KubeadmControlPlaneScalingDownV1Beta2Condition = clusterv1.ScalingDownV1Beta2Condition // KubeadmControlPlaneScalingDownV1Beta2Reason surfaces when actual replicas > desired replicas. diff --git a/controlplane/kubeadm/internal/controllers/controller.go b/controlplane/kubeadm/internal/controllers/controller.go index 6ffc52c1920b..6fd32914525b 100644 --- a/controlplane/kubeadm/internal/controllers/controller.go +++ b/controlplane/kubeadm/internal/controllers/controller.go @@ -977,6 +977,7 @@ func reconcileMachineUpToDateCondition(_ context.Context, controlPlane *internal for _, machine := range controlPlane.Machines { if machinesNotUptoDateNames.Has(machine.Name) { + // Note: the code computing the message for KCP's RollingOut condition is making assumptions on the format/content of this message. 
message := "" if reasons, ok := machinesNotUptoDateConditionMessages[machine.Name]; ok { for i := range reasons { diff --git a/controlplane/kubeadm/internal/filters.go b/controlplane/kubeadm/internal/filters.go index 65a788e62dbb..8eef8c646a25 100644 --- a/controlplane/kubeadm/internal/filters.go +++ b/controlplane/kubeadm/internal/filters.go @@ -50,6 +50,7 @@ func matchesMachineSpec(infraConfigs map[string]*unstructured.Unstructured, mach machineVersion = *machine.Spec.Version } logMessages = append(logMessages, fmt.Sprintf("Machine version %q is not equal to KCP version %q", machineVersion, kcp.Spec.Version)) + // Note: the code computing the message for KCP's RollingOut condition is making assumptions on the format/content of this message. conditionMessages = append(conditionMessages, fmt.Sprintf("Version %s, %s required", machineVersion, kcp.Spec.Version)) } diff --git a/internal/controllers/machine/drain/drain.go b/internal/controllers/machine/drain/drain.go index 802f1448f9c3..354d6504ea7d 100644 --- a/internal/controllers/machine/drain/drain.go +++ b/internal/controllers/machine/drain/drain.go @@ -452,6 +452,8 @@ func (r EvictionResult) ConditionMessage(nodeDrainStartTime *metav1.Time) string if len(r.PodsDeletionTimestampSet) > 1 { kind = "Pods" } + // Note: the code computing stale warning for the machine deleting condition is making assumptions on the format/content of this message. + // Same applies for other conditions where deleting is involved, e.g. MachineSet's Deleting and ScalingDown condition. conditionMessage = fmt.Sprintf("%s\n* %s %s: deletionTimestamp set, but still not removed from the Node", conditionMessage, kind, PodListToString(r.PodsDeletionTimestampSet, 3)) } @@ -470,6 +472,8 @@ func (r EvictionResult) ConditionMessage(nodeDrainStartTime *metav1.Time) string if len(pods) > 1 { kind = "Pods" } + // Note: the code computing stale warning for the machine deleting condition is making assumptions on the format/content of this message. 
+ // Same applies for other conditions where deleting is involved, e.g. MachineSet's Deleting and ScalingDown condition. failureMessage = strings.Replace(failureMessage, "Cannot evict pod as it would violate the pod's disruption budget.", "cannot evict pod as it would violate the pod's disruption budget.", -1) if !strings.HasPrefix(failureMessage, "cannot evict pod as it would violate the pod's disruption budget.") { failureMessage = "failed to evict Pod, " + failureMessage diff --git a/internal/controllers/machinedeployment/mdutil/util.go b/internal/controllers/machinedeployment/mdutil/util.go index 8451038bd0fe..1519b3dbefa3 100644 --- a/internal/controllers/machinedeployment/mdutil/util.go +++ b/internal/controllers/machinedeployment/mdutil/util.go @@ -379,6 +379,7 @@ func MachineTemplateUpToDate(current, desired *clusterv1.MachineTemplateSpec) (u if !reflect.DeepEqual(currentCopy.Spec.Version, desiredCopy.Spec.Version) { logMessages = append(logMessages, fmt.Sprintf("spec.version %s, %s required", ptr.Deref(currentCopy.Spec.Version, "nil"), ptr.Deref(desiredCopy.Spec.Version, "nil"))) + // Note: the code computing the message for MachineDeployment's RollingOut condition is making assumptions on the format/content of this message. 
conditionMessages = append(conditionMessages, fmt.Sprintf("Version %s, %s required", ptr.Deref(currentCopy.Spec.Version, "nil"), ptr.Deref(desiredCopy.Spec.Version, "nil"))) } diff --git a/internal/controllers/machineset/machineset_controller.go b/internal/controllers/machineset/machineset_controller.go index 715c6c00a677..e46833eb77d5 100644 --- a/internal/controllers/machineset/machineset_controller.go +++ b/internal/controllers/machineset/machineset_controller.go @@ -645,9 +645,10 @@ func newMachineUpToDateCondition(s *scope) *metav1.Condition { conditionMessages[i] = fmt.Sprintf("* %s", conditionMessages[i]) } return &metav1.Condition{ - Type: clusterv1.MachineUpToDateV1Beta2Condition, - Status: metav1.ConditionFalse, - Reason: clusterv1.MachineNotUpToDateV1Beta2Reason, + Type: clusterv1.MachineUpToDateV1Beta2Condition, + Status: metav1.ConditionFalse, + Reason: clusterv1.MachineNotUpToDateV1Beta2Reason, + // Note: the code computing the message for MachineDeployment's RollingOut condition is making assumptions on the format/content of this message. Message: strings.Join(conditionMessages, "\n"), } }