From 1b69b020446c059a72dd358f6a1a1fa2e1aedc2c Mon Sep 17 00:00:00 2001 From: k8s-infra-cherrypick-robot <90416843+k8s-infra-cherrypick-robot@users.noreply.github.com> Date: Fri, 20 Dec 2024 02:24:07 -0800 Subject: [PATCH] =?UTF-8?q?[release-1.9]=20=F0=9F=8C=B1=20Improve=20KCP=20?= =?UTF-8?q?scale=20up=20when=20using=20failure=20domains=20(#11604)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Improve KCP scale up when using failure domains * Address comments * Address feedback --------- Co-authored-by: fabriziopandini --- .../kubeadm/internal/control_plane.go | 27 +- .../kubeadm/internal/controllers/scale.go | 28 +- util/failuredomains/failure_domains.go | 112 ++- util/failuredomains/failure_domains_test.go | 729 +++++++++++++++++- 4 files changed, 822 insertions(+), 74 deletions(-) diff --git a/controlplane/kubeadm/internal/control_plane.go b/controlplane/kubeadm/internal/control_plane.go index 5887ab582e71..f79dee1d8ddd 100644 --- a/controlplane/kubeadm/internal/control_plane.go +++ b/controlplane/kubeadm/internal/control_plane.go @@ -153,9 +153,10 @@ func (c *ControlPlane) FailureDomains() clusterv1.FailureDomains { } // MachineInFailureDomainWithMostMachines returns the first matching failure domain with machines that has the most control-plane machines on it. -func (c *ControlPlane) MachineInFailureDomainWithMostMachines(ctx context.Context, machines collections.Machines) (*clusterv1.Machine, error) { - fd := c.FailureDomainWithMostMachines(ctx, machines) - machinesInFailureDomain := machines.Filter(collections.InFailureDomains(fd)) +// Note: if there are eligibleMachines machines in failure domain that do not exists anymore, getting rid of those machines take precedence. +func (c *ControlPlane) MachineInFailureDomainWithMostMachines(ctx context.Context, eligibleMachines collections.Machines) (*clusterv1.Machine, error) { + fd := c.FailureDomainWithMostMachines(ctx, eligibleMachines) + machinesInFailureDomain := eligibleMachines.Filter(collections.InFailureDomains(fd)) machineToMark := machinesInFailureDomain.Oldest() if machineToMark == nil { return nil, errors.New("failed to pick control plane Machine to mark for deletion") @@ -171,11 +172,11 @@ func (c *ControlPlane) MachineWithDeleteAnnotation(machines collections.Machines return annotatedMachines } -// FailureDomainWithMostMachines returns a fd which exists both in machines and control-plane machines and has the most -// control-plane machines on it. -func (c *ControlPlane) FailureDomainWithMostMachines(ctx context.Context, machines collections.Machines) *string { +// FailureDomainWithMostMachines returns the fd with most machines in it and at least one eligible machine in it. +// Note: if there are eligibleMachines machines in failure domain that do not exist anymore, cleaning up those failure domains takes precedence. +func (c *ControlPlane) FailureDomainWithMostMachines(ctx context.Context, eligibleMachines collections.Machines) *string { // See if there are any Machines that are not in currently defined failure domains first. - notInFailureDomains := machines.Filter( + notInFailureDomains := eligibleMachines.Filter( collections.Not(collections.InFailureDomains(c.FailureDomains().FilterControlPlane().GetIDs()...)), ) if len(notInFailureDomains) > 0 { @@ -184,15 +185,21 @@ func (c *ControlPlane) FailureDomainWithMostMachines(ctx context.Context, machin // in the cluster status. return notInFailureDomains.Oldest().Spec.FailureDomain } - return failuredomains.PickMost(ctx, c.Cluster.Status.FailureDomains.FilterControlPlane(), c.Machines, machines) + + // Pick the failure domain with most machines in it and at least one eligible machine in it. + return failuredomains.PickMost(ctx, c.Cluster.Status.FailureDomains.FilterControlPlane(), c.Machines, eligibleMachines) } -// NextFailureDomainForScaleUp returns the failure domain with the fewest number of up-to-date, not deleted machines. +// NextFailureDomainForScaleUp returns the failure domain with the fewest number of up-to-date, not deleted machines +// (the ultimate goal is to achieve ideal spreading of machines at stable state/when only up-to-date machines will exist). +// +// In case of tie (more failure domain with the same number of up-to-date, not deleted machines) the failure domain with the fewest number of +// machine overall is picked to ensure a better spreading of machines while the rollout is performed. func (c *ControlPlane) NextFailureDomainForScaleUp(ctx context.Context) (*string, error) { if len(c.Cluster.Status.FailureDomains.FilterControlPlane()) == 0 { return nil, nil } - return failuredomains.PickFewest(ctx, c.FailureDomains().FilterControlPlane(), c.UpToDateMachines().Filter(collections.Not(collections.HasDeletionTimestamp))), nil + return failuredomains.PickFewest(ctx, c.FailureDomains().FilterControlPlane(), c.Machines, c.UpToDateMachines().Filter(collections.Not(collections.HasDeletionTimestamp))), nil } // InitialControlPlaneConfig returns a new KubeadmConfigSpec that is to be used for an initializing control plane. diff --git a/controlplane/kubeadm/internal/controllers/scale.go b/controlplane/kubeadm/internal/controllers/scale.go index aa3a8190408b..cc41cdfa3e15 100644 --- a/controlplane/kubeadm/internal/controllers/scale.go +++ b/controlplane/kubeadm/internal/controllers/scale.go @@ -248,17 +248,31 @@ func preflightCheckCondition(kind string, obj conditions.Getter, condition clust return nil } +// selectMachineForScaleDown select a machine candidate for scaling down. The selection is a two phase process: +// +// In the first phase it selects a subset of machines eligible for deletion: +// - if there are outdated machines with the delete machine annotation, use them as eligible subset (priority to user requests, part 1) +// - if there are machines (also not outdated) with the delete machine annotation, use them (priority to user requests, part 2) +// - if there are outdated machines with unhealthy control plane components, use them (priority to restore control plane health) +// - if there are outdated machines consider all the outdated machines as eligible subset (rollout) +// - otherwise consider all the machines +// +// Once the subset of machines eligible for deletion is identified, one machine is picked out of this subset by +// selecting the machine in the failure domain with most machines (including both eligible and not eligible machines). func selectMachineForScaleDown(ctx context.Context, controlPlane *internal.ControlPlane, outdatedMachines collections.Machines) (*clusterv1.Machine, error) { - machines := controlPlane.Machines + // Select the subset of machines eligible for scale down. + eligibleMachines := controlPlane.Machines switch { case controlPlane.MachineWithDeleteAnnotation(outdatedMachines).Len() > 0: - machines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines) - case controlPlane.MachineWithDeleteAnnotation(machines).Len() > 0: - machines = controlPlane.MachineWithDeleteAnnotation(machines) + eligibleMachines = controlPlane.MachineWithDeleteAnnotation(outdatedMachines) + case controlPlane.MachineWithDeleteAnnotation(eligibleMachines).Len() > 0: + eligibleMachines = controlPlane.MachineWithDeleteAnnotation(eligibleMachines) case controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines).Len() > 0: - machines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines) + eligibleMachines = controlPlane.UnhealthyMachinesWithUnhealthyControlPlaneComponents(outdatedMachines) case outdatedMachines.Len() > 0: - machines = outdatedMachines + eligibleMachines = outdatedMachines } - return controlPlane.MachineInFailureDomainWithMostMachines(ctx, machines) + + // Pick an eligible machine from the failure domain with most machines in (including both eligible and not eligible machines) + return controlPlane.MachineInFailureDomainWithMostMachines(ctx, eligibleMachines) } diff --git a/util/failuredomains/failure_domains.go b/util/failuredomains/failure_domains.go index 50b3d87a604c..ffe6258a5474 100644 --- a/util/failuredomains/failure_domains.go +++ b/util/failuredomains/failure_domains.go @@ -30,8 +30,9 @@ import ( ) type failureDomainAggregation struct { - id string - count int + id string + countPriority int + countAll int } type failureDomainAggregations []failureDomainAggregation @@ -43,7 +44,27 @@ func (f failureDomainAggregations) Len() int { // Less reports whether the element with // index i should sort before the element with index j. func (f failureDomainAggregations) Less(i, j int) bool { - return f[i].count < f[j].count + // If a failure domain has less priority machines then the other, it goes first + if f[i].countPriority < f[j].countPriority { + return true + } + if f[i].countPriority > f[j].countPriority { + return false + } + + // If a failure domain has the same number of priority machines then the other, + // use the number of overall machines to pick which one goes first. + if f[i].countAll < f[j].countAll { + return true + } + if f[i].countAll > f[j].countAll { + return false + } + + // If both failure domain have the same number of priority machines and overall machines, we keep the order + // in the list which ensure a certain degree of randomness because the list originates from a map. + // This helps to spread machines e.g. when concurrently working on many clusters. + return i < j } // Swap swaps the elements with indexes i and j. @@ -51,36 +72,29 @@ func (f failureDomainAggregations) Swap(i, j int) { f[i], f[j] = f[j], f[i] } -// PickMost returns a failure domain that is in machines and has most of the group of machines on. -func PickMost(ctx context.Context, failureDomains clusterv1.FailureDomains, groupMachines, machines collections.Machines) *string { - // orderDescending sorts failure domains according to all machines belonging to the group. - fds := orderDescending(ctx, failureDomains, groupMachines) - for _, fd := range fds { - for _, m := range machines { - if m.Spec.FailureDomain == nil { - continue - } - if *m.Spec.FailureDomain == fd.id { - return &fd.id - } - } - } - return nil -} - -// orderDescending returns the sorted failure domains in decreasing order. -func orderDescending(ctx context.Context, failureDomains clusterv1.FailureDomains, machines collections.Machines) failureDomainAggregations { - aggregations := pick(ctx, failureDomains, machines) +// PickMost returns the failure domain from which we have to delete a control plane machine, which is the failure domain with most machines and at least one eligible machine in it. +func PickMost(ctx context.Context, failureDomains clusterv1.FailureDomains, allMachines, eligibleMachines collections.Machines) *string { + aggregations := countByFailureDomain(ctx, failureDomains, allMachines, eligibleMachines) if len(aggregations) == 0 { return nil } sort.Sort(sort.Reverse(aggregations)) - return aggregations + if len(aggregations) > 0 && aggregations[0].countPriority > 0 { + return ptr.To(aggregations[0].id) + } + return nil } -// PickFewest returns the failure domain with the fewest number of machines. -func PickFewest(ctx context.Context, failureDomains clusterv1.FailureDomains, machines collections.Machines) *string { - aggregations := pick(ctx, failureDomains, machines) +// PickFewest returns the failure domain that will be used for placement of a new control plane machine, which is the failure domain with the fewest +// number of up-to-date, not deleted machines. +// +// Ensuring proper spreading of up-to-date, not deleted machines, is the highest priority to achieve ideal spreading of machines +// at stable state/when only up-to-date machines will exist. +// +// In case of tie (more failure domain with the same number of up-to-date, not deleted machines) the failure domain with the fewest number of +// machine overall is picked to ensure a better spreading of machines while the rollout is performed. +func PickFewest(ctx context.Context, failureDomains clusterv1.FailureDomains, allMachines, upToDateMachines collections.Machines) *string { + aggregations := countByFailureDomain(ctx, failureDomains, allMachines, upToDateMachines) if len(aggregations) == 0 { return nil } @@ -88,22 +102,29 @@ func PickFewest(ctx context.Context, failureDomains clusterv1.FailureDomains, ma return ptr.To(aggregations[0].id) } -func pick(ctx context.Context, failureDomains clusterv1.FailureDomains, machines collections.Machines) failureDomainAggregations { +// countByFailureDomain returns failure domains with the number of machines in it. +// Note: countByFailureDomain computes both the number of machines as well as the number of a subset of machines with higher priority. +// E.g. for deletion out of date machines have higher priority vs other machines. +func countByFailureDomain(ctx context.Context, failureDomains clusterv1.FailureDomains, allMachines, priorityMachines collections.Machines) failureDomainAggregations { log := ctrl.LoggerFrom(ctx) if len(failureDomains) == 0 { return failureDomainAggregations{} } - counters := map[string]int{} + counters := map[string]failureDomainAggregation{} // Initialize the known failure domain keys to find out if an existing machine is in an unsupported failure domain. - for fd := range failureDomains { - counters[fd] = 0 + for id := range failureDomains { + counters[id] = failureDomainAggregation{ + id: id, + countPriority: 0, + countAll: 0, + } } // Count how many machines are in each failure domain. - for _, m := range machines { + for _, m := range allMachines { if m.Spec.FailureDomain == nil { continue } @@ -116,15 +137,30 @@ func pick(ctx context.Context, failureDomains clusterv1.FailureDomains, machines log.Info(fmt.Sprintf("Unknown failure domain %q for Machine %s (known failure domains: %v)", id, m.GetName(), knownFailureDomains)) continue } - counters[id]++ + a := counters[id] + a.countAll++ + counters[id] = a } - aggregations := make(failureDomainAggregations, 0) - - // Gather up tuples of failure domains ids and counts - for fd, count := range counters { - aggregations = append(aggregations, failureDomainAggregation{id: fd, count: count}) + for _, m := range priorityMachines { + if m.Spec.FailureDomain == nil { + continue + } + id := *m.Spec.FailureDomain + if _, ok := failureDomains[id]; !ok { + continue + } + a := counters[id] + a.countPriority++ + counters[id] = a } + // Collect failure domain aggregations. + // Note: by creating the list from a map, we get a certain degree of randomness that helps to spread machines + // e.g. when concurrently working on many clusters. + aggregations := make(failureDomainAggregations, 0) + for _, count := range counters { + aggregations = append(aggregations, count) + } return aggregations } diff --git a/util/failuredomains/failure_domains_test.go b/util/failuredomains/failure_domains_test.go index e49c850087bf..782096bf795f 100644 --- a/util/failuredomains/failure_domains_test.go +++ b/util/failuredomains/failure_domains_test.go @@ -17,9 +17,11 @@ limitations under the License. package failuredomains import ( + "sort" "testing" . "github.com/onsi/gomega" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" @@ -92,7 +94,7 @@ func TestNewFailureDomainPicker(t *testing.T) { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - fd := PickFewest(ctx, tc.fds, tc.machines) + fd := PickFewest(ctx, tc.fds, tc.machines, nil) if tc.expected == nil { g.Expect(fd).To(BeNil()) } else { @@ -102,7 +104,7 @@ func TestNewFailureDomainPicker(t *testing.T) { } } -func TestNewFailureDomainPickMost(t *testing.T) { +func TestPickMost(t *testing.T) { a := ptr.To("us-west-1a") b := ptr.To("us-west-1b") @@ -115,10 +117,11 @@ func TestNewFailureDomainPickMost(t *testing.T) { machinenil := &clusterv1.Machine{Spec: clusterv1.MachineSpec{FailureDomain: nil}} testcases := []struct { - name string - fds clusterv1.FailureDomains - machines collections.Machines - expected []*string + name string + fds clusterv1.FailureDomains + allMachines collections.Machines + eligibleMachines collections.Machines + expected *string }{ { name: "simple", @@ -132,26 +135,29 @@ func TestNewFailureDomainPickMost(t *testing.T) { expected: nil, }, { - name: "one machine in a failure domain", - fds: fds, - machines: collections.FromMachines(machinea.DeepCopy()), - expected: []*string{a}, + name: "one machine in a failure domain", + fds: fds, + allMachines: collections.FromMachines(machinea.DeepCopy()), + eligibleMachines: collections.FromMachines(machinea.DeepCopy()), + expected: a, }, { name: "no failure domain specified on machine", fds: clusterv1.FailureDomains{ *a: clusterv1.FailureDomainSpec{ControlPlane: true}, }, - machines: collections.FromMachines(machinenil.DeepCopy()), - expected: nil, + allMachines: collections.FromMachines(machinenil.DeepCopy()), + eligibleMachines: collections.FromMachines(machinenil.DeepCopy()), + expected: nil, }, { name: "mismatched failure domain on machine should return nil", fds: clusterv1.FailureDomains{ *a: clusterv1.FailureDomainSpec{ControlPlane: true}, }, - machines: collections.FromMachines(machineb.DeepCopy()), - expected: nil, + allMachines: collections.FromMachines(machineb.DeepCopy()), + eligibleMachines: collections.FromMachines(machineb.DeepCopy()), + expected: nil, }, { name: "failure domains and no machines should return nil", @@ -159,9 +165,10 @@ func TestNewFailureDomainPickMost(t *testing.T) { expected: nil, }, { - name: "nil failure domains with machines", - machines: collections.FromMachines(machineb.DeepCopy()), - expected: nil, + name: "nil failure domains with machines", + allMachines: collections.FromMachines(machineb.DeepCopy()), + eligibleMachines: collections.FromMachines(machineb.DeepCopy()), + expected: nil, }, { name: "nil failure domains with no machines", @@ -172,12 +179,696 @@ func TestNewFailureDomainPickMost(t *testing.T) { t.Run(tc.name, func(t *testing.T) { g := NewWithT(t) - fd := PickMost(ctx, tc.fds, tc.machines, tc.machines) + fd := PickMost(ctx, tc.fds, tc.allMachines, tc.eligibleMachines) if tc.expected == nil { g.Expect(fd).To(BeNil()) } else { - g.Expect(fd).To(BeElementOf(tc.expected)) + g.Expect(fd).To(Equal(tc.expected)) + } + }) + } +} + +func TestPickFewestNew(t *testing.T) { + a := "us-west-1a" + b := "us-west-1b" + c := "us-west-1c" + + fds3 := clusterv1.FailureDomains{ + a: clusterv1.FailureDomainSpec{ControlPlane: true}, + b: clusterv1.FailureDomainSpec{ControlPlane: true}, + c: clusterv1.FailureDomainSpec{ControlPlane: true}, + } + + fds2 := clusterv1.FailureDomains{ + a: clusterv1.FailureDomainSpec{ControlPlane: true}, + b: clusterv1.FailureDomainSpec{ControlPlane: true}, + } + + fds0 := clusterv1.FailureDomains{} + + machineA1 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a1"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineA2 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a2"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineB1 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b1"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineC1 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "c1"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(c)}} + machineA1Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a1-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineA2Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a2-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineB1Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b1-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineB2Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b2-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineC1Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "c1-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(c)}} + machineA1New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a1-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineA2New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a2-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineB1New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b1-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineC1New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "c1-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(c)}} + + testcases := []struct { + name string + fds clusterv1.FailureDomains + allMachines collections.Machines + upToDateMachines collections.Machines + expected []string + }{ + // Use case A: 3 failure domains, 3 control plane machines + + // scenario A.1: scale up to 3 + { + name: "3 failure domains, 0 machine, scale up from 0 to 1", + fds: fds3, + allMachines: collections.FromMachines(), + upToDateMachines: collections.FromMachines(), + expected: []string{a, b, c}, // select fd a, b or c because they all have no up-to-date machines + }, + { + name: "3 failure domains, 1 machine, scale up from 1 to 2", + fds: fds3, + allMachines: collections.FromMachines(machineA1), + upToDateMachines: collections.FromMachines(machineA1), + expected: []string{b, c}, // select fd b or c because they all have no up-to-date machines + }, + { + name: "3 failure domains, 2 machines, scale up from 2 to 3", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineB1), + upToDateMachines: collections.FromMachines(machineA1, machineB1), + expected: []string{c}, // select fd c because it has no up-to-date machines + }, + + // scenario A.2: scale up during a rollout + { + name: "3 failure domains, 3 outdated machines, scale up to 4", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineB1Old, machineC1Old), + upToDateMachines: collections.FromMachines(), + expected: []string{a, b, c}, // select fd a, b or c because they all have no up-to-date machines, 1 machine overall + }, + { + name: "3 failure domains, 2 outdated machines, 1 new machine, scale up to 4", + fds: fds3, + allMachines: collections.FromMachines(machineA1New, machineB1Old, machineC1Old), + upToDateMachines: collections.FromMachines(machineA1New), + expected: []string{b, c}, // select fd b or c because they all have no up-to-date machines (less than a) + }, + { + name: "3 failure domains, 1 outdated machine, 2 new machines, scale up to 4", + fds: fds3, + allMachines: collections.FromMachines(machineA1New, machineB1New, machineC1Old), + upToDateMachines: collections.FromMachines(machineA1New, machineB1New), + expected: []string{c}, // select fd c because it has no up-to-date machines (less than a, b) + }, + + // scenario A.3: scale up after machines has been remediated or forcefully deleted + { + name: "3 failure domains, 3 machines, 1 new machine, scale up to 4", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineC1Old, machineC1New), + upToDateMachines: collections.FromMachines(machineC1New), + expected: []string{b}, // select fd b because it has no up-to-date machines (like a), 0 machine overall (less than a) + }, + { + name: "3 failure domains, 3 machines, 2 new machines, scale up to 4", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineA1New, machineC1New), + upToDateMachines: collections.FromMachines(machineA1New, machineC1New), + expected: []string{b}, // select fd b because it has no up-to-date machines (less than a, c) + }, + + // Use case B: 2 failure domains, 3 control plane machines + + // scenario B.1: scale up to 3 + { + name: "2 failure domains, 0 machine, scale up from 0 to 1", + fds: fds2, + allMachines: collections.FromMachines(), + upToDateMachines: collections.FromMachines(), + expected: []string{a, b}, // select fd a, b because they all have no up-to-date machines + }, + { + name: "2 failure domains, 1 machine, scale up from 1 to 2", + fds: fds2, + allMachines: collections.FromMachines(machineA1), + upToDateMachines: collections.FromMachines(machineA1), + expected: []string{b}, // select fd b because it has no up-to-date machines + }, + { + name: "2 failure domains, 2 machines, scale up from 2 to 3", + fds: fds2, + allMachines: collections.FromMachines(machineA1, machineB1), + upToDateMachines: collections.FromMachines(machineA1, machineB1), + expected: []string{a, b}, // select fd a, b because they all have 1 up-to-date machine + }, + + // scenario B.2: scale up during a rollout + { + name: "2 failure domains, 3 outdated machines, scale up to 4", + fds: fds2, + allMachines: collections.FromMachines(machineA1Old, machineA2Old, machineB1Old), + upToDateMachines: collections.FromMachines(), + expected: []string{b}, // select fd b because it has no up-to-date machines (like a), 1 machine overall (less than a) + }, + { + name: "2 failure domains, 2 outdated machines, 1 new machine, scale up to 4", + fds: fds2, + allMachines: collections.FromMachines(machineA1Old, machineB1Old, machineB1New), + upToDateMachines: collections.FromMachines(machineB1New), + expected: []string{a}, // select fd a because it has no up-to-date machines (less than b) + }, + { + name: "2 failure domains, 1 outdated machine, 2 new machines, scale up to 4", + fds: fds2, + allMachines: collections.FromMachines(machineA1New, machineB1Old, machineB1New), + upToDateMachines: collections.FromMachines(machineA1New, machineB1New), + expected: []string{a}, // select fd b because it has 1 up-to-date machine (like b), 1 machine overall (less than b) + }, + + // Use case C: 3 failure domains, 5 control plane machines + + // scenario C.1: scale up to 5 + { + name: "3 failure domains, 0 machine, scale up from 0 to 1", + fds: fds3, + allMachines: collections.FromMachines(), + upToDateMachines: collections.FromMachines(), + expected: []string{a, b, c}, // select fd a, b or c because they all have no up-to-date machines + }, + { + name: "3 failure domains, 1 machine, scale up from 1 to 2", + fds: fds3, + allMachines: collections.FromMachines(machineA1), + upToDateMachines: collections.FromMachines(machineA1), + expected: []string{b, c}, // select fd b or c because they all have no up-to-date machines (less than a) + }, + { + name: "3 failure domains, 2 machines, scale up from 2 to 3", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineB1), + upToDateMachines: collections.FromMachines(machineA1, machineB1), + expected: []string{c}, // select fd c because it has no up-to-date machines (less than a, b) + }, + { + name: "3 failure domains, 3 machines, scale up from 3 to 4", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineB1, machineC1), + upToDateMachines: collections.FromMachines(machineA1, machineB1, machineC1), + expected: []string{a, b, c}, // select fd a, b or c because they all have 1 up-to-date machines + }, + { + name: "3 failure domains, 4 machines, scale up from 4 to 5", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineA2, machineB1, machineC1), + upToDateMachines: collections.FromMachines(machineA1, machineA2, machineB1, machineC1), + expected: []string{b, c}, // select fd b or c because they all have no up-to-date machines (less than a) + }, + + // scenario C.2: scale up during a rollout + { + name: "3 failure domains, 5 outdated machines, scale up to 6", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineA2Old, machineB1Old, machineB2Old, machineC1Old), + upToDateMachines: collections.FromMachines(), + expected: []string{c}, // select fd c because it has no up-to-date machines (like a, b), 1 machine overall (less than a, b) + }, + { + name: "3 failure domains, 4 outdated machines, 1 new machine, scale up to 6", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineB1Old, machineB2Old, machineC1Old, machineC1New), + upToDateMachines: collections.FromMachines(machineC1New), + expected: []string{a}, // select fd a because it has no up-to-date machines (like b), 1 machine overall (less than b) + }, + { + name: "3 failure domains, 3 outdated machines, 2 new machines, scale up to 6", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineA1New, machineB1Old, machineC1Old, machineC1New), + upToDateMachines: collections.FromMachines(machineA1New, machineC1New), + expected: []string{b}, // select fd b because it has no up-to-date machines + }, + { + name: "3 failure domains, 2 outdated machines, 3 new machines, scale up to 6", + fds: fds3, + allMachines: collections.FromMachines(machineA1New, machineB1Old, machineB1New, machineC1Old, machineC1New), + upToDateMachines: collections.FromMachines(machineA1New, machineB1New, machineC1New), + expected: []string{a}, // select fd a because it has 1 up-to-date machines (like b,c) 1 machine overall (less than b,c) + }, + { + name: "3 failure domains, 1 outdated machine, 4 new machines, scale up to 6", + fds: fds3, + allMachines: collections.FromMachines(machineA1New, machineA2New, machineB1New, machineC1Old, machineC1New), + upToDateMachines: collections.FromMachines(machineA1New, machineA2New, machineB1New, machineC1New), + expected: []string{b}, // select fd b because it has 1 up-to-date machines (less than a, like c) 1 machine overall (less than c) + }, + + // Use case D: no failure domains, 3 control plane machines + + // scenario D.1: scale up to 3 + { + name: "3 failure domains, 0 machine, scale up from 0 to 1", + fds: fds0, + allMachines: collections.FromMachines(), + upToDateMachines: collections.FromMachines(), + expected: nil, // no fd + }, + { + name: "3 failure domains, 1 machine, scale up from 1 to 2", + fds: fds0, + allMachines: collections.FromMachines(machineA1), + upToDateMachines: collections.FromMachines(machineA1), + expected: nil, // no fd + }, + { + name: "3 failure domains, 2 machines, scale up from 2 to 3", + fds: fds0, + allMachines: collections.FromMachines(machineA1, machineB1), + upToDateMachines: collections.FromMachines(machineA1, machineB1), + expected: nil, // no fd + }, + + // scenario D.2: scale up during a rollout + { + name: "3 failure domains, 3 outdated machines, scale up to 4", + fds: fds0, + allMachines: collections.FromMachines(machineA1Old, machineB1Old, machineC1Old), + upToDateMachines: collections.FromMachines(), + expected: nil, // no fd + }, + { + name: "3 failure domains, 2 outdated machines, 1 new machine, scale up to 4", + fds: fds0, + allMachines: collections.FromMachines(machineA1New, machineB1Old, machineC1Old), + upToDateMachines: collections.FromMachines(machineA1New), + expected: nil, // no fd + }, + { + name: "3 failure domains, 1 outdated machine, 2 new machines, scale up to 4", + fds: fds0, + allMachines: collections.FromMachines(machineA1New, machineB1New, machineC1Old), + upToDateMachines: collections.FromMachines(machineA1New, machineB1New), + expected: nil, // no fd + }, + } + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + fd := PickFewest(ctx, tc.fds, tc.allMachines, tc.upToDateMachines) + if tc.expected == nil { + g.Expect(fd).To(BeNil()) + } else { + g.Expect(fd).ToNot(BeNil()) + g.Expect(tc.expected).To(ContainElement(*fd)) + } + }) + } +} + +func TestPickMostNew(t *testing.T) { + a := "us-west-1a" + b := "us-west-1b" + c := "us-west-1c" + + fds3 := clusterv1.FailureDomains{ + a: clusterv1.FailureDomainSpec{ControlPlane: true}, + b: clusterv1.FailureDomainSpec{ControlPlane: true}, + c: clusterv1.FailureDomainSpec{ControlPlane: true}, + } + + fds2 := clusterv1.FailureDomains{ + a: clusterv1.FailureDomainSpec{ControlPlane: true}, + b: clusterv1.FailureDomainSpec{ControlPlane: true}, + } + + fds0 := clusterv1.FailureDomains{} + + machineA1 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a1"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineA2 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a2"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineB1 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b1"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineB2 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b2"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineC1 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "c1"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(c)}} + machineA1Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a1-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineA2Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a2-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineB1Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b1-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineB2Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b2-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineC1Old := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "c1-old"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(c)}} + machineA1New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a1-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineA2New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a2-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineB1New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b1-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineB2New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b2-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machineC1New := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "c1-new"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(c)}} + + testcases := []struct { + name string + fds clusterv1.FailureDomains + allMachines collections.Machines + eligibleMachines collections.Machines + expected []string + }{ + // Use case A: 3 failure domains, 3 control plane machines + + // scenario A.1: scale down during a rollout + { + name: "3 failure domains, 3 outdated machines, 1 new machine, scale down from 4 to 3", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineA1New, machineB1Old, machineC1Old), + eligibleMachines: collections.FromMachines(machineA1Old, machineB1Old, machineC1Old), + expected: []string{a}, // select fd a because it has 1 old machine (like b and c) but 2 machines overall (more than b and c) + }, + { + name: "3 failure domains, 2 outdated machines, 2 new machines, scale down from 4 to 3", + fds: fds3, + allMachines: collections.FromMachines(machineA1New, machineB1Old, machineB1New, machineC1Old), + eligibleMachines: collections.FromMachines(machineB1Old, machineC1Old), + expected: []string{b}, // select fd b because it has 1 old machine (like c) but 2 machines overall (more c); fd a is discarded because it doesn't have any eligible machine + }, + { + name: "3 failure domains, 1 outdated machine, 3 new machines, scale down from 4 to 3", + fds: fds3, + allMachines: collections.FromMachines(machineA1New, machineB1New, machineC1Old, machineC1New), + eligibleMachines: collections.FromMachines(machineC1Old), + expected: []string{c}, // select fd c because it has 1 old machine; fd a and b are discarded because they don't have any eligible machine + }, + + // scenario A.2: scale down to 0 + { + name: "3 failure domains, 3 machines, scale down from 3 to 2", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineB1, machineC1), + eligibleMachines: collections.FromMachines(machineA1, machineB1, machineC1), + expected: []string{a, b, c}, // select fd a, b or c because they have 1 eligible machine + }, + { + name: "3 failure domains, 2 machines, scale down from 2 to 1", + fds: fds3, + allMachines: collections.FromMachines(machineB1, machineC1), + eligibleMachines: collections.FromMachines(machineB1, machineC1), + expected: []string{b, c}, // select fd b or c because they have 1 eligible machine + }, + { + name: "3 failure domains, 1 machine, scale down from 1 to 0", + fds: fds3, + allMachines: collections.FromMachines(machineC1), + eligibleMachines: collections.FromMachines(machineC1), + expected: []string{c}, // select fd c because it has 1 eligible machine + }, + + // scenario A.3: scale down or rollout when a machine with delete annotation or an un-healthy machine is prioritized for deletion + { + name: "3 failure domains, 3 machines, 1 eligible machine", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineB1, machineC1), + eligibleMachines: collections.FromMachines(machineA1), + expected: []string{a}, // select fd a because it has 1 eligible machine + }, + { + name: "3 failure domains, 3 machines, 2 eligible machines", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineB1, machineC1), + eligibleMachines: collections.FromMachines(machineA1, machineB1), + expected: []string{a, b}, // select fd a or b because they have 1 eligible machine + }, + + // Use case B: 2 failure domains, 3 control plane machines + + // scenario B.1: scale down during a rollout + { + name: "2 failure domains, 3 outdated machines, 1 new machine, scale down from 4 to 3", + fds: fds2, + allMachines: collections.FromMachines(machineA1Old, machineA2Old, machineB1Old, machineB2New), + eligibleMachines: collections.FromMachines(machineA1Old, machineA2Old, machineB1Old), + expected: []string{a}, // select fd a because it has 2 old machines (more than b) + }, + { + name: "2 failure domains, 2 outdated machines, 2 new machines, scale down from 4 to 3", + fds: fds2, + allMachines: collections.FromMachines(machineA1New, machineA2Old, machineB1Old, machineB2New), + eligibleMachines: collections.FromMachines(machineA2Old, machineB1Old), + expected: []string{a, b}, // select fd a or b because both have 1 old machine and 2 machines overall + }, + { + name: "2 failure domains, 1 outdated machine, 3 new machines, scale down from 4 to 3", + fds: fds2, + allMachines: collections.FromMachines(machineA1New, machineA2New, machineB1Old, machineB2New), + eligibleMachines: collections.FromMachines(machineB1Old), + expected: []string{b}, // select fd b because it has 1 old machine (more than a) + }, + + // scenario B.2: scale down to 0 + { + name: "2 failure domains, 3 machines, scale down from 3 to 2", + fds: fds2, + allMachines: collections.FromMachines(machineA1, machineA2, machineB1), + eligibleMachines: collections.FromMachines(machineA1, machineA2, machineB1), + expected: []string{a}, // select fd a because it has 2 eligible machine (more than b) + }, + { + name: "2 failure domains, 2 machines, scale down from 2 to 1", + fds: fds2, + allMachines: collections.FromMachines(machineA1, machineB1), + eligibleMachines: collections.FromMachines(machineA1, machineB1), + expected: []string{a, b}, // select fd a or b because they have 1 eligible machine + }, + { + name: "2 failure domains, 1 machine, scale down from 1 to 0", + fds: fds2, + allMachines: collections.FromMachines(machineB1), + eligibleMachines: collections.FromMachines(machineB1), + expected: []string{b}, // select fd b because it has 1 eligible machine + }, + + // scenario B.3: scale down or rollout when a machine with delete annotation or an un-healthy machine is prioritized for deletion + { + name: "2 failure domains, 3 machines, 1 eligible machine", + fds: fds2, + allMachines: collections.FromMachines(machineA1, machineA2, machineB1), + eligibleMachines: collections.FromMachines(machineB1), + expected: []string{b}, // select fd b because it has 1 eligible machine + }, + { + name: "2 failure domains, 3 machines, 2 eligible machines", + fds: fds2, + allMachines: collections.FromMachines(machineA1, machineA2, machineB1), + eligibleMachines: collections.FromMachines(machineA2, machineB1), + expected: []string{a}, // select fd a because it has 1 eligible machine (like b) and 2 machines overall (more than b) + }, + + // Use case C: 3 failure domains, 5 control plane machines + + // scenario C.1: scale down during a rollout + { + name: "3 failure domains, 5 outdated machines, 1 new machine, scale down from 6 to 1", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineA2Old, machineB1Old, machineB2Old, machineC1Old, machineC1New), + eligibleMachines: collections.FromMachines(machineA1Old, machineA2Old, machineB1Old, machineB2Old, machineC1Old), + expected: []string{a, b}, // select fd a or b because they have 2 old machine (more than c) + }, + { + name: "3 failure domains, 4 outdated machines, 2 new machines, scale down from 6 to 1", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineA1New, machineB1Old, machineB2Old, machineC1Old, machineC1New), + eligibleMachines: collections.FromMachines(machineA1Old, machineB1Old, machineB2Old, machineC1Old), + expected: []string{b}, // select fd b because it has 2 old machine (more than a and c) + }, + { + name: "3 failure domains, 3 outdated machines, 3 new machines, scale down from 6 to 1", + fds: fds3, + allMachines: collections.FromMachines(machineA1Old, machineA1New, machineB1Old, machineB1New, machineC1Old, machineC1New), + eligibleMachines: collections.FromMachines(machineA1Old, machineB1Old, machineC1Old), + expected: []string{a, b, c}, // select fd a, b or c because they have 1 old machine + }, + { + name: "3 failure domains, 2 outdated machines, 4 new machines, scale down from 6 to 1", + fds: fds3, + allMachines: collections.FromMachines(machineA1New, machineA2New, machineB1Old, machineB1New, machineC1Old, machineC1New), + eligibleMachines: collections.FromMachines(machineB1Old, machineC1Old), + expected: []string{b, c}, // select fd b or c because they have 1 old machine and 2 machines overall; fd a is discarded because it doesn't have any eligible machine + }, + { + name: "3 failure domains, 1 outdated machine, 5 new machines, scale down from 6 to 1", + fds: fds3, + allMachines: collections.FromMachines(machineA1New, machineA2New, machineB1New, machineB2New, machineC1Old, machineC1New), + eligibleMachines: collections.FromMachines(machineC1Old), + expected: []string{c}, // select fd c because it has 1 old machine; fd a and b are discarded because they don't have any eligible machine + }, + + // scenario C.2: scale down to 0 + { + name: "3 failure domains, 5 machines, scale down from 5 to 4", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineA2, machineB1, machineB2, machineC1), + eligibleMachines: collections.FromMachines(machineA1, machineA2, machineB1, machineB2, machineC1), + expected: []string{a, b}, // select fd a or b because they have 2 eligible machine (more than c) + }, + { + name: "3 failure domains, 4 machines, scale down from 4 to 3", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineB1, machineB2, machineC1), + eligibleMachines: collections.FromMachines(machineA1, machineB1, machineB2, machineC1), + expected: []string{b}, // select fd b because they have 2 eligible machine (more than a, c) + }, + { + name: "3 failure domains, 3 machines, scale down from 3 to 2", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineB1, machineC1), + eligibleMachines: collections.FromMachines(machineA1, machineB1, machineC1), + expected: []string{a, b, c}, // select fd a, b or c because they have 1 eligible machine + }, + { + name: "3 failure domains, 2 machines, scale down from 2 to 1", + fds: fds3, + allMachines: collections.FromMachines(machineB1, machineC1), + eligibleMachines: collections.FromMachines(machineB1, machineC1), + expected: []string{b, c}, // select fd b or c because they have 1 eligible machine + }, + { + name: "3 failure domains, 1 machine, scale down from 1 to 0", + fds: fds3, + allMachines: collections.FromMachines(machineC1), + eligibleMachines: collections.FromMachines(machineC1), + expected: []string{c}, // select fd c because it has 1 eligible machine + }, + + // scenario C.3: scale down or rollout when a machine with delete annotation or an un-healthy machine is prioritized for deletion + { + name: "3 failure domains, 5 machines, 1 eligible machine", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineA2, machineB1, machineB2, machineC1), + eligibleMachines: collections.FromMachines(machineC1), + expected: []string{c}, // select fd c because it has 1 eligible machine + }, + { + name: "3 failure domains, 5 machines, 2 eligible machines", + fds: fds3, + allMachines: collections.FromMachines(machineA1, machineA2, machineB1, machineB2, machineC1), + eligibleMachines: collections.FromMachines(machineA2, machineC1), + expected: []string{a}, // select fd a because it has 1 eligible machine (like c) and 2 machines overall (more than c) + }, + + // Use case D: no failure domains, 3 control plane machines + + // scenario D.1: scale down during a rollout + { + name: "3 failure domains, 3 outdated machines, 1 new machine, scale down from 4 to 3", + fds: fds0, + allMachines: collections.FromMachines(machineA1Old, machineA1New, machineB1Old, machineC1Old), + eligibleMachines: collections.FromMachines(machineA1Old, machineB1Old, machineC1Old), + expected: nil, // no fd + }, + { + name: "3 failure domains, 2 outdated machines, 2 new machines, scale down from 4 to 3", + fds: fds0, + allMachines: collections.FromMachines(machineA1New, machineB1Old, machineB1New, machineC1Old), + eligibleMachines: collections.FromMachines(machineB1Old, machineC1Old), + expected: nil, // no fd + }, + { + name: "3 failure domains, 1 outdated machines, 3 new machines, scale down from 4 to 3", + fds: fds0, + allMachines: collections.FromMachines(machineA1New, machineB1New, machineC1Old, machineC1New), + eligibleMachines: collections.FromMachines(machineC1Old), + expected: nil, // no fd + }, + + // scenario D.2: scale down to 0 + { + name: "3 failure domains, 3 machines, scale down from 3 to 2", + fds: fds0, + allMachines: collections.FromMachines(machineA1, machineB1, machineC1), + eligibleMachines: collections.FromMachines(machineA1, machineB1, machineC1), + expected: nil, // no fd + }, + { + name: "3 failure domains, 2 machines, scale down from 2 to 1", + fds: fds0, + allMachines: collections.FromMachines(machineB1, machineC1), + eligibleMachines: collections.FromMachines(machineB1, machineC1), + expected: nil, // no fd + }, + { + name: "3 failure domains, 1 machine, scale down from 1 to 0", + fds: fds0, + allMachines: collections.FromMachines(machineC1), + eligibleMachines: collections.FromMachines(machineC1), + expected: nil, // no fd + }, + + // Edge cases + + { + name: "No eligible machines", // Note: this should never happen (scale down is called only when there are eligible machines) + fds: fds0, + allMachines: collections.FromMachines(machineA1, machineB1, machineC1), + eligibleMachines: collections.FromMachines(), + expected: nil, + }, + { + name: "No machines", // Note: this should never happen (scale down is called only when there are machines) + fds: fds0, + allMachines: collections.FromMachines(), + eligibleMachines: collections.FromMachines(), + expected: nil, + }, + } + for _, tc := range testcases { + t.Run(tc.name, func(t *testing.T) { + g := NewWithT(t) + + fd := PickMost(ctx, tc.fds, tc.allMachines, tc.eligibleMachines) + if tc.expected == nil { + g.Expect(fd).To(BeNil()) + } else { + g.Expect(fd).ToNot(BeNil()) + g.Expect(tc.expected).To(ContainElement(*fd)) } }) } } + +func TestCountByFailureDomain(t *testing.T) { + g := NewWithT(t) + + a := "us-west-1a" + b := "us-west-1b" + + fds := clusterv1.FailureDomains{ + a: clusterv1.FailureDomainSpec{ControlPlane: true}, + b: clusterv1.FailureDomainSpec{ControlPlane: true}, + } + machinea1 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a1"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machinea2 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "a2"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(a)}} + machineb1 := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "b1"}, Spec: clusterv1.MachineSpec{FailureDomain: ptr.To(b)}} + machinenil := &clusterv1.Machine{ObjectMeta: metav1.ObjectMeta{Name: "nil"}, Spec: clusterv1.MachineSpec{FailureDomain: nil}} + + allMachines := collections.FromMachines(machinea1, machinea2, machineb1, machinenil) + priorityMachines := collections.FromMachines(machinea1) + + aggregations := countByFailureDomain(ctx, fds, allMachines, priorityMachines) + + g.Expect(aggregations).To(HaveLen(2)) + g.Expect(aggregations).To(ConsistOf(failureDomainAggregation{id: a, countPriority: 1, countAll: 2}, failureDomainAggregation{id: b, countPriority: 0, countAll: 1})) +} + +func TestFailureDomainAggregationsSort(t *testing.T) { + g := NewWithT(t) + + aggregations := failureDomainAggregations{ + {id: "fd1", countPriority: 2, countAll: 2}, + {id: "fd2", countPriority: 2, countAll: 3}, + {id: "fd3", countPriority: 2, countAll: 2}, + {id: "fd4", countPriority: 1, countAll: 5}, + } + + sort.Sort(aggregations) + + // the result should be sorted so the fd with less priority machines are first, + // the number of overall machine should be used if the number of priority machines is equal, + // position in the array is used to break ties. + + // fd4 is the failure domain with less priority machines, it should go first + g.Expect(aggregations[0].id).To(Equal("fd4")) + + // fd1, fd2, fd3 all have the same number of priority machines; + + // fd1, fd3 have also the same number of overall machines. + g.Expect(aggregations[1].id).To(Equal("fd1")) + g.Expect(aggregations[2].id).To(Equal("fd3")) + + // fd2 has more overall machines, it should go last + g.Expect(aggregations[3].id).To(Equal("fd2")) +}