Skip to content

Commit

Permalink
Add Failure Domains for Worker nodes feature for Nutanix provider (aw…
Browse files Browse the repository at this point in the history
…s#8837)

* Nutanix Failure Domains for Worker nodes

* Add unit-tests and fix failed

* Add missed test manifest

* Fix comments and rebase
  • Loading branch information
adiantum authored Oct 14, 2024
1 parent 2e9fd40 commit 57aff53
Show file tree
Hide file tree
Showing 18 changed files with 2,414 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ spec:
- type
type: object
type: array
workerMachineGroups:
description: Worker Machine Groups holds the list of worker
machine group names that will use this failure domain.
items:
type: string
type: array
required:
- name
type: object
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,28 @@ spec:
required:
- type
type: object
gpus:
description: List of GPU devices that should be added to the VMs.
items:
description: NutanixGPUIdentifier holds VM GPU device configuration.
properties:
deviceID:
description: deviceID is the device ID of the GPU device.
format: int64
type: integer
name:
description: name is the name of the GPU device.
type: string
type:
description: type is the type of the GPU device.
enum:
- deviceID
- name
type: string
required:
- type
type: object
type: array
image:
description: image is to identify the OS image uploaded to the Prism
Central (PC). The image identifier (uuid or name) can be obtained
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ require (
go.opentelemetry.io/otel/trace v1.20.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/mod v0.14.0 // indirect
golang.org/x/sync v0.6.0 // indirect
golang.org/x/sync v0.6.0
golang.org/x/sys v0.18.0 // indirect
golang.org/x/term v0.18.0 // indirect
golang.org/x/time v0.5.0 // indirect
Expand Down
6 changes: 5 additions & 1 deletion pkg/api/v1alpha1/nutanixdatacenterconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ type NutanixDatacenterFailureDomain struct {
// Subnets holds the list of identifiers of the cluster's network subnets.
// +kubebuilder:validation:Required
Subnets []NutanixResourceIdentifier `json:"subnets,omitempty"`

// Worker Machine Groups holds the list of worker machine group names that will use this failure domain.
// +optional
WorkerMachineGroups []string `json:"workerMachineGroups,omitempty"`
}

// NutanixDatacenterConfigStatus defines the observed state of NutanixDatacenterConfig.
Expand Down Expand Up @@ -165,7 +169,7 @@ func (in *NutanixDatacenterConfig) Validate() error {
}
}

if in.Spec.FailureDomains != nil && len(in.Spec.FailureDomains) != 0 {
if len(in.Spec.FailureDomains) != 0 {
dccName := in.Namespace + "/" + in.Name
validateClusterResourceIdentifier := createValidateNutanixResourceFunc("NutanixDatacenterConfig.Spec.FailureDomains.Cluster", "cluster", dccName)
validateSubnetResourceIdentifier := createValidateNutanixResourceFunc("NutanixDatacenterConfig.Spec.FailureDomains.Subnets", "subnet", dccName)
Expand Down
5 changes: 5 additions & 0 deletions pkg/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

104 changes: 104 additions & 0 deletions pkg/providers/nutanix/config/md-template.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,106 @@
{{- if $.failureDomains -}}{{ range $fd := $.failureDomains -}}
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineDeployment
metadata:
labels:
cluster.x-k8s.io/cluster-name: "{{$.clusterName}}"
name: "{{$.workerNodeGroupName}}-{{$fd.Name}}"
namespace: "{{$.eksaSystemNamespace}}"
{{- if $.autoscalingConfig }}
annotations:
cluster.x-k8s.io/cluster-api-autoscaler-node-group-min-size: "{{ $.autoscalingConfig.MinCount }}"
cluster.x-k8s.io/cluster-api-autoscaler-node-group-max-size: "{{ $.autoscalingConfig.MaxCount }}"
{{- end }}
spec:
clusterName: "{{$.clusterName}}"
{{- if not $.autoscalingConfig }}
replicas: {{ index $.failureDomainsReplicas $fd.Name }}
{{- end }}
selector:
matchLabels: {}
template:
metadata:
labels:
cluster.x-k8s.io/cluster-name: "{{$.clusterName}}"
spec:
failureDomain: "{{$fd.Name}}"
bootstrap:
configRef:
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: KubeadmConfigTemplate
name: "{{$.workloadkubeadmconfigTemplateName}}"
clusterName: "{{$.clusterName}}"
infrastructureRef:
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: NutanixMachineTemplate
name: "{{$.workloadTemplateName}}-{{$fd.Name}}"
version: "{{$.kubernetesVersion}}"
{{- if $.upgradeRolloutStrategy }}
strategy:
rollingUpdate:
maxSurge: {{$.maxSurge}}
maxUnavailable: {{$.maxUnavailable}}
{{- end }}
---
apiVersion: infrastructure.cluster.x-k8s.io/v1beta1
kind: NutanixMachineTemplate
metadata:
name: "{{$.workloadTemplateName}}-{{$fd.Name}}"
namespace: "{{$.eksaSystemNamespace}}"
spec:
template:
spec:
providerID: "nutanix://{{$.clusterName}}-m1"
vcpusPerSocket: {{$.vcpusPerSocket}}
vcpuSockets: {{$.vcpuSockets}}
memorySize: {{$.memorySize}}
systemDiskSize: {{$.systemDiskSize}}
image:
{{- if (eq $.imageIDType "name") }}
type: name
name: "{{$.imageName}}"
{{ else if (eq $.imageIDType "uuid") }}
type: uuid
uuid: "{{$.imageUUID}}"
{{ end }}
cluster:
{{- if (eq $fd.Cluster.Type "name") }}
type: name
name: "{{ $fd.Cluster.Name }}"
{{- else if (eq $fd.Cluster.Type "uuid") }}
type: uuid
uuid: "{{ $fd.Cluster.UUID }}"
{{ end }}
subnet:
{{- range $subnet := $fd.Subnets }}
{{- if (eq $subnet.Type "name") }}
- type: name
name: "{{ $subnet.Name }}"
{{- else if (eq $subnet.Type "uuid") }}
- type: uuid
uuid: "{{ $subnet.UUID }}"
{{- end }}
{{- end }}
{{- if $.projectIDType}}
project:
{{- if (eq $.projectIDType "name") }}
type: name
name: "{{$.projectName}}"
{{- else if (eq $.projectIDType "uuid") }}
type: uuid
uuid: "{{$.projectUUID}}"
{{ end }}
{{ end }}
{{- if $.additionalCategories}}
{{- /* Inside "range" the dot is the current category; "$" always refers to the root template values. */}}
      additionalCategories:
{{- range $.additionalCategories}}
        - key: "{{ .Key }}"
          value: "{{ .Value }}"
{{- end }}
{{- end }}
---
{{ end -}}
{{- else -}}
apiVersion: cluster.x-k8s.io/v1beta1
kind: MachineDeployment
metadata:
Expand Down Expand Up @@ -107,6 +210,7 @@ spec:
{{- end }}
{{- end }}
---
{{ end -}}
apiVersion: bootstrap.cluster.x-k8s.io/v1beta1
kind: KubeadmConfigTemplate
metadata:
Expand Down
12 changes: 6 additions & 6 deletions pkg/providers/nutanix/provider_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,13 +312,13 @@ func TestNutanixProviderSetupAndValidateCreate(t *testing.T) {
name: "cluster config with unsupported upgrade strategy configuration for cp",
clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_cp.yaml",
expectErr: true,
expectErrStr: "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
expectErrStr: "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
},
{
name: "cluster config with unsupported upgrade strategy configuration for md",
clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_md.yaml",
expectErr: true,
expectErrStr: "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
expectErrStr: "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
},
}

Expand Down Expand Up @@ -508,13 +508,13 @@ func TestNutanixProviderSetupAndValidateDeleteCluster(t *testing.T) {
name: "cluster config with unsupported upgrade strategy configuration for cp",
clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_cp.yaml",
expectErr: true,
expectErrStr: "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
expectErrStr: "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
},
{
name: "cluster config with unsupported upgrade strategy configuration for md",
clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_md.yaml",
expectErr: true,
expectErrStr: "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
expectErrStr: "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
},
}

Expand Down Expand Up @@ -560,13 +560,13 @@ func TestNutanixProviderSetupAndValidateUpgradeCluster(t *testing.T) {
name: "cluster config with unsupported upgrade strategy configuration for cp",
clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_cp.yaml",
expectErr: true,
expectErrStr: "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
expectErrStr: "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
},
{
name: "cluster config with unsupported upgrade strategy configuration for md",
clusterConfFile: "testdata/cluster_nutanix_with_upgrade_strategy_md.yaml",
expectErr: true,
expectErrStr: "failed setup and validations: Upgrade rollout strategy customization is not supported for nutanix provider",
expectErrStr: "failed setup and validations: upgrade rollout strategy customization is not supported for nutanix provider",
},
}

Expand Down
45 changes: 45 additions & 0 deletions pkg/providers/nutanix/template.go
Original file line number Diff line number Diff line change
Expand Up @@ -346,10 +346,53 @@ func buildTemplateMapCP(
return values, nil
}

// calcFailureDomainReplicas computes how many replicas of a worker node group are placed in
// each of the given failure domains, keyed by failure domain name.
//
// An empty map is returned when the worker node group has an autoscaling configuration or
// when there are no failure domains. When Count is nil, the total defaults to the number of
// failure domains. The total is split by integer division; the first failure domain
// additionally receives the whole remainder.
func calcFailureDomainReplicas(workerNodeGroupConfiguration v1alpha1.WorkerNodeGroupConfiguration, failureDomains []v1alpha1.NutanixDatacenterFailureDomain) map[string]int {
	replicas := make(map[string]int)

	domainCount := len(failureDomains)
	if workerNodeGroupConfiguration.AutoScalingConfiguration != nil || domainCount == 0 {
		return replicas
	}

	// Default to the number of failure domains when no explicit count is configured.
	total := domainCount
	if count := workerNodeGroupConfiguration.Count; count != nil {
		total = int(*count)
	}

	share := total / domainCount
	for _, fd := range failureDomains {
		replicas[fd.Name] = share
	}

	// The first failure domain absorbs whatever the integer division left over.
	replicas[failureDomains[0].Name] = total - (domainCount-1)*share

	return replicas
}

// getFailureDomainsForWorkerNodeGroup returns the entries of allFailureDomains whose
// WorkerMachineGroups list contains workerNodeGroupConfigurationName, in input order.
//
// The result is never nil. Each entry of allFailureDomains is included at most once, even
// if it lists the same worker machine group name several times.
func getFailureDomainsForWorkerNodeGroup(allFailureDomains []v1alpha1.NutanixDatacenterFailureDomain, workerNodeGroupConfigurationName string) []v1alpha1.NutanixDatacenterFailureDomain {
	result := make([]v1alpha1.NutanixDatacenterFailureDomain, 0)
	for _, fd := range allFailureDomains {
		for _, workerMachineGroup := range fd.WorkerMachineGroups {
			if workerMachineGroup == workerNodeGroupConfigurationName {
				result = append(result, fd)
				// Stop at the first match: the machine deployment template ranges over this
				// slice and names objects after the failure domain, so a repeated entry
				// would render duplicate objects with identical names.
				break
			}
		}
	}

	return result
}

func buildTemplateMapMD(clusterSpec *cluster.Spec, workerNodeGroupMachineSpec v1alpha1.NutanixMachineConfigSpec, workerNodeGroupConfiguration v1alpha1.WorkerNodeGroupConfiguration) (map[string]interface{}, error) {
versionsBundle := clusterSpec.WorkerNodeGroupVersionsBundle(workerNodeGroupConfiguration)
format := "cloud-config"

failureDomainsForWorkerNodeGroup := getFailureDomainsForWorkerNodeGroup(clusterSpec.NutanixDatacenter.Spec.FailureDomains, workerNodeGroupConfiguration.Name)
replicasPerFailureDomain := calcFailureDomainReplicas(workerNodeGroupConfiguration, failureDomainsForWorkerNodeGroup)

values := map[string]interface{}{
"clusterName": clusterSpec.Cluster.Name,
"eksaSystemNamespace": constants.EksaSystemNamespace,
Expand All @@ -374,6 +417,8 @@ func buildTemplateMapMD(clusterSpec *cluster.Spec, workerNodeGroupMachineSpec v1
"subnetUUID": workerNodeGroupMachineSpec.Subnet.UUID,
"workerNodeGroupName": fmt.Sprintf("%s-%s", clusterSpec.Cluster.Name, workerNodeGroupConfiguration.Name),
"workerNodeGroupTaints": workerNodeGroupConfiguration.Taints,
"failureDomains": failureDomainsForWorkerNodeGroup,
"failureDomainsReplicas": replicasPerFailureDomain,
}

if clusterSpec.Cluster.Spec.RegistryMirrorConfiguration != nil {
Expand Down
Loading

0 comments on commit 57aff53

Please sign in to comment.