From 9b892755f0451bd51ab53081384ee2d0d1cbd1f9 Mon Sep 17 00:00:00 2001 From: Olivier Tardieu Date: Tue, 23 Jul 2024 21:14:43 -0400 Subject: [PATCH] Add LendingLimit adjustment tests --- .gitignore | 3 + Makefile | 10 +++- .../controller/appwrapper/fixtures_test.go | 14 +++++ .../appwrapper/node_health_monitor_test.go | 56 ++++++++++++++++++- internal/controller/appwrapper/suite_test.go | 4 +- 5 files changed, 84 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 7a7feec5..a857a722 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,6 @@ go.work *.swp *.swo *~ + +# CRDs for unit tests +dep-crds diff --git a/Makefile b/Makefile index 9c697ad7..22300e6f 100644 --- a/Makefile +++ b/Makefile @@ -94,8 +94,16 @@ fmt: ## Run go fmt against code. vet: ## Run go vet against code. go vet ./... +EXTERNAL_CRDS_DIR ?= $(shell pwd)/dep-crds + +KUEUE_ROOT = $(shell go list -m -mod=readonly -f "{{.Dir}}" sigs.k8s.io/kueue) +.PHONY: dep-crds +dep-crds: ## Copy CRDs from external operators to dep-crds directory. + mkdir -p $(EXTERNAL_CRDS_DIR)/kueue + cp -f $(KUEUE_ROOT)/config/components/crd/bases/* $(EXTERNAL_CRDS_DIR)/kueue + .PHONY: test -test: manifests generate fmt vet envtest ## Run unit tests. +test: manifests generate fmt vet dep-crds envtest ## Run unit tests. KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./api/... ./internal/... ./pkg/...) -v -ginkgo.v -coverprofile cover.out .PHONY: install diff --git a/internal/controller/appwrapper/fixtures_test.go b/internal/controller/appwrapper/fixtures_test.go index 6b85ee14..5d83b4ba 100644 --- a/internal/controller/appwrapper/fixtures_test.go +++ b/internal/controller/appwrapper/fixtures_test.go @@ -30,6 +30,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/utils/ptr" "sigs.k8s.io/controller-runtime/pkg/client" + kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" "sigs.k8s.io/yaml" workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2" @@ -145,3 +146,16 @@ func malformedPod(milliCPU int64) workloadv1beta2.AppWrapperComponent { Template: runtime.RawExtension{Raw: jsonBytes}, } } + +func slackQueue(queueName string, nominalQuota resource.Quantity) *kueue.ClusterQueue { + return &kueue.ClusterQueue{ + TypeMeta: metav1.TypeMeta{APIVersion: kueue.GroupVersion.String(), Kind: "ClusterQueue"}, + ObjectMeta: metav1.ObjectMeta{Name: queueName}, + Spec: kueue.ClusterQueueSpec{ + ResourceGroups: []kueue.ResourceGroup{{ + CoveredResources: []v1.ResourceName{v1.ResourceName("nvidia.com/gpu")}, + Flavors: []kueue.FlavorQuotas{{ + Name: "default-flavor", + Resources: []kueue.ResourceQuota{{Name: v1.ResourceName("nvidia.com/gpu"), NominalQuota: nominalQuota}}}}}}}, + } +} diff --git a/internal/controller/appwrapper/node_health_monitor_test.go b/internal/controller/appwrapper/node_health_monitor_test.go index 58ff2915..1d01edbc 100644 --- a/internal/controller/appwrapper/node_health_monitor_test.go +++ b/internal/controller/appwrapper/node_health_monitor_test.go @@ -28,6 +28,7 @@ import ( ) var _ = Describe("NodeMonitor Controller", func() { + var slackQueueName = "fake-queue" var node1Name = types.NamespacedName{Name: "fake-node-1"} var node2Name = types.NamespacedName{Name: "fake-node-2"} var nodeMonitor *NodeHealthMonitor @@ -47,6 +48,7 @@ var _ = Describe("NodeMonitor Controller", func() { // Create reconciller awConfig := config.NewAppWrapperConfig() + awConfig.SlackQueueName = slackQueueName nodeMonitor = &NodeHealthMonitor{ Client: k8sClient, Config: awConfig, @@ -106,6 +108,58 @@ var _ = Describe("NodeMonitor Controller", func() { }) It("ClusterQueue Lending Adjustment", func() { - // TODO: tardieu + _, err := nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) + Expect(err).NotTo(HaveOccurred()) + _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) + Expect(err).NotTo(HaveOccurred()) + + // start with 6 gpus + queue := slackQueue(slackQueueName, resource.MustParse("6")) + Expect(k8sClient.Create(ctx, queue)).To(Succeed()) + + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed()) + Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil()) + + // remove 4 gpus, lending limit should be 2 + node1 := getNode(node1Name.Name) + node1.Labels["autopilot.ibm.com/gpuhealth"] = "ERR" + Expect(k8sClient.Update(ctx, node1)).Should(Succeed()) + _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed()) + Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2))) + + // remove another 4 gpus, lending limit should be 0 = max(0, 6-4-4) + node2 := getNode(node2Name.Name) + node2.Labels["autopilot.ibm.com/gpuhealth"] = "ERR" + Expect(k8sClient.Update(ctx, node2)).Should(Succeed()) + _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed()) + Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil()) + Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(0))) + + // restore 4 gpus, lending limit should be 2 + node1.Labels["autopilot.ibm.com/gpuhealth"] = "OK" + Expect(k8sClient.Update(ctx, node1)).Should(Succeed()) + _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed()) + Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil()) + Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2))) + + // restore last 4 gpus, lending limit should be nil + node2.Labels["autopilot.ibm.com/gpuhealth"] = "OK" + Expect(k8sClient.Update(ctx, node2)).Should(Succeed()) + _, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name}) + Expect(err).NotTo(HaveOccurred()) + + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed()) + Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil()) + + Expect(k8sClient.Delete(ctx, queue)).To(Succeed()) }) }) diff --git a/internal/controller/appwrapper/suite_test.go b/internal/controller/appwrapper/suite_test.go index 71bda0ee..1f7aabac 100644 --- a/internal/controller/appwrapper/suite_test.go +++ b/internal/controller/appwrapper/suite_test.go @@ -64,7 +64,9 @@ var _ = BeforeSuite(func() { By("bootstrapping test environment") testEnv = &envtest.Environment{ - CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")}, + CRDDirectoryPaths: []string{ + filepath.Join("..", "..", "..", "config", "crd", "bases"), + filepath.Join("..", "..", "..", "dep-crds", "kueue")}, ErrorIfCRDPathMissing: true, // The BinaryAssetsDirectory is only required if you want to run the tests directly