Skip to content

Commit

Permalink
Add LendingLimit adjustment tests (#216)
Browse files Browse the repository at this point in the history
  • Loading branch information
tardieu authored Jul 24, 2024
1 parent 7281f6f commit abd10ce
Show file tree
Hide file tree
Showing 5 changed files with 84 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ go.work
*.swp
*.swo
*~

# CRDs for unit tests
dep-crds
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,16 @@ fmt: ## Run go fmt against code.
vet: ## Run go vet against code.
go vet ./...

EXTERNAL_CRDS_DIR ?= $(shell pwd)/dep-crds

KUEUE_ROOT = $(shell go list -m -mod=readonly -f "{{.Dir}}" sigs.k8s.io/kueue)
.PHONY: dep-crds
dep-crds: ## Copy CRDs from external operators to dep-crds directory.
mkdir -p $(EXTERNAL_CRDS_DIR)/kueue
cp -f $(KUEUE_ROOT)/config/components/crd/bases/* $(EXTERNAL_CRDS_DIR)/kueue

.PHONY: test
test: manifests generate fmt vet envtest ## Run unit tests.
test: manifests generate fmt vet dep-crds envtest ## Run unit tests.
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./api/... ./internal/... ./pkg/...) -v -ginkgo.v -coverprofile cover.out

.PHONY: install
Expand Down
14 changes: 14 additions & 0 deletions internal/controller/appwrapper/fixtures_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
"sigs.k8s.io/yaml"

workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
Expand Down Expand Up @@ -145,3 +146,16 @@ func malformedPod(milliCPU int64) workloadv1beta2.AppWrapperComponent {
Template: runtime.RawExtension{Raw: jsonBytes},
}
}

func slackQueue(queueName string, nominalQuota resource.Quantity) *kueue.ClusterQueue {
return &kueue.ClusterQueue{
TypeMeta: metav1.TypeMeta{APIVersion: kueue.GroupVersion.String(), Kind: "ClusterQueue"},
ObjectMeta: metav1.ObjectMeta{Name: queueName},
Spec: kueue.ClusterQueueSpec{
ResourceGroups: []kueue.ResourceGroup{{
CoveredResources: []v1.ResourceName{v1.ResourceName("nvidia.com/gpu")},
Flavors: []kueue.FlavorQuotas{{
Name: "default-flavor",
Resources: []kueue.ResourceQuota{{Name: v1.ResourceName("nvidia.com/gpu"), NominalQuota: nominalQuota}}}}}}},
}
}
56 changes: 55 additions & 1 deletion internal/controller/appwrapper/node_health_monitor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
)

var _ = Describe("NodeMonitor Controller", func() {
var slackQueueName = "fake-queue"
var node1Name = types.NamespacedName{Name: "fake-node-1"}
var node2Name = types.NamespacedName{Name: "fake-node-2"}
var nodeMonitor *NodeHealthMonitor
Expand All @@ -47,6 +48,7 @@ var _ = Describe("NodeMonitor Controller", func() {

// Create reconciller
awConfig := config.NewAppWrapperConfig()
awConfig.SlackQueueName = slackQueueName
nodeMonitor = &NodeHealthMonitor{
Client: k8sClient,
Config: awConfig,
Expand Down Expand Up @@ -106,6 +108,58 @@ var _ = Describe("NodeMonitor Controller", func() {
})

It("ClusterQueue Lending Adjustment", func() {
// TODO: tardieu
_, err := nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
Expect(err).NotTo(HaveOccurred())
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
Expect(err).NotTo(HaveOccurred())

// start with 6 gpus
queue := slackQueue(slackQueueName, resource.MustParse("6"))
Expect(k8sClient.Create(ctx, queue)).To(Succeed())

Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil())

// remove 4 gpus, lending limit should be 2
node1 := getNode(node1Name.Name)
node1.Labels["autopilot.ibm.com/gpuhealth"] = "ERR"
Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
Expect(err).NotTo(HaveOccurred())

Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))

// remove another 4 gpus, lending limit should be 0 = max(0, 6-4-4)
node2 := getNode(node2Name.Name)
node2.Labels["autopilot.ibm.com/gpuhealth"] = "ERR"
Expect(k8sClient.Update(ctx, node2)).Should(Succeed())
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
Expect(err).NotTo(HaveOccurred())

Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(0)))

// restore 4 gpus, lending limit should be 2
node1.Labels["autopilot.ibm.com/gpuhealth"] = "OK"
Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
Expect(err).NotTo(HaveOccurred())

Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))

// restore last 4 gpus, lending limit should be nil
node2.Labels["autopilot.ibm.com/gpuhealth"] = "OK"
Expect(k8sClient.Update(ctx, node2)).Should(Succeed())
_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
Expect(err).NotTo(HaveOccurred())

Expect(k8sClient.Get(ctx, types.NamespacedName{Name: slackQueueName}, queue)).Should(Succeed())
Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil())

Expect(k8sClient.Delete(ctx, queue)).To(Succeed())
})
})
4 changes: 3 additions & 1 deletion internal/controller/appwrapper/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ var _ = BeforeSuite(func() {

By("bootstrapping test environment")
testEnv = &envtest.Environment{
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")},
CRDDirectoryPaths: []string{
filepath.Join("..", "..", "..", "config", "crd", "bases"),
filepath.Join("..", "..", "..", "dep-crds", "kueue")},
ErrorIfCRDPathMissing: true,

// The BinaryAssetsDirectory is only required if you want to run the tests directly
Expand Down

0 comments on commit abd10ce

Please sign in to comment.