Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add LendingLimit adjustment tests #216

Merged
merged 1 commit into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,6 @@ go.work
*.swp
*.swo
*~

# CRDs for unit tests
dep-crds
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,16 @@ fmt: ## Run go fmt against code.
vet: ## Run go vet against code.
go vet ./...

# Directory that receives CRDs copied from external operators; defaults to
# dep-crds under the current working directory and may be overridden by the caller.
EXTERNAL_CRDS_DIR ?= $(shell pwd)/dep-crds

# Directory of the sigs.k8s.io/kueue module as reported by `go list -m`.
# Recursively expanded (=), so `go list` only runs when the variable is used.
KUEUE_ROOT = $(shell go list -m -mod=readonly -f "{{.Dir}}" sigs.k8s.io/kueue)
.PHONY: dep-crds
# Creates $(EXTERNAL_CRDS_DIR)/kueue and copies every file from Kueue's
# config/components/crd/bases directory into it, overwriting existing copies.
dep-crds: ## Copy CRDs from external operators to dep-crds directory.
mkdir -p $(EXTERNAL_CRDS_DIR)/kueue
cp -f $(KUEUE_ROOT)/config/components/crd/bases/* $(EXTERNAL_CRDS_DIR)/kueue

.PHONY: test
test: manifests generate fmt vet envtest ## Run unit tests.
test: manifests generate fmt vet dep-crds envtest ## Run unit tests.
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) --bin-dir $(LOCALBIN) -p path)" go test $$(go list ./api/... ./internal/... ./pkg/...) -v -ginkgo.v -coverprofile cover.out

.PHONY: install
Expand Down
14 changes: 14 additions & 0 deletions internal/controller/appwrapper/fixtures_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"
"sigs.k8s.io/controller-runtime/pkg/client"
kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1"
"sigs.k8s.io/yaml"

workloadv1beta2 "github.com/project-codeflare/appwrapper/api/v1beta2"
Expand Down Expand Up @@ -145,3 +146,16 @@ func malformedPod(milliCPU int64) workloadv1beta2.AppWrapperComponent {
Template: runtime.RawExtension{Raw: jsonBytes},
}
}

// slackQueue returns a ClusterQueue named queueName with a single resource
// group that covers "nvidia.com/gpu". The group has one flavor,
// "default-flavor", whose GPU nominal quota is set to nominalQuota.
func slackQueue(queueName string, nominalQuota resource.Quantity) *kueue.ClusterQueue {
	gpu := v1.ResourceName("nvidia.com/gpu")

	// Build the nested spec from the inside out so each level is readable.
	gpuQuota := kueue.ResourceQuota{Name: gpu, NominalQuota: nominalQuota}
	flavor := kueue.FlavorQuotas{
		Name:      "default-flavor",
		Resources: []kueue.ResourceQuota{gpuQuota},
	}
	group := kueue.ResourceGroup{
		CoveredResources: []v1.ResourceName{gpu},
		Flavors:          []kueue.FlavorQuotas{flavor},
	}

	return &kueue.ClusterQueue{
		TypeMeta:   metav1.TypeMeta{APIVersion: kueue.GroupVersion.String(), Kind: "ClusterQueue"},
		ObjectMeta: metav1.ObjectMeta{Name: queueName},
		Spec:       kueue.ClusterQueueSpec{ResourceGroups: []kueue.ResourceGroup{group}},
	}
}
56 changes: 55 additions & 1 deletion internal/controller/appwrapper/node_health_monitor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
)

var _ = Describe("NodeMonitor Controller", func() {
var slackQueueName = "fake-queue"
var node1Name = types.NamespacedName{Name: "fake-node-1"}
var node2Name = types.NamespacedName{Name: "fake-node-2"}
var nodeMonitor *NodeHealthMonitor
Expand All @@ -47,6 +48,7 @@ var _ = Describe("NodeMonitor Controller", func() {

// Create reconciler
awConfig := config.NewAppWrapperConfig()
awConfig.SlackQueueName = slackQueueName
nodeMonitor = &NodeHealthMonitor{
Client: k8sClient,
Config: awConfig,
Expand Down Expand Up @@ -106,6 +108,58 @@ var _ = Describe("NodeMonitor Controller", func() {
})

It("ClusterQueue Lending Adjustment", func() {
	// Verifies that reconciling nodes whose GPU health label changes adjusts
	// the lending limit on the slack ClusterQueue's first resource quota:
	// the limit drops as GPUs become unhealthy and is cleared again once
	// every node is healthy.
	queueKey := types.NamespacedName{Name: slackQueueName}

	_, err := nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
	Expect(err).NotTo(HaveOccurred())
	_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
	Expect(err).NotTo(HaveOccurred())

	// start with 6 gpus
	queue := slackQueue(slackQueueName, resource.MustParse("6"))
	Expect(k8sClient.Create(ctx, queue)).To(Succeed())

	Expect(k8sClient.Get(ctx, queueKey, queue)).Should(Succeed())
	Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil())

	// remove 4 gpus, lending limit should be 2
	node1 := getNode(node1Name.Name)
	node1.Labels["autopilot.ibm.com/gpuhealth"] = "ERR"
	Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
	_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
	Expect(err).NotTo(HaveOccurred())

	Expect(k8sClient.Get(ctx, queueKey, queue)).Should(Succeed())
	// Guard the dereference below so a missing limit is reported as a
	// matcher failure instead of a nil-pointer panic (same as later steps).
	Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
	Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))

	// remove another 4 gpus, lending limit should be 0 = max(0, 6-4-4)
	node2 := getNode(node2Name.Name)
	node2.Labels["autopilot.ibm.com/gpuhealth"] = "ERR"
	Expect(k8sClient.Update(ctx, node2)).Should(Succeed())
	_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
	Expect(err).NotTo(HaveOccurred())

	Expect(k8sClient.Get(ctx, queueKey, queue)).Should(Succeed())
	Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
	Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(0)))

	// restore 4 gpus, lending limit should be 2
	node1.Labels["autopilot.ibm.com/gpuhealth"] = "OK"
	Expect(k8sClient.Update(ctx, node1)).Should(Succeed())
	_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node1Name})
	Expect(err).NotTo(HaveOccurred())

	Expect(k8sClient.Get(ctx, queueKey, queue)).Should(Succeed())
	Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).ShouldNot(BeNil())
	Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit.Value()).Should(Equal(int64(2)))

	// restore last 4 gpus, lending limit should be nil
	node2.Labels["autopilot.ibm.com/gpuhealth"] = "OK"
	Expect(k8sClient.Update(ctx, node2)).Should(Succeed())
	_, err = nodeMonitor.Reconcile(ctx, reconcile.Request{NamespacedName: node2Name})
	Expect(err).NotTo(HaveOccurred())

	Expect(k8sClient.Get(ctx, queueKey, queue)).Should(Succeed())
	Expect(queue.Spec.ResourceGroups[0].Flavors[0].Resources[0].LendingLimit).Should(BeNil())

	// Remove the queue so it does not leak into other specs.
	Expect(k8sClient.Delete(ctx, queue)).To(Succeed())
})
})
4 changes: 3 additions & 1 deletion internal/controller/appwrapper/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,9 @@ var _ = BeforeSuite(func() {

By("bootstrapping test environment")
testEnv = &envtest.Environment{
CRDDirectoryPaths: []string{filepath.Join("..", "..", "..", "config", "crd", "bases")},
CRDDirectoryPaths: []string{
filepath.Join("..", "..", "..", "config", "crd", "bases"),
filepath.Join("..", "..", "..", "dep-crds", "kueue")},
ErrorIfCRDPathMissing: true,

// The BinaryAssetsDirectory is only required if you want to run the tests directly
Expand Down