diff --git a/.github/workflows/code_verify.yaml b/.github/workflows/code_verify.yaml index 0457f9c841..dc1d4ea7d6 100644 --- a/.github/workflows/code_verify.yaml +++ b/.github/workflows/code_verify.yaml @@ -18,7 +18,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Checkout code uses: actions/checkout@v3 diff --git a/.github/workflows/e2e_parallel_jobs.yaml b/.github/workflows/e2e_parallel_jobs.yaml index cc10c723a8..614948e41f 100644 --- a/.github/workflows/e2e_parallel_jobs.yaml +++ b/.github/workflows/e2e_parallel_jobs.yaml @@ -16,7 +16,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Install musl run: | @@ -31,8 +31,8 @@ jobs: - name: Install dependences run: | - GO111MODULE="on" go install sigs.k8s.io/kind@v0.15.0 - curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl + GO111MODULE="on" go install sigs.k8s.io/kind@v0.21.0 + curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.29.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code uses: actions/checkout@v3 diff --git a/.github/workflows/e2e_scheduling_actions.yaml b/.github/workflows/e2e_scheduling_actions.yaml index 2bd495b8c1..667e24e0fc 100644 --- a/.github/workflows/e2e_scheduling_actions.yaml +++ b/.github/workflows/e2e_scheduling_actions.yaml @@ -16,7 +16,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Install musl run: | @@ -31,8 +31,8 @@ jobs: - name: Install dependences run: | - GO111MODULE="on" go install sigs.k8s.io/kind@v0.15.0 - curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl + GO111MODULE="on" go install sigs.k8s.io/kind@v0.21.0 + curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.29.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code uses: actions/checkout@v3 diff --git a/.github/workflows/e2e_scheduling_basic.yaml b/.github/workflows/e2e_scheduling_basic.yaml index 5ef5d941da..f1a68c49c9 100644 --- a/.github/workflows/e2e_scheduling_basic.yaml +++ b/.github/workflows/e2e_scheduling_basic.yaml @@ -16,7 +16,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Install musl run: | @@ -31,8 +31,8 @@ jobs: - name: Install dependences run: | - GO111MODULE="on" go install sigs.k8s.io/kind@v0.15.0 - curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl + GO111MODULE="on" go install sigs.k8s.io/kind@v0.21.0 + curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.29.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code uses: actions/checkout@v3 diff --git a/.github/workflows/e2e_sequence.yaml b/.github/workflows/e2e_sequence.yaml index dc800f7a07..7b4d8a2f45 100644 --- a/.github/workflows/e2e_sequence.yaml +++ b/.github/workflows/e2e_sequence.yaml @@ -16,7 +16,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Install musl run: | @@ -31,8 +31,8 @@ jobs: - name: Install dependences run: | - GO111MODULE="on" go install sigs.k8s.io/kind@v0.15.0 - curl -LO 
https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl + GO111MODULE="on" go install sigs.k8s.io/kind@v0.21.0 + curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.29.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code uses: actions/checkout@v3 diff --git a/.github/workflows/e2e_spark.yaml b/.github/workflows/e2e_spark.yaml index a4c226fe12..cc35a90200 100644 --- a/.github/workflows/e2e_spark.yaml +++ b/.github/workflows/e2e_spark.yaml @@ -50,7 +50,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Set up Docker Buildx id: buildx uses: docker/setup-buildx-action@v2 diff --git a/.github/workflows/e2e_vcctl.yaml b/.github/workflows/e2e_vcctl.yaml index 7ab38a0d14..1fae0e4601 100644 --- a/.github/workflows/e2e_vcctl.yaml +++ b/.github/workflows/e2e_vcctl.yaml @@ -16,7 +16,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Install musl run: | @@ -31,8 +31,8 @@ jobs: - name: Install dependences run: | - GO111MODULE="on" go install sigs.k8s.io/kind@v0.15.0 - curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.23.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl + GO111MODULE="on" go install sigs.k8s.io/kind@v0.21.0 + curl -LO https://storage.googleapis.com/kubernetes-release/release/v1.29.0/bin/linux/amd64/kubectl && sudo install kubectl /usr/local/bin/kubectl - name: Checkout code uses: actions/checkout@v3 diff --git a/.github/workflows/fossa.yml b/.github/workflows/fossa.yml index d77045d0fd..796948ebb6 100644 --- a/.github/workflows/fossa.yml +++ b/.github/workflows/fossa.yml @@ -12,7 +12,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - run: go version # Runs a set of commands to initialize and analyze with FOSSA - name: run FOSSA analysis diff --git a/.github/workflows/licenses_lint.yaml b/.github/workflows/licenses_lint.yaml index 8b96fd9a9f..4a61624008 100644 --- a/.github/workflows/licenses_lint.yaml +++ b/.github/workflows/licenses_lint.yaml @@ -16,7 +16,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Checkout code uses: actions/checkout@v3 - name: generate license mirror diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c091092896..40c2847ab5 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -18,7 +18,7 @@ jobs: - name: Install Go uses: actions/setup-go@v4 with: - go-version: 1.20.x + go-version: 1.21.x - name: Install musl run: | diff --git a/config/crd/volcano/bases/scheduling.volcano.sh_queues.yaml b/config/crd/volcano/bases/scheduling.volcano.sh_queues.yaml index 7b5269201b..6ce160cce1 100644 --- a/config/crd/volcano/bases/scheduling.volcano.sh_queues.yaml +++ b/config/crd/volcano/bases/scheduling.volcano.sh_queues.yaml @@ -82,6 +82,16 @@ spec: x-kubernetes-int-or-string: true description: ResourceList is a set of (resource name, quantity) pairs. type: object + deserved: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: The amount of resources configured by the user. 
This + part of resource can be shared with other queues and reclaimed back. + type: object extendClusters: description: extendCluster indicate the jobs in this Queue will be dispatched to these clusters. diff --git a/config/crd/volcano/v1beta1/scheduling.volcano.sh_queues.yaml b/config/crd/volcano/v1beta1/scheduling.volcano.sh_queues.yaml index 8681f1993c..8edb32e37b 100644 --- a/config/crd/volcano/v1beta1/scheduling.volcano.sh_queues.yaml +++ b/config/crd/volcano/v1beta1/scheduling.volcano.sh_queues.yaml @@ -81,6 +81,16 @@ spec: x-kubernetes-int-or-string: true description: ResourceList is a set of (resource name, quantity) pairs. type: object + deserved: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: The amount of resources configured by the user. This part + of resource can be shared with other queues and reclaimed back. + type: object extendClusters: description: extendCluster indicate the jobs in this Queue will be dispatched to these clusters. diff --git a/docs/user-guide/how_to_use_capacity_plugin.md b/docs/user-guide/how_to_use_capacity_plugin.md new file mode 100644 index 0000000000..93def4062b --- /dev/null +++ b/docs/user-guide/how_to_use_capacity_plugin.md @@ -0,0 +1,191 @@ +# Capacity Plugin User Guide + +## Introduction + +The capacity plugin is a replacement for the proportion plugin. Instead of deriving a queue's deserved resources from its weight, it implements elastic queue capacity management, i.e., a resource borrowing and lending mechanism between queues, by letting you specify the deserved amount for each resource dimension of a queue. + +A queue can use the idle resources of other queues. When those queues later submit jobs, they can reclaim the resources they have lent, up to the amount of their deserved resources. For more detail, please see [Capacity scheduling design](../design/capacity-scheduling.md). + +## Environment setup + +### Install volcano + +Refer to [Install Guide](https://github.com/volcano-sh/volcano/blob/master/installer/README.md) to install volcano. + +After installation, update the scheduler configuration: + +```shell +kubectl edit cm -n volcano-system volcano-scheduler-configmap +``` + +Make sure the capacity plugin is enabled and remove the proportion plugin. + +Note: the capacity and proportion plugins conflict with each other and cannot be used together. + +```yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: volcano-scheduler-configmap + namespace: volcano-system +data: + volcano-scheduler.conf: | + actions: "enqueue, allocate, backfill" + tiers: + - plugins: + - name: priority + - name: gang + enablePreemptable: false + - name: conformance + - plugins: + - name: drf + enablePreemptable: false + - name: predicates + - name: capacity # add this field and remove proportion plugin. + - name: nodeorder + - name: binpack +``` + +## Config queue's deserved resources + +Assume your Kubernetes cluster has two nodes and two queues named queue1 and queue2, and each node has 4 CPU and 16Gi memory, so the cluster has 8 CPU and 32Gi memory in total. + +```yaml +allocatable: + cpu: "4" + memory: 16Gi + pods: "110" +``` + +Configure queue1's deserved field with 2 CPU and 8Gi memory.
+ +```yaml +apiVersion: scheduling.volcano.sh/v1beta1 +kind: Queue +metadata: + name: queue1 +spec: + reclaimable: true + deserved: # set the deserved field. + cpu: 2 + memory: 8Gi +``` + +Configure queue2's deserved field with 6 CPU and 24Gi memory. + +```yaml +apiVersion: scheduling.volcano.sh/v1beta1 +kind: Queue +metadata: + name: queue2 +spec: + reclaimable: true + deserved: # set the deserved field. + cpu: 6 + memory: 24Gi +``` + +## Submit pods to each queue + +First, submit a deployment named demo-1 to queue1 with replicas=8, where each pod requests 1 CPU and 4Gi memory. Because queue2 is idle, queue1 can use the whole cluster's resources, and all 8 pods reach the Running state. + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: demo-1 +spec: + selector: + matchLabels: + app: demo-1 + replicas: 8 + template: + metadata: + labels: + app: demo-1 + annotations: + scheduling.volcano.sh/queue-name: "queue1" # set the queue + spec: + schedulerName: volcano + containers: + - name: nginx + image: nginx:1.14.2 + resources: + requests: + cpu: 1 + memory: 4Gi + ports: + - containerPort: 80 +``` + +Expected result: + +```shell +$ kubectl get po +NAME READY STATUS RESTARTS AGE +demo-1-7bc649f544-2wjg7 1/1 Running 0 5s +demo-1-7bc649f544-cvsmr 1/1 Running 0 5s +demo-1-7bc649f544-j5lzp 1/1 Running 0 5s +demo-1-7bc649f544-jvlbx 1/1 Running 0 5s +demo-1-7bc649f544-mzgg2 1/1 Running 0 5s +demo-1-7bc649f544-ntrs2 1/1 Running 0 5s +demo-1-7bc649f544-nv424 1/1 Running 0 5s +demo-1-7bc649f544-zd6d9 1/1 Running 0 5s +``` + +Then submit a deployment named demo-2 to queue2 with replicas=8, where each pod also requests 1 CPU and 4Gi memory. + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: demo-2 +spec: + selector: + matchLabels: + app: demo-2 + replicas: 8 + template: + metadata: + labels: + app: demo-2 + annotations: + scheduling.volcano.sh/queue-name: "queue2" # set the queue + spec: + schedulerName: volcano + containers: + - name: nginx + image: nginx:1.14.2 + resources: + requests: + cpu: 1 + memory: 4Gi + ports: + - containerPort: 80 +``` + +Because queue1 has occupied queue2's resources, queue2 reclaims its deserved resources of 6 CPU and 24Gi memory. Since each demo-2 pod requests 1 CPU and 4Gi memory, 6 demo-2 pods end up Running and some of demo-1's pods are evicted. + +Finally, demo-1 (belonging to queue1) is left with 2 Running pods and demo-2 (belonging to queue2) with 6 Running pods, which matches each queue's deserved resources; the pod listing after the optional queue check below confirms this.
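As an optional check (a sketch; which status fields the Queue objects expose depends on your Volcano version), you can inspect the two queues directly and compare their spec.deserved with what the scheduler has currently granted them:

```shell
# Queues are cluster-scoped, so no namespace flag is needed.
kubectl get queue queue1 queue2 -o yaml

# Or a condensed view of a single queue:
kubectl describe queue queue1
```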
+ +```shell +$ kubectl get po +NAME READY STATUS RESTARTS AGE +demo-1-7bc649f544-4vvdv 0/1 Pending 0 37s +demo-1-7bc649f544-c6mds 0/1 Pending 0 37s +demo-1-7bc649f544-j5lzp 1/1 Running 0 14m +demo-1-7bc649f544-mzgg2 1/1 Running 0 14m +demo-1-7bc649f544-pqdgk 0/1 Pending 0 37s +demo-1-7bc649f544-tx6wp 0/1 Pending 0 37s +demo-1-7bc649f544-wmshq 0/1 Pending 0 37s +demo-1-7bc649f544-wrhrr 0/1 Pending 0 37s +demo-2-6dfb86c49b-2jvgm 0/1 Pending 0 37s +demo-2-6dfb86c49b-dnjzv 1/1 Running 0 37s +demo-2-6dfb86c49b-fzvmp 1/1 Running 0 37s +demo-2-6dfb86c49b-jlf69 1/1 Running 0 37s +demo-2-6dfb86c49b-k62f7 1/1 Running 0 37s +demo-2-6dfb86c49b-k9b9v 1/1 Running 0 37s +demo-2-6dfb86c49b-rpzvg 0/1 Pending 0 37s +demo-2-6dfb86c49b-zch7w 1/1 Running 0 37s +``` + diff --git a/docs/user-guide/how_to_use_gpu_number.md b/docs/user-guide/how_to_use_gpu_number.md index b4a38a22a3..10da190655 100644 --- a/docs/user-guide/how_to_use_gpu_number.md +++ b/docs/user-guide/how_to_use_gpu_number.md @@ -1,9 +1,5 @@ # GPU Number User guide -## Note: GPU Number will be deprecated in volcano v1.9, please use volcano vgpu instead - -[Volcano vgpu](./how_to_use_vgpu.md) - ## Environment setup ### Install volcano @@ -12,12 +8,43 @@ Refer to [Install Guide](../../installer/README.md) to install volcano. +> **Note** The Volcano VGPU feature has been transferred to the HAMI project, click [here](https://github.com/Project-HAMi/volcano-vgpu-device-plugin) to access + After installed, update the scheduler configuration: ```shell script kubectl edit cm -n volcano-system volcano-scheduler-configmap ``` +For volcano v1.8.2+(v1.8.2 excluded), use the following configMap + +```yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: volcano-scheduler-configmap + namespace: volcano-system +data: + volcano-scheduler.conf: | + actions: "enqueue, allocate, backfill" + tiers: + - plugins: + - name: priority + - name: gang + - name: conformance + - plugins: + - name: drf + - name: deviceshare + arguments: + deviceshare.GPUNumberEnable: true # enable gpu number + - name: predicates + - name: proportion + - name: nodeorder + - name: binpack +``` + +For volcano v1.8.2-(v1.8.2 included), use the following configMap + ```yaml kind: ConfigMap apiVersion: v1 diff --git a/docs/user-guide/how_to_use_gpu_sharing.md b/docs/user-guide/how_to_use_gpu_sharing.md index 5f760cd3c3..d2b2f32e41 100644 --- a/docs/user-guide/how_to_use_gpu_sharing.md +++ b/docs/user-guide/how_to_use_gpu_sharing.md @@ -1,9 +1,5 @@ # GPU Sharing User guide -## Note: GPU Sharing will be deprecated in volcano v1.9, please use volcano vgpu instead - -[Volcano vgpu](./how_to_use_vgpu.md) - ## Environment setup ### Install volcano @@ -12,12 +8,43 @@ Refer to [Install Guide](../../installer/README.md) to install volcano. 
+> **Note** The Volcano VGPU feature has been transferred to the HAMI project, click [here](https://github.com/Project-HAMi/volcano-vgpu-device-plugin) to access + After installed, update the scheduler configuration: ```shell script kubectl edit cm -n volcano-system volcano-scheduler-configmap ``` +For volcano v1.8.2+(v1.8.2 excluded), use the following configMap + +```yaml +kind: ConfigMap +apiVersion: v1 +metadata: + name: volcano-scheduler-configmap + namespace: volcano-system +data: + volcano-scheduler.conf: | + actions: "enqueue, allocate, backfill" + tiers: + - plugins: + - name: priority + - name: gang + - name: conformance + - plugins: + - name: drf + - name: deviceshare + arguments: + deviceshare.GPUSharingEnable: true # enable gpu sharing + - name: predicates + - name: proportion + - name: nodeorder + - name: binpack +``` + +For volcano v1.8.2-(v1.8.2 included), use the following configMap + ```yaml kind: ConfigMap apiVersion: v1 diff --git a/docs/user-guide/how_to_use_vgpu.md b/docs/user-guide/how_to_use_vgpu.md deleted file mode 100644 index 7512b54629..0000000000 --- a/docs/user-guide/how_to_use_vgpu.md +++ /dev/null @@ -1,195 +0,0 @@ -# GPU Sharing User guide - -## Contact And Bug Report - -Thanks staff of 4paradigm.com for contributing this feature to volcano, if you encountered any issues, feel free to submit an issue, or sending an email to - -## Main features - -***GPU sharing***: Each task can allocate a portion of GPU instead of a whole GPU card, thus GPU can be shared among multiple tasks. - -***Device Memory Control***: GPUs can be allocated with certain device memory size (i.e 3000M) or device memory percentage of whole GPU(i.e 50%) and have made it that it does not exceed the boundary. - -***Easy to use***: See the examples below. - -## Environment setup - -### Install volcano - -#### 1. Install from source - -Refer to [Install Guide](../../installer/README.md) to install volcano. - -After installed, update the scheduler configuration: - -```shell script -kubectl edit cm -n volcano-system volcano-scheduler-configmap -``` - -```yaml -kind: ConfigMap -apiVersion: v1 -metadata: - name: volcano-scheduler-configmap - namespace: volcano-system -data: - volcano-scheduler.conf: | - actions: "enqueue, allocate, backfill" - tiers: - - plugins: - - name: priority - - name: gang - - name: conformance - - plugins: - - name: drf - - name: predicates - arguments: - predicate.VGPUEnable: true # enable vgpu - - name: proportion - - name: nodeorder - - name: binpack -``` - -#### 2. Install from release package - -Same as above, after installed, update the scheduler configuration in `volcano-scheduler-configmap` configmap. - -### Install Volcano device plugin - -Please refer to [volcano device plugin](https://github.com/volcano-sh/devices/blob/master/README.md#quick-start) - -### Verify environment is ready - -Check the node status, it is ok if `volcano.sh/vgpu-number` is included in the allocatable resources. - -```shell script -$ kubectl get node {node name} -oyaml -... 
-status: - addresses: - - address: 172.17.0.3 - type: InternalIP - - address: volcano-control-plane - type: Hostname - allocatable: - cpu: "4" - ephemeral-storage: 123722704Ki - hugepages-1Gi: "0" - hugepages-2Mi: "0" - memory: 8174332Ki - pods: "110" - volcano.sh/gpu-number: "10" # vGPU resource - capacity: - cpu: "4" - ephemeral-storage: 123722704Ki - hugepages-1Gi: "0" - hugepages-2Mi: "0" - memory: 8174332Ki - pods: "110" - volcano.sh/gpu-memory: "89424" - volcano.sh/gpu-number: "10" # vGPU resource -``` - -### Running GPU Sharing Jobs - -NVIDIA GPUs can now be shared via container level resource requirements using the resource name `volcano.sh/vgpu-memory` and `volcano.sh/vgpu-number`: - -```shell script -$ cat </dev/null 2>&1 if [[ $? -ne 0 ]]; then echo "Installing kind ..." - go install sigs.k8s.io/kind@v0.15.0 + go install sigs.k8s.io/kind@v0.21.0 else echo -n "Found kind, version: " && kind version fi diff --git a/hack/run-e2e-kind.sh b/hack/run-e2e-kind.sh index 62173b1f2e..65ce93c219 100755 --- a/hack/run-e2e-kind.sh +++ b/hack/run-e2e-kind.sh @@ -73,6 +73,19 @@ function generate-log { kubectl logs deployment/${CLUSTER_NAME}-scheduler -n kube-system > volcano-scheduler.log } +function show-log() { + log_files=("volcano-admission.log" "volcano-controller.log" "volcano-scheduler.log") + for log_file in "${log_files[@]}"; do + if [ -f "$log_file" ]; then + echo "Showing ${log_file}..." + cat "$log_file" + else + echo "${log_file} not found" + fi + done +} + + # clean up function cleanup { uninstall-volcano @@ -81,8 +94,7 @@ function cleanup { kind delete cluster ${CLUSTER_CONTEXT} if [[ ${SHOW_VOLCANO_LOGS} -eq 1 ]]; then - #TODO: Add volcano logs support in future. - echo "Volcano logs are currently not supported." + show-log fi } diff --git a/installer/dockerfile/controller-manager/Dockerfile b/installer/dockerfile/controller-manager/Dockerfile index e09eac7328..6a3fde947d 100644 --- a/installer/dockerfile/controller-manager/Dockerfile +++ b/installer/dockerfile/controller-manager/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM golang:1.20.1 AS builder +FROM golang:1.21.8 AS builder WORKDIR /go/src/volcano.sh/ COPY go.mod go.sum ./ RUN go mod download diff --git a/installer/dockerfile/scheduler/Dockerfile b/installer/dockerfile/scheduler/Dockerfile index dffa8924f6..78749af9ff 100644 --- a/installer/dockerfile/scheduler/Dockerfile +++ b/installer/dockerfile/scheduler/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -FROM golang:1.20.1 AS builder +FROM golang:1.21.8 AS builder WORKDIR /go/src/volcano.sh/ COPY go.mod go.sum ./ RUN go mod download diff --git a/installer/dockerfile/webhook-manager/Dockerfile b/installer/dockerfile/webhook-manager/Dockerfile index 3edcf278d6..fe9e13a6a0 100644 --- a/installer/dockerfile/webhook-manager/Dockerfile +++ b/installer/dockerfile/webhook-manager/Dockerfile @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-FROM golang:1.20.1 AS builder +FROM golang:1.21.8 AS builder WORKDIR /go/src/volcano.sh/ COPY go.mod go.sum ./ RUN go mod download diff --git a/installer/helm/chart/volcano/crd/bases/scheduling.volcano.sh_queues.yaml b/installer/helm/chart/volcano/crd/bases/scheduling.volcano.sh_queues.yaml index a6b4e30b84..ae8972d063 100644 --- a/installer/helm/chart/volcano/crd/bases/scheduling.volcano.sh_queues.yaml +++ b/installer/helm/chart/volcano/crd/bases/scheduling.volcano.sh_queues.yaml @@ -80,6 +80,16 @@ spec: x-kubernetes-int-or-string: true description: ResourceList is a set of (resource name, quantity) pairs. type: object + deserved: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: The amount of resources configured by the user. This + part of resource can be shared with other queues and reclaimed back. + type: object extendClusters: description: extendCluster indicate the jobs in this Queue will be dispatched to these clusters. diff --git a/installer/helm/chart/volcano/crd/v1beta1/scheduling.volcano.sh_queues.yaml b/installer/helm/chart/volcano/crd/v1beta1/scheduling.volcano.sh_queues.yaml index 141f90dfdd..94d1e3f556 100644 --- a/installer/helm/chart/volcano/crd/v1beta1/scheduling.volcano.sh_queues.yaml +++ b/installer/helm/chart/volcano/crd/v1beta1/scheduling.volcano.sh_queues.yaml @@ -79,6 +79,16 @@ spec: x-kubernetes-int-or-string: true description: ResourceList is a set of (resource name, quantity) pairs. type: object + deserved: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: The amount of resources configured by the user. This part + of resource can be shared with other queues and reclaimed back. + type: object extendClusters: description: extendCluster indicate the jobs in this Queue will be dispatched to these clusters. 
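The hunks above add the same `deserved` field (an int-or-string quantity map) to the Queue CRD shipped in the Helm chart and in the static manifests. As a hedged sketch of setting the field on an already-existing queue (assuming a queue named `default` exists and the updated CRD has been applied; the values are purely illustrative), quantities may be given either as plain integers or as quantity strings such as `8Gi`:

```shell
# Merge-patch the deserved resources of an existing queue.
kubectl patch queue default --type=merge \
  -p '{"spec":{"deserved":{"cpu":"2","memory":"8Gi"}}}'
```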
diff --git a/installer/helm/chart/volcano/templates/controllers.yaml b/installer/helm/chart/volcano/templates/controllers.yaml index 3ec0c7c4fa..c312e94289 100644 --- a/installer/helm/chart/volcano/templates/controllers.yaml +++ b/installer/helm/chart/volcano/templates/controllers.yaml @@ -47,7 +47,7 @@ rules: verbs: ["get", "list", "watch", "create", "delete", "update"] - apiGroups: [""] resources: ["secrets"] - verbs: ["get", "list", "watch", "create", "delete", "update"] + verbs: ["get", "create", "delete", "update"] - apiGroups: ["scheduling.incubator.k8s.io", "scheduling.volcano.sh"] resources: ["podgroups", "queues", "queues/status"] verbs: ["get", "list", "watch", "create", "delete", "update"] diff --git a/installer/volcano-development.yaml b/installer/volcano-development.yaml index 2c77b1182a..2fa68640f0 100644 --- a/installer/volcano-development.yaml +++ b/installer/volcano-development.yaml @@ -3968,7 +3968,7 @@ rules: verbs: ["get", "list", "watch", "create", "delete", "update"] - apiGroups: [""] resources: ["secrets"] - verbs: ["get", "list", "watch", "create", "delete", "update"] + verbs: ["get", "create", "delete", "update"] - apiGroups: ["scheduling.incubator.k8s.io", "scheduling.volcano.sh"] resources: ["podgroups", "queues", "queues/status"] verbs: ["get", "list", "watch", "create", "delete", "update"] @@ -4468,6 +4468,16 @@ spec: x-kubernetes-int-or-string: true description: ResourceList is a set of (resource name, quantity) pairs. type: object + deserved: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: The amount of resources configured by the user. This + part of resource can be shared with other queues and reclaimed back. + type: object extendClusters: description: extendCluster indicate the jobs in this Queue will be dispatched to these clusters. diff --git a/pkg/scheduler/actions/allocate/allocate_test.go b/pkg/scheduler/actions/allocate/allocate_test.go index cfab5ed80c..24007f5cd0 100644 --- a/pkg/scheduler/actions/allocate/allocate_test.go +++ b/pkg/scheduler/actions/allocate/allocate_test.go @@ -165,7 +165,7 @@ func TestAllocate(t *testing.T) { t.Run(test.name, func(t *testing.T) { binder := &util.FakeBinder{ Binds: map[string]string{}, - Channel: make(chan string), + Channel: make(chan string, 10), } schedulerCache := &cache.SchedulerCache{ Nodes: make(map[string]*api.NodeInfo), @@ -323,7 +323,7 @@ func TestAllocateWithDynamicPVC(t *testing.T) { fakeVolumeBinder := util.NewFakeVolumeBinder(kubeClient) binder := &util.FakeBinder{ Binds: map[string]string{}, - Channel: make(chan string), + Channel: make(chan string, 10), } schedulerCache := &cache.SchedulerCache{ Nodes: make(map[string]*api.NodeInfo), diff --git a/pkg/scheduler/actions/preempt/preempt.go b/pkg/scheduler/actions/preempt/preempt.go index 0ff3032b16..504c635dac 100644 --- a/pkg/scheduler/actions/preempt/preempt.go +++ b/pkg/scheduler/actions/preempt/preempt.go @@ -209,13 +209,11 @@ func preempt( } predicateFn := func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) { - // Allows scheduling to nodes that are in Success or Unschedulable state after filtering by predicate. 
var statusSets util.StatusSets - statusSets, err := ssn.PredicateFn(task, node) - if err != nil { - return nil, api.NewFitError(task, node, err.Error()) - } + statusSets, _ = ssn.PredicateFn(task, node) + // When filtering candidate nodes, need to consider the node statusSets instead of the err information. + // refer to kube-scheduler preemption code: https://github.com/kubernetes/kubernetes/blob/9d87fa215d9e8020abdc17132d1252536cd752d2/pkg/scheduler/framework/preemption/preemption.go#L422 if statusSets.ContainsUnschedulableAndUnresolvable() || statusSets.ContainsErrorSkipOrWait() { return nil, api.NewFitError(task, node, statusSets.Message()) } @@ -255,17 +253,7 @@ func preempt( continue } - victimsQueue := util.NewPriorityQueue(func(l, r interface{}) bool { - lv := l.(*api.TaskInfo) - rv := r.(*api.TaskInfo) - if lv.Job != rv.Job { - return !ssn.JobOrderFn(ssn.Jobs[lv.Job], ssn.Jobs[rv.Job]) - } - return !ssn.TaskOrderFn(l, r) - }) - for _, victim := range victims { - victimsQueue.Push(victim) - } + victimsQueue := ssn.BuildVictimsPriorityQueue(victims) // Preempt victims for tasks, pick lowest priority task first. preempted := api.EmptyResource() diff --git a/pkg/scheduler/actions/reclaim/reclaim.go b/pkg/scheduler/actions/reclaim/reclaim.go index 430ec99a98..a0c9612e81 100644 --- a/pkg/scheduler/actions/reclaim/reclaim.go +++ b/pkg/scheduler/actions/reclaim/reclaim.go @@ -95,6 +95,10 @@ func (ra *Action) Execute(ssn *framework.Session) { klog.V(3).Infof("Queue <%s> is overused, ignore it.", queue.Name) continue } + if !ssn.Preemptive(queue) { + klog.V(3).Infof("Queue <%s> can not reclaim by preempt others, ignore it.", queue.Name) + continue + } // Found "high" priority job jobs, found := preemptorsMap[queue.UID] @@ -124,14 +128,10 @@ func (ra *Action) Execute(ssn *framework.Session) { assigned := false for _, n := range ssn.Nodes { var statusSets util.StatusSets - statusSets, err := ssn.PredicateFn(task, n) - if err != nil { - klog.V(5).Infof("reclaim predicates failed for task <%s/%s> on node <%s>: %v", - task.Namespace, task.Name, n.Name, err) - continue - } + statusSets, _ = ssn.PredicateFn(task, n) - // Allows scheduling to nodes that are in Success or Unschedulable state after filtering by predicate. + // When filtering candidate nodes, need to consider the node statusSets instead of the err information. + // refer to kube-scheduler preemption code: https://github.com/kubernetes/kubernetes/blob/9d87fa215d9e8020abdc17132d1252536cd752d2/pkg/scheduler/framework/preemption/preemption.go#L422 if statusSets.ContainsUnschedulableAndUnresolvable() || statusSets.ContainsErrorSkipOrWait() { klog.V(5).Infof("predicates failed in reclaim for task <%s/%s> on node <%s>, reason is %s.", task.Namespace, task.Name, n.Name, statusSets.Message()) @@ -174,11 +174,14 @@ func (ra *Action) Execute(ssn *framework.Session) { continue } + victimsQueue := ssn.BuildVictimsPriorityQueue(victims) + resreq := task.InitResreq.Clone() reclaimed := api.EmptyResource() // Reclaim victims for tasks. 
- for _, reclaimee := range victims { + for !victimsQueue.Empty() { + reclaimee := victimsQueue.Pop().(*api.TaskInfo) klog.Errorf("Try to reclaim Task <%s/%s> for Tasks <%s/%s>", reclaimee.Namespace, reclaimee.Name, task.Namespace, task.Name) if err := ssn.Evict(reclaimee, "reclaim"); err != nil { diff --git a/pkg/scheduler/actions/reclaim/reclaim_test.go b/pkg/scheduler/actions/reclaim/reclaim_test.go index 2dc3b379a7..208292430e 100644 --- a/pkg/scheduler/actions/reclaim/reclaim_test.go +++ b/pkg/scheduler/actions/reclaim/reclaim_test.go @@ -17,145 +17,123 @@ limitations under the License. package reclaim import ( - "reflect" "testing" - "time" - "github.com/agiledragon/gomonkey/v2" v1 "k8s.io/api/core/v1" schedulingv1 "k8s.io/api/scheduling/v1" - "k8s.io/client-go/tools/record" schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" "volcano.sh/volcano/pkg/scheduler/api" - "volcano.sh/volcano/pkg/scheduler/cache" "volcano.sh/volcano/pkg/scheduler/conf" "volcano.sh/volcano/pkg/scheduler/framework" "volcano.sh/volcano/pkg/scheduler/plugins/conformance" "volcano.sh/volcano/pkg/scheduler/plugins/gang" + "volcano.sh/volcano/pkg/scheduler/plugins/priority" "volcano.sh/volcano/pkg/scheduler/plugins/proportion" + "volcano.sh/volcano/pkg/scheduler/uthelper" "volcano.sh/volcano/pkg/scheduler/util" ) func TestReclaim(t *testing.T) { - var tmp *cache.SchedulerCache - patchUpdateQueueStatus := gomonkey.ApplyMethod(reflect.TypeOf(tmp), "UpdateQueueStatus", func(scCache *cache.SchedulerCache, queue *api.QueueInfo) error { - return nil - }) - defer patchUpdateQueueStatus.Reset() - - framework.RegisterPluginBuilder("conformance", conformance.New) - framework.RegisterPluginBuilder("gang", gang.New) - framework.RegisterPluginBuilder("proportion", proportion.New) - defer framework.CleanupPluginBuilders() - - tests := []struct { - name string - podGroups []*schedulingv1beta1.PodGroup - pods []*v1.Pod - nodes []*v1.Node - queues []*schedulingv1beta1.Queue - expected int - }{ + tests := []uthelper.TestCommonStruct{ { - name: "Two Queue with one Queue overusing resource, should reclaim", - podGroups: []*schedulingv1beta1.PodGroup{ + Name: "Two Queue with one Queue overusing resource, should reclaim", + Plugins: map[string]framework.PluginBuilder{ + conformance.PluginName: conformance.New, + gang.PluginName: gang.New, + proportion.PluginName: proportion.New, + }, + PodGroups: []*schedulingv1beta1.PodGroup{ util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, nil, schedulingv1beta1.PodGroupInqueue, "low-priority"), util.BuildPodGroupWithPrio("pg2", "c1", "q2", 0, nil, schedulingv1beta1.PodGroupInqueue, "high-priority"), }, - pods: []*v1.Pod{ - util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), - util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", make(map[string]string), make(map[string]string)), - util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", make(map[string]string), make(map[string]string)), + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + 
util.BuildPod("c1", "preemptee3", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg2", make(map[string]string), make(map[string]string)), }, - nodes: []*v1.Node{ + Nodes: []*v1.Node{ util.BuildNode("n1", api.BuildResourceList("3", "3Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), }, - queues: []*schedulingv1beta1.Queue{ + Queues: []*schedulingv1beta1.Queue{ util.BuildQueue("q1", 1, nil), util.BuildQueue("q2", 1, nil), }, - expected: 1, + EvictNum: 1, + Evicted: []string{"c1/preemptee2"}, // let pod2 in the middle when sort tasks be preemptable and will not disturb + }, + { + Name: "sort reclaimees when reclaiming from overusing queue", + Plugins: map[string]framework.PluginBuilder{ + conformance.PluginName: conformance.New, + gang.PluginName: gang.New, + priority.PluginName: priority.New, + proportion.PluginName: proportion.New, + }, + PriClass: []*schedulingv1.PriorityClass{ + util.BuildPriorityClass("low-priority", 100), + util.BuildPriorityClass("mid-priority", 500), + util.BuildPriorityClass("high-priority", 1000), + }, + PodGroups: []*schedulingv1beta1.PodGroup{ + util.BuildPodGroupWithPrio("pg1", "c1", "q1", 0, nil, schedulingv1beta1.PodGroupInqueue, "mid-priority"), + util.BuildPodGroupWithPrio("pg2", "c1", "q2", 0, nil, schedulingv1beta1.PodGroupInqueue, "low-priority"), // reclaimed first + util.BuildPodGroupWithPrio("pg3", "c1", "q3", 0, nil, schedulingv1beta1.PodGroupInqueue, "high-priority"), + }, + Pods: []*v1.Pod{ + util.BuildPod("c1", "preemptee1-1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee1-2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg1", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2-1", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "true"}, make(map[string]string)), + util.BuildPod("c1", "preemptee2-2", "n1", v1.PodRunning, api.BuildResourceList("1", "1G"), "pg2", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)), + util.BuildPod("c1", "preemptor1", "", v1.PodPending, api.BuildResourceList("1", "1G"), "pg3", make(map[string]string), make(map[string]string)), + }, + Nodes: []*v1.Node{ + util.BuildNode("n1", api.BuildResourceList("4", "4Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), make(map[string]string)), + }, + Queues: []*schedulingv1beta1.Queue{ + util.BuildQueue("q1", 1, nil), + util.BuildQueue("q2", 1, nil), + util.BuildQueue("q3", 1, nil), + }, + EvictNum: 1, + Evicted: []string{"c1/preemptee2-1"}, // low priority job's preemptable pod is evicted }, } reclaim := New() - - for i, test := range tests { - binder := &util.FakeBinder{ - Binds: map[string]string{}, - Channel: make(chan string), - } - evictor := &util.FakeEvictor{ - Channel: make(chan string), - } - schedulerCache := &cache.SchedulerCache{ - Nodes: make(map[string]*api.NodeInfo), - Jobs: make(map[api.JobID]*api.JobInfo), - Queues: make(map[api.QueueID]*api.QueueInfo), - Binder: binder, - Evictor: evictor, - StatusUpdater: &util.FakeStatusUpdater{}, - VolumeBinder: &util.FakeVolumeBinder{}, - PriorityClasses: 
make(map[string]*schedulingv1.PriorityClass), - - Recorder: record.NewFakeRecorder(100), - } - schedulerCache.PriorityClasses["high-priority"] = &schedulingv1.PriorityClass{ - Value: 100000, - } - schedulerCache.PriorityClasses["low-priority"] = &schedulingv1.PriorityClass{ - Value: 10, - } - for _, node := range test.nodes { - schedulerCache.AddOrUpdateNode(node) - } - for _, pod := range test.pods { - schedulerCache.AddPod(pod) - } - - for _, ss := range test.podGroups { - schedulerCache.AddPodGroupV1beta1(ss) - } - - for _, q := range test.queues { - schedulerCache.AddQueueV1beta1(q) - } - - trueValue := true - ssn := framework.OpenSession(schedulerCache, []conf.Tier{ - { - Plugins: []conf.PluginOption{ - { - Name: "conformance", - EnabledReclaimable: &trueValue, - }, - { - Name: "gang", - EnabledReclaimable: &trueValue, - }, - { - Name: "proportion", - EnabledReclaimable: &trueValue, - }, + trueValue := true + tiers := []conf.Tier{ + { + Plugins: []conf.PluginOption{ + { + Name: "conformance", + EnabledReclaimable: &trueValue, + }, + { + Name: "gang", + EnabledReclaimable: &trueValue, + }, + { // proportion plugin will cause deserved resource large than preemptable pods's usage, and return less victims + Name: "proportion", + EnabledReclaimable: &trueValue, + }, + { + Name: priority.PluginName, + EnabledJobOrder: &trueValue, + EnabledTaskOrder: &trueValue, }, }, - }, nil) - defer framework.CloseSession(ssn) - - reclaim.Execute(ssn) - - for i := 0; i < test.expected; i++ { - select { - case <-evictor.Channel: - case <-time.After(3 * time.Second): - t.Errorf("Failed to get Evictor request.") + }, + } + for i, test := range tests { + t.Run(test.Name, func(t *testing.T) { + test.RegistSession(tiers, nil) + defer test.Close() + test.Run([]framework.Action{reclaim}) + if err := test.CheckAll(i); err != nil { + t.Fatal(err) } - } - - if test.expected != len(evictor.Evicts()) { - t.Errorf("case %d (%s): expected: %v, got %v ", i, test.name, test.expected, len(evictor.Evicts())) - } + }) } } diff --git a/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go b/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go index 3604698364..8291c09def 100644 --- a/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go +++ b/pkg/scheduler/api/devices/nvidia/gpushare/device_info.go @@ -148,7 +148,7 @@ func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) erro return nil } -func (gs *GPUDevices) FilterNode(pod *v1.Pod) (int, string, error) { +func (gs *GPUDevices) FilterNode(pod *v1.Pod, schedulePolicy string) (int, string, error) { klog.V(4).Infoln("DeviceSharing:Into FitInPod", pod.Name) if GpuSharingEnable { fit, err := checkNodeGPUSharingPredicate(pod, gs) @@ -172,6 +172,10 @@ func (gs *GPUDevices) GetStatus() string { return "" } +func (gs *GPUDevices) ScoreNode(pod *v1.Pod, schedulePolicy string) float64 { + return 0 +} + func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error { klog.V(4).Infoln("DeviceSharing:Into AllocateToPod", pod.Name) if getGPUMemoryOfPod(pod) > 0 { diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go index 196492226d..9c5c2d68ca 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/device_info.go @@ -58,6 +58,9 @@ type GPUDevice struct { type GPUDevices struct { Name string + // We cache score in filter step according to schedulePolicy, to avoid recalculating in score + Score float64 + Device 
map[int]*GPUDevice } @@ -90,7 +93,7 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices { return nil } for _, val := range nodedevices.Device { - klog.V(3).Infoln("name=", nodedevices.Name, "val=", *val) + klog.V(4).Infoln("name=", nodedevices.Name, "val=", *val) } // We have to handshake here in order to avoid time-inconsistency between scheduler and nodes @@ -100,7 +103,7 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices { klog.Infof("node %v device %s leave", node.Name, handshake) tmppat := make(map[string]string) - tmppat[handshake] = "Deleted_" + time.Now().Format("2006.01.02 15:04:05") + tmppat[VolcanoVGPUHandshake] = "Deleted_" + time.Now().Format("2006.01.02 15:04:05") patchNodeAnnotations(node, tmppat) return nil } @@ -114,6 +117,14 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices { return nodedevices } +func (gs *GPUDevices) ScoreNode(pod *v1.Pod, schedulePolicy string) float64 { + /* TODO: we need a base score to be campatable with preemption, it means a node without evicting a task has + a higher score than those needs to evict a task */ + + // Use cached stored in filter state in order to avoid recalculating. + return gs.Score +} + func (gs *GPUDevices) GetIgnoredDevices() []string { return []string{VolcanoVGPUMemory, VolcanoVGPUMemoryPercentage, VolcanoVGPUCores} } @@ -131,7 +142,7 @@ func (gs *GPUDevices) AddResource(pod *v1.Pod) { break } for index, gsdevice := range gs.Device { - if strings.Compare(gsdevice.UUID, deviceused.UUID) == 0 { + if gsdevice.UUID == deviceused.UUID { klog.V(4).Infoln("VGPU recording pod", pod.Name, "device", deviceused) gs.Device[index].UsedMem += uint(deviceused.Usedmem) gs.Device[index].UsedNum++ @@ -156,7 +167,7 @@ func (gs *GPUDevices) SubResource(pod *v1.Pod) { break } for index, gsdevice := range gs.Device { - if strings.Compare(gsdevice.UUID, deviceused.UUID) == 0 { + if gsdevice.UUID == deviceused.UUID { klog.V(4).Infoln("VGPU subsctracting pod", pod.Name, "device", deviceused) gs.Device[index].UsedMem -= uint(deviceused.Usedmem) gs.Device[index].UsedNum-- @@ -179,23 +190,24 @@ func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) erro return nil } -func (gs *GPUDevices) FilterNode(pod *v1.Pod) (int, string, error) { +func (gs *GPUDevices) FilterNode(pod *v1.Pod, schedulePolicy string) (int, string, error) { if VGPUEnable { - klog.V(5).Infoln("4pdvgpu DeviceSharing starts filtering pods", pod.Name) - fit, _, err := checkNodeGPUSharingPredicate(pod, gs, true) + klog.V(4).Infoln("hami-vgpu DeviceSharing starts filtering pods", pod.Name) + fit, _, score, err := checkNodeGPUSharingPredicateAndScore(pod, gs, true, schedulePolicy) if err != nil || !fit { klog.Errorln("deviceSharing err=", err.Error()) - return devices.Unschedulable, fmt.Sprintf("4pdvgpuDeviceSharing %s", err.Error()), err + return devices.Unschedulable, fmt.Sprintf("hami-vgpuDeviceSharing %s", err.Error()), err } - klog.V(5).Infoln("4pdvgpu DeviceSharing successfully filters pods") + gs.Score = score + klog.V(4).Infoln("hami-vgpu DeviceSharing successfully filters pods") } return devices.Success, "", nil } func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error { if VGPUEnable { - klog.V(3).Infoln("VGPU DeviceSharing:Into AllocateToPod", pod.Name) - fit, device, err := checkNodeGPUSharingPredicate(pod, gs, false) + klog.V(4).Infoln("hami-vgpu DeviceSharing:Into AllocateToPod", pod.Name) + fit, device, _, err := checkNodeGPUSharingPredicateAndScore(pod, gs, false, "") if err != nil || !fit { 
klog.Errorln("DeviceSharing err=", err.Error()) return err diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/metrics.go b/pkg/scheduler/api/devices/nvidia/vgpu/metrics.go index 6698171fdf..eda20a8fbc 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/metrics.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/metrics.go @@ -69,10 +69,12 @@ var ( func (gs *GPUDevices) GetStatus() string { for _, val := range gs.Device { - VGPUDevicesSharedNumber.WithLabelValues(val.UUID).Set(float64(val.UsedNum)) - VGPUDevicesSharedMemory.WithLabelValues(val.UUID).Set(float64(val.UsedMem)) - VGPUDevicesMemoryLimit.WithLabelValues(val.UUID).Set(float64(val.Memory)) - VGPUDevicesSharedCores.WithLabelValues(val.UUID).Set(float64(val.UsedCore)) + if val != nil { + VGPUDevicesSharedNumber.WithLabelValues(val.UUID).Set(float64(val.UsedNum)) + VGPUDevicesSharedMemory.WithLabelValues(val.UUID).Set(float64(val.UsedMem)) + VGPUDevicesMemoryLimit.WithLabelValues(val.UUID).Set(float64(val.Memory)) + VGPUDevicesSharedCores.WithLabelValues(val.UUID).Set(float64(val.UsedCore)) + } } return "" } diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/type.go b/pkg/scheduler/api/devices/nvidia/vgpu/type.go index 16544c5473..020ee2196c 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/type.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/type.go @@ -53,7 +53,17 @@ const ( UnhealthyGPUIDs = "volcano.sh/gpu-unhealthy-ids" // DeviceName used to indicate this device - DeviceName = "vgpu4pd" + DeviceName = "hamivgpu" + + // binpack means the lower device memory remained after this allocation, the better + binpackPolicy = "binpack" + // spread means better put this task into an idle GPU card than a shared GPU card + spreadPolicy = "spread" + // 101 means wo don't assign defaultMemPercentage value + + DefaultMemPercentage = 101 + binpackMultiplier = 100 + spreadMultiplier = 100 ) type ContainerDeviceRequest struct { diff --git a/pkg/scheduler/api/devices/nvidia/vgpu/utils.go b/pkg/scheduler/api/devices/nvidia/vgpu/utils.go index 44ced09387..89170c431c 100644 --- a/pkg/scheduler/api/devices/nvidia/vgpu/utils.go +++ b/pkg/scheduler/api/devices/nvidia/vgpu/utils.go @@ -40,7 +40,7 @@ func init() { var err error kubeClient, err = NewClient() if err != nil { - klog.Errorf("init kubeclient in 4pdvgpu failed: %s", err.Error()) + klog.Errorf("init kubeclient in hamivgpu failed: %s", err.Error()) } else { klog.V(3).Infoln("init kubeclient success") } @@ -96,6 +96,7 @@ func decodeNodeDevices(name string, str string) *GPUDevices { retval := &GPUDevices{ Name: name, Device: make(map[int]*GPUDevice), + Score: float64(0), } for index, val := range tmp { if strings.Contains(val, ",") { @@ -296,7 +297,7 @@ func checkType(annos map[string]string, d GPUDevice, n ContainerDeviceRequest) b if !strings.Contains(d.Type, n.Type) { return false } - if strings.Compare(n.Type, NvidiaGPUDevice) == 0 { + if n.Type == NvidiaGPUDevice { return checkGPUtype(annos, d.Type) } klog.Errorf("Unrecognized device %v", n.Type) @@ -307,33 +308,37 @@ func getGPUDeviceSnapShot(snap *GPUDevices) *GPUDevices { ret := GPUDevices{ Name: snap.Name, Device: make(map[int]*GPUDevice), + Score: float64(0), } for index, val := range snap.Device { - ret.Device[index] = &GPUDevice{ - ID: val.ID, - UUID: val.UUID, - PodMap: val.PodMap, - Memory: val.Memory, - Number: val.Number, - Type: val.Type, - Health: val.Health, - UsedNum: val.UsedNum, - UsedMem: val.UsedMem, - UsedCore: val.UsedCore, + if val != nil { + ret.Device[index] = &GPUDevice{ + ID: val.ID, + UUID: val.UUID, + PodMap: val.PodMap, + 
Memory: val.Memory, + Number: val.Number, + Type: val.Type, + Health: val.Health, + UsedNum: val.UsedNum, + UsedMem: val.UsedMem, + UsedCore: val.UsedCore, + } } } return &ret } // checkNodeGPUSharingPredicate checks if a pod with gpu requirement can be scheduled on a node. -func checkNodeGPUSharingPredicate(pod *v1.Pod, gssnap *GPUDevices, replicate bool) (bool, []ContainerDevices, error) { +func checkNodeGPUSharingPredicateAndScore(pod *v1.Pod, gssnap *GPUDevices, replicate bool, schedulePolicy string) (bool, []ContainerDevices, float64, error) { // no gpu sharing request + score := float64(0) if !checkVGPUResourcesInPod(pod) { - return true, []ContainerDevices{}, nil + return true, []ContainerDevices{}, 0, nil } ctrReq := resourcereqs(pod) if len(ctrReq) == 0 { - return true, []ContainerDevices{}, nil + return true, []ContainerDevices{}, 0, nil } var gs *GPUDevices if replicate { @@ -345,13 +350,13 @@ func checkNodeGPUSharingPredicate(pod *v1.Pod, gssnap *GPUDevices, replicate boo for _, val := range ctrReq { devs := []ContainerDevice{} if int(val.Nums) > len(gs.Device) { - return false, []ContainerDevices{}, fmt.Errorf("no enough gpu cards on node %s", gs.Name) + return false, []ContainerDevices{}, 0, fmt.Errorf("no enough gpu cards on node %s", gs.Name) } - klog.V(3).Infoln("Allocating device for container request", val) + klog.V(3).InfoS("Allocating device for container", "request", val) for i := len(gs.Device) - 1; i >= 0; i-- { - klog.V(3).Info("Scoring pod ", val.Memreq, ":", val.MemPercentagereq, ":", val.Coresreq, ":", val.Nums, "i", i, "device:", gs.Device[i].ID) - klog.V(3).Infoln("gs", i, "=", gs.Device[i].Memory, gs.Device[i].UsedMem, gs.Device[i].UsedNum) + klog.V(3).InfoS("Scoring pod request", "memReq", val.Memreq, "memPercentageReq", val.MemPercentagereq, "coresReq", val.Coresreq, "Nums", val.Nums, "Index", i, "ID", gs.Device[i].ID) + klog.V(3).InfoS("Current Device", "Index", i, "TotalMemory", gs.Device[i].Memory, "UsedMemory", gs.Device[i].UsedMem, "UsedCores", gs.Device[i].UsedNum) if gs.Device[i].Number <= uint(gs.Device[i].UsedNum) { continue } @@ -379,7 +384,7 @@ func checkNodeGPUSharingPredicate(pod *v1.Pod, gssnap *GPUDevices, replicate boo //total += gs.Devices[i].Count //free += node.Devices[i].Count - node.Devices[i].Used if val.Nums > 0 { - klog.V(3).Infoln("device", gs.Device[i].ID, "fitted") + klog.V(3).InfoS("device fitted", "ID", gs.Device[i].ID) val.Nums-- gs.Device[i].UsedNum++ gs.Device[i].UsedMem += uint(val.Memreq) @@ -390,17 +395,27 @@ func checkNodeGPUSharingPredicate(pod *v1.Pod, gssnap *GPUDevices, replicate boo Usedmem: val.Memreq, Usedcores: val.Coresreq, }) + switch schedulePolicy { + case binpackPolicy: + score += binpackMultiplier * (float64(gs.Device[i].UsedMem) / float64(gs.Device[i].Memory)) + case spreadPolicy: + if gs.Device[i].UsedNum == 1 { + score += spreadMultiplier + } + default: + score = float64(0) + } } if val.Nums == 0 { break } } if val.Nums > 0 { - return false, []ContainerDevices{}, fmt.Errorf("not enough gpu fitted on this node") + return false, []ContainerDevices{}, 0, fmt.Errorf("not enough gpu fitted on this node") } ctrdevs = append(ctrdevs, devs) } - return true, ctrdevs, nil + return true, ctrdevs, score, nil } func patchPodAnnotations(pod *v1.Pod, annotations map[string]string) error { diff --git a/pkg/scheduler/api/node_info.go b/pkg/scheduler/api/node_info.go index 0ea8be490f..b9f1570f2b 100644 --- a/pkg/scheduler/api/node_info.go +++ b/pkg/scheduler/api/node_info.go @@ -345,11 +345,12 @@ func (ni *NodeInfo) 
SetNode(node *v1.Node) { // setNodeOthersResource initialize sharable devices func (ni *NodeInfo) setNodeOthersResource(node *v1.Node) { - IgnoredDevicesList = []string{} ni.Others[GPUSharingDevice] = gpushare.NewGPUDevices(ni.Name, node) ni.Others[vgpu.DeviceName] = vgpu.NewGPUDevices(ni.Name, node) - IgnoredDevicesList = append(IgnoredDevicesList, ni.Others[GPUSharingDevice].(Devices).GetIgnoredDevices()...) - IgnoredDevicesList = append(IgnoredDevicesList, ni.Others[vgpu.DeviceName].(Devices).GetIgnoredDevices()...) + IgnoredDevicesList.Set( + ni.Others[GPUSharingDevice].(Devices).GetIgnoredDevices(), + ni.Others[vgpu.DeviceName].(Devices).GetIgnoredDevices(), + ) } // setNode sets kubernetes node object to nodeInfo object without assertion diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go index 75f36e71f2..22611ac39c 100644 --- a/pkg/scheduler/api/resource_info.go +++ b/pkg/scheduler/api/resource_info.go @@ -88,12 +88,13 @@ func NewResource(rl v1.ResourceList) *Resource { //NOTE: When converting this back to k8s resource, we need record the format as well as / 1000 if v1helper.IsScalarResourceName(rName) { ignore := false - for _, val := range IgnoredDevicesList { - if strings.Compare(rName.String(), val) == 0 { + IgnoredDevicesList.Range(func(_ int, val string) bool { + if rName.String() == val { ignore = true - break + return false } - } + return true + }) if !ignore { r.AddScalar(rName, float64(rQuant.MilliValue())) } else { diff --git a/pkg/scheduler/api/shared_device_pool.go b/pkg/scheduler/api/shared_device_pool.go index 865dba0dcd..c6d162ef29 100644 --- a/pkg/scheduler/api/shared_device_pool.go +++ b/pkg/scheduler/api/shared_device_pool.go @@ -17,6 +17,8 @@ package api import ( + "sync" + v1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes" @@ -32,13 +34,13 @@ type Devices interface { //following two functions used in node_info //AddResource is to add the corresponding device resource of this 'pod' into current scheduler cache AddResource(pod *v1.Pod) - //SubResoure is to substract the corresponding device resource of this 'pod' from current scheduler cache + //SubResource is to subtract the corresponding device resource of this 'pod' from current scheduler cache SubResource(pod *v1.Pod) //following four functions used in predicate //HasDeviceRequest checks if the 'pod' request this device HasDeviceRequest(pod *v1.Pod) bool - //FiltreNode checks if the 'pod' fit in current node + // FilterNode checks if the 'pod' fit in current node // The first return value represents the filtering result, and the value range is "0, 1, 2, 3" // 0: Success // Success means that plugin ran correctly and found pod schedulable. @@ -57,24 +59,52 @@ type Devices interface { // preemption would not change anything. Plugins should return Unschedulable if it is possible // that the pod can get scheduled with preemption. // The accompanying status message should explain why the pod is unschedulable. - FilterNode(pod *v1.Pod) (int, string, error) - //Allocate action in predicate + FilterNode(pod *v1.Pod, policy string) (int, string, error) + // ScoreNode will be invoked when using devicescore plugin, devices api can use it to implement multiple + // scheduling policies. 
+ ScoreNode(pod *v1.Pod, policy string) float64 + + // Allocate action in predicate Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) error - //Release action in predicate + // Release action in predicate Release(kubeClient kubernetes.Interface, pod *v1.Pod) error - //IgnredDevices notify vc-scheduler to ignore devices in return list + // GetIgnoredDevices notify vc-scheduler to ignore devices in return list GetIgnoredDevices() []string - //used for debug and monitor + // GetStatus used for debug and monitor GetStatus() string } // make sure GPUDevices implements Devices interface var _ Devices = new(gpushare.GPUDevices) -var IgnoredDevicesList []string - var RegisteredDevices = []string{ GPUSharingDevice, vgpu.DeviceName, } + +var IgnoredDevicesList = ignoredDevicesList{} + +type ignoredDevicesList struct { + sync.RWMutex + ignoredDevices []string +} + +func (l *ignoredDevicesList) Set(deviceLists ...[]string) { + l.Lock() + defer l.Unlock() + l.ignoredDevices = l.ignoredDevices[:0] + for _, devices := range deviceLists { + l.ignoredDevices = append(l.ignoredDevices, devices...) + } +} + +func (l *ignoredDevicesList) Range(f func(i int, device string) bool) { + l.RLock() + defer l.RUnlock() + for i, device := range l.ignoredDevices { + if !f(i, device) { + break + } + } +} diff --git a/pkg/scheduler/api/shared_device_pool_test.go b/pkg/scheduler/api/shared_device_pool_test.go new file mode 100644 index 0000000000..904efdc1f0 --- /dev/null +++ b/pkg/scheduler/api/shared_device_pool_test.go @@ -0,0 +1,126 @@ +package api + +import ( + "sync" + "testing" + + "github.com/stretchr/testify/assert" +) + +func Test_ignoredDevicesList_Set_BasicUsage(t *testing.T) { + tests := []struct { + name string + deviceLists [][]string + expectedIgnoredDevices []string + }{ + { + name: "set several values to ignoredDevicesList", + deviceLists: [][]string{{"volcano.sh/vgpu-memory", "volcano.sh/vgpu-memory-percentage", "volcano.sh/vgpu-cores"}}, + expectedIgnoredDevices: []string{"volcano.sh/vgpu-memory", "volcano.sh/vgpu-memory-percentage", "volcano.sh/vgpu-cores"}, + }, + { + name: "set several lists of values to ignoredDevicesList atomically", + deviceLists: [][]string{{"volcano.sh/vgpu-memory"}, {"volcano.sh/vgpu-memory-percentage", "volcano.sh/vgpu-cores"}}, + expectedIgnoredDevices: []string{"volcano.sh/vgpu-memory", "volcano.sh/vgpu-memory-percentage", "volcano.sh/vgpu-cores"}, + }, + { + name: "possible way to clear ignoredDevicesList", + deviceLists: nil, + expectedIgnoredDevices: nil, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + lst := ignoredDevicesList{} + lst.Set(tt.deviceLists...) 
+ assert.Equal(t, tt.expectedIgnoredDevices, lst.ignoredDevices) + }) + } +} + +func Test_ignoredDevicesList_Range_BasicUsage(t *testing.T) { + lst := ignoredDevicesList{} + lst.Set([]string{"volcano.sh/vgpu-memory", "volcano.sh/vgpu-memory-percentage", "volcano.sh/vgpu-cores"}) + + t.Run("read and copy values from the ignoredDevicesList", func(t *testing.T) { + ignoredDevices := make([]string, 0, len(lst.ignoredDevices)) + lst.Range(func(_ int, device string) bool { + ignoredDevices = append(ignoredDevices, device) + return true + }) + assert.Equal(t, lst.ignoredDevices, ignoredDevices) + }) + + t.Run("break iteration through the ignoredDevicesList", func(t *testing.T) { + i := 0 + flag := false + lst.Range(func(_ int, device string) bool { + i++ + if lst.ignoredDevices[1] == device { + flag = true + return false + } + return true + }) + + assert.Equal(t, true, flag) + assert.Equal(t, 2, i) + }) +} + +func Test_ignoredDevicesList_Set_Concurrent(t *testing.T) { + lst := ignoredDevicesList{} + expected := []string{"volcano.sh/vgpu-memory", "volcano.sh/vgpu-memory-percentage", "volcano.sh/vgpu-cores"} + + var wg sync.WaitGroup + wg.Add(8) + for i := 0; i < 8; i++ { + go func() { + defer wg.Done() + lst.Set(expected) + }() + } + wg.Wait() + + assert.Equal(t, expected, lst.ignoredDevices) +} + +func Test_ignoredDevicesList_Range_Concurrent(t *testing.T) { + lst := ignoredDevicesList{} + lst.Set([]string{"volcano.sh/vgpu-memory", "volcano.sh/vgpu-memory-percentage", "volcano.sh/vgpu-cores"}) + + var wg sync.WaitGroup + wg.Add(8) + for i := 0; i < 8; i++ { + go func() { + defer wg.Done() + ignoredDevices := make([]string, 0, len(lst.ignoredDevices)) + lst.Range(func(_ int, device string) bool { + ignoredDevices = append(ignoredDevices, device) + return true + }) + assert.Equal(t, ignoredDevices, lst.ignoredDevices) + }() + } + wg.Wait() +} + +func Test_ignoredDevicesList_NoRace(t *testing.T) { + lst := ignoredDevicesList{} + + var wg sync.WaitGroup + wg.Add(16) + for i := 0; i < 8; i++ { + go func() { + defer wg.Done() + lst.Set([]string{"volcano.sh/vgpu-memory", "volcano.sh/vgpu-memory-percentage", "volcano.sh/vgpu-cores"}) + }() + go func() { + defer wg.Done() + lst.Range(func(_ int, _ string) bool { + return true + }) + }() + } + wg.Wait() +} diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index 103610769d..2c7cf79438 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -310,6 +310,22 @@ func (su *defaultStatusUpdater) UpdatePodGroup(pg *schedulingapi.PodGroup) (*sch return podGroupInfo, nil } +// UpdateQueueStatus will update the status of queue +func (su *defaultStatusUpdater) UpdateQueueStatus(queue *schedulingapi.QueueInfo) error { + var newQueue = &vcv1beta1.Queue{} + if err := schedulingscheme.Scheme.Convert(queue.Queue, newQueue, nil); err != nil { + klog.Errorf("error occurred in converting scheduling.Queue to v1beta1.Queue: %s", err.Error()) + return err + } + + _, err := su.vcclient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}) + if err != nil { + klog.Errorf("error occurred in updating Queue <%s>: %s", newQueue.Name, err.Error()) + return err + } + return nil +} + type defaultVolumeBinder struct { volumeBinder volumescheduling.SchedulerVolumeBinder } @@ -442,7 +458,7 @@ func (sc *SchedulerCache) setBatchBindParallel() { func (sc *SchedulerCache) setDefaultVolumeBinder() { logger := klog.FromContext(context.TODO()) var capacityCheck *volumescheduling.CapacityCheck - if 
options.ServerOpts.EnableCSIStorage && utilfeature.DefaultFeatureGate.Enabled(features.CSIStorage) { + if options.ServerOpts != nil && options.ServerOpts.EnableCSIStorage && utilfeature.DefaultFeatureGate.Enabled(features.CSIStorage) { capacityCheck = &volumescheduling.CapacityCheck{ CSIDriverInformer: sc.csiDriverInformer, CSIStorageCapacityInformer: sc.csiStorageCapacityInformer, @@ -661,7 +677,7 @@ func (sc *SchedulerCache) addEventHandler() { }, ) - if options.ServerOpts.EnableCSIStorage && utilfeature.DefaultFeatureGate.Enabled(features.CSIStorage) { + if options.ServerOpts != nil && options.ServerOpts.EnableCSIStorage && utilfeature.DefaultFeatureGate.Enabled(features.CSIStorage) { sc.csiDriverInformer = informerFactory.Storage().V1().CSIDrivers() sc.csiStorageCapacityInformer = informerFactory.Storage().V1beta1().CSIStorageCapacities() } @@ -699,7 +715,7 @@ func (sc *SchedulerCache) addEventHandler() { }, }) - if options.ServerOpts.EnablePriorityClass && utilfeature.DefaultFeatureGate.Enabled(features.PriorityClass) { + if options.ServerOpts != nil && options.ServerOpts.EnablePriorityClass && utilfeature.DefaultFeatureGate.Enabled(features.PriorityClass) { sc.pcInformer = informerFactory.Scheduling().V1().PriorityClasses() sc.pcInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: sc.AddPriorityClass, @@ -1447,18 +1463,7 @@ func (sc *SchedulerCache) UpdateJobStatus(job *schedulingapi.JobInfo, updatePG b // UpdateQueueStatus update the status of queue. func (sc *SchedulerCache) UpdateQueueStatus(queue *schedulingapi.QueueInfo) error { - var newQueue = &vcv1beta1.Queue{} - if err := schedulingscheme.Scheme.Convert(queue.Queue, newQueue, nil); err != nil { - klog.Errorf("error occurred in converting scheduling.Queue to v1beta1.Queue: %s", err.Error()) - return err - } - - _, err := sc.vcClient.SchedulingV1beta1().Queues().UpdateStatus(context.TODO(), newQueue, metav1.UpdateOptions{}) - if err != nil { - klog.Errorf("error occurred in updating Queue <%s>: %s", newQueue.Name, err.Error()) - return err - } - return nil + return sc.StatusUpdater.UpdateQueueStatus(queue) } func (sc *SchedulerCache) recordPodGroupEvent(podGroup *schedulingapi.PodGroup, eventType, reason, msg string) { diff --git a/pkg/scheduler/cache/cache_mock.go b/pkg/scheduler/cache/cache_mock.go index 03f08cc190..87c17ece20 100644 --- a/pkg/scheduler/cache/cache_mock.go +++ b/pkg/scheduler/cache/cache_mock.go @@ -84,7 +84,7 @@ func checkAndSetDefaultInterface(sc *SchedulerCache) { } func getNodeWorkers() uint32 { - if options.ServerOpts.NodeWorkerThreads > 0 { + if options.ServerOpts != nil && options.ServerOpts.NodeWorkerThreads > 0 { return options.ServerOpts.NodeWorkerThreads } threads, err := strconv.Atoi(os.Getenv("NODE_WORKER_THREADS")) @@ -116,7 +116,7 @@ func newMockSchedulerCache(schedulerName string) *SchedulerCache { NodeList: []string{}, } - if len(options.ServerOpts.NodeSelector) > 0 { + if options.ServerOpts != nil && len(options.ServerOpts.NodeSelector) > 0 { msc.updateNodeSelectors(options.ServerOpts.NodeSelector) } msc.setBatchBindParallel() diff --git a/pkg/scheduler/cache/interface.go b/pkg/scheduler/cache/interface.go index dd68896b33..63c704e4ad 100644 --- a/pkg/scheduler/cache/interface.go +++ b/pkg/scheduler/cache/interface.go @@ -111,6 +111,7 @@ type Evictor interface { type StatusUpdater interface { UpdatePodCondition(pod *v1.Pod, podCondition *v1.PodCondition) (*v1.Pod, error) UpdatePodGroup(pg *api.PodGroup) (*api.PodGroup, error) + UpdateQueueStatus(queue 
*api.QueueInfo) error } // BatchBinder updates podgroup or job information diff --git a/pkg/scheduler/conf/scheduler_conf.go b/pkg/scheduler/conf/scheduler_conf.go index 148ffb76ac..e911e0b69e 100644 --- a/pkg/scheduler/conf/scheduler_conf.go +++ b/pkg/scheduler/conf/scheduler_conf.go @@ -61,11 +61,13 @@ type PluginOption struct { EnabledPreemptable *bool `yaml:"enablePreemptable"` // EnabledReclaimable defines whether reclaimableFn is enabled EnabledReclaimable *bool `yaml:"enableReclaimable"` + // EnablePreemptive defines whether preemptiveFn is enabled + EnablePreemptive *bool `yaml:"enablePreemptive"` // EnabledQueueOrder defines whether queueOrderFn is enabled EnabledQueueOrder *bool `yaml:"enableQueueOrder"` - // EnabledPredicate defines whether predicateFn is enabled - EnabledClusterOrder *bool `yaml:"EnabledClusterOrder"` // EnableClusterOrder defines whether clusterOrderFn is enabled + EnabledClusterOrder *bool `yaml:"EnabledClusterOrder"` + // EnabledPredicate defines whether predicateFn is enabled EnabledPredicate *bool `yaml:"enablePredicate"` // EnabledBestNode defines whether bestNodeFn is enabled EnabledBestNode *bool `yaml:"enableBestNode"` diff --git a/pkg/scheduler/framework/session.go b/pkg/scheduler/framework/session.go index 10feb5657e..993238dbbb 100644 --- a/pkg/scheduler/framework/session.go +++ b/pkg/scheduler/framework/session.go @@ -88,6 +88,9 @@ type Session struct { preemptableFns map[string]api.EvictableFn reclaimableFns map[string]api.EvictableFn overusedFns map[string]api.ValidateFn + // preemptiveFns means whether current queue can reclaim from other queue, + // while reclaimableFns means whether current queue's resources can be reclaimed. + preemptiveFns map[string]api.ValidateFn allocatableFns map[string]api.AllocatableFn jobReadyFns map[string]api.ValidateFn jobPipelinedFns map[string]api.VoteFn @@ -133,6 +136,7 @@ func openSession(cache cache.Cache) *Session { preemptableFns: map[string]api.EvictableFn{}, reclaimableFns: map[string]api.EvictableFn{}, overusedFns: map[string]api.ValidateFn{}, + preemptiveFns: map[string]api.ValidateFn{}, allocatableFns: map[string]api.AllocatableFn{}, jobReadyFns: map[string]api.ValidateFn{}, jobPipelinedFns: map[string]api.VoteFn{}, diff --git a/pkg/scheduler/framework/session_plugins.go b/pkg/scheduler/framework/session_plugins.go index b8bcff7a50..a95fc80b28 100644 --- a/pkg/scheduler/framework/session_plugins.go +++ b/pkg/scheduler/framework/session_plugins.go @@ -22,6 +22,7 @@ import ( "volcano.sh/apis/pkg/apis/scheduling" "volcano.sh/volcano/pkg/controllers/job/helpers" "volcano.sh/volcano/pkg/scheduler/api" + "volcano.sh/volcano/pkg/scheduler/util" ) // AddJobOrderFn add job order function @@ -104,6 +105,11 @@ func (ssn *Session) AddOverusedFn(name string, fn api.ValidateFn) { ssn.overusedFns[name] = fn } +// AddPreemptiveFn add preemptive function +func (ssn *Session) AddPreemptiveFn(name string, fn api.ValidateFn) { + ssn.preemptiveFns[name] = fn +} + // AddAllocatableFn add allocatable function func (ssn *Session) AddAllocatableFn(name string, fn api.AllocatableFn) { ssn.allocatableFns[name] = fn @@ -266,6 +272,26 @@ func (ssn *Session) Overused(queue *api.QueueInfo) bool { return false } +// Preemptive invoke can preemptive function of the plugins +func (ssn *Session) Preemptive(queue *api.QueueInfo) bool { + for _, tier := range ssn.Tiers { + for _, plugin := range tier.Plugins { + of, found := ssn.preemptiveFns[plugin.Name] + if !isEnabled(plugin.EnablePreemptive) { + continue + } + if !found { + continue + 
} + if !of(queue) { + return false + } + } + } + + return true +} + // Allocatable invoke allocatable function of the plugins func (ssn *Session) Allocatable(queue *api.QueueInfo, candidate *api.TaskInfo) bool { for _, tier := range ssn.Tiers { @@ -763,3 +789,21 @@ func (ssn *Session) NodeOrderReduceFn(task *api.TaskInfo, pluginNodeScoreMap map } return nodeScoreMap, nil } + +// BuildVictimsPriorityQueue returns a priority queue with victims sorted by: +// if victims has same job id, sorted by !ssn.TaskOrderFn +// if victims has different job id, sorted by !ssn.JobOrderFn +func (ssn *Session) BuildVictimsPriorityQueue(victims []*api.TaskInfo) *util.PriorityQueue { + victimsQueue := util.NewPriorityQueue(func(l, r interface{}) bool { + lv := l.(*api.TaskInfo) + rv := r.(*api.TaskInfo) + if lv.Job == rv.Job { + return !ssn.TaskOrderFn(l, r) + } + return !ssn.JobOrderFn(ssn.Jobs[lv.Job], ssn.Jobs[rv.Job]) + }) + for _, victim := range victims { + victimsQueue.Push(victim) + } + return victimsQueue +} diff --git a/pkg/scheduler/plugins/capacity/capacity.go b/pkg/scheduler/plugins/capacity/capacity.go new file mode 100644 index 0000000000..50a5bca39e --- /dev/null +++ b/pkg/scheduler/plugins/capacity/capacity.go @@ -0,0 +1,360 @@ +/* +Copyright 2024 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package capacity + +import ( + "math" + + v1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + + "volcano.sh/apis/pkg/apis/scheduling" + "volcano.sh/volcano/pkg/scheduler/api" + "volcano.sh/volcano/pkg/scheduler/api/helpers" + "volcano.sh/volcano/pkg/scheduler/framework" + "volcano.sh/volcano/pkg/scheduler/metrics" + "volcano.sh/volcano/pkg/scheduler/plugins/util" +) + +const ( + PluginName = "capacity" +) + +type capacityPlugin struct { + totalResource *api.Resource + totalGuarantee *api.Resource + + queueOpts map[api.QueueID]*queueAttr + // Arguments given for the plugin + pluginArguments framework.Arguments +} + +type queueAttr struct { + queueID api.QueueID + name string + share float64 + + deserved *api.Resource + allocated *api.Resource + request *api.Resource + // elastic represents the sum of job's elastic resource, job's elastic = job.allocated - job.minAvailable + elastic *api.Resource + // inqueue represents the resource request of the inqueue job + inqueue *api.Resource + capability *api.Resource + // realCapability represents the resource limit of the queue, LessEqual capability + realCapability *api.Resource + guarantee *api.Resource +} + +// New return capacityPlugin action +func New(arguments framework.Arguments) framework.Plugin { + return &capacityPlugin{ + totalResource: api.EmptyResource(), + totalGuarantee: api.EmptyResource(), + queueOpts: map[api.QueueID]*queueAttr{}, + pluginArguments: arguments, + } +} + +func (cp *capacityPlugin) Name() string { + return PluginName +} + +func (cp *capacityPlugin) OnSessionOpen(ssn *framework.Session) { + // Prepare scheduling data for this session. 
+ cp.totalResource.Add(ssn.TotalResource) + + klog.V(4).Infof("The total resource is <%v>", cp.totalResource) + for _, queue := range ssn.Queues { + if len(queue.Queue.Spec.Guarantee.Resource) == 0 { + continue + } + guarantee := api.NewResource(queue.Queue.Spec.Guarantee.Resource) + cp.totalGuarantee.Add(guarantee) + } + klog.V(4).Infof("The total guarantee resource is <%v>", cp.totalGuarantee) + // Build attributes for Queues. + for _, job := range ssn.Jobs { + klog.V(4).Infof("Considering Job <%s/%s>.", job.Namespace, job.Name) + if _, found := cp.queueOpts[job.Queue]; !found { + queue := ssn.Queues[job.Queue] + attr := &queueAttr{ + queueID: queue.UID, + name: queue.Name, + + deserved: api.NewResource(queue.Queue.Spec.Deserved), + allocated: api.EmptyResource(), + request: api.EmptyResource(), + elastic: api.EmptyResource(), + inqueue: api.EmptyResource(), + guarantee: api.EmptyResource(), + } + if len(queue.Queue.Spec.Capability) != 0 { + attr.capability = api.NewResource(queue.Queue.Spec.Capability) + if attr.capability.MilliCPU <= 0 { + attr.capability.MilliCPU = math.MaxFloat64 + } + if attr.capability.Memory <= 0 { + attr.capability.Memory = math.MaxFloat64 + } + } + if len(queue.Queue.Spec.Guarantee.Resource) != 0 { + attr.guarantee = api.NewResource(queue.Queue.Spec.Guarantee.Resource) + } + realCapability := cp.totalResource.Clone().Sub(cp.totalGuarantee).Add(attr.guarantee) + if attr.capability == nil { + attr.realCapability = realCapability + } else { + realCapability.MinDimensionResource(attr.capability, api.Infinity) + attr.realCapability = realCapability + } + cp.queueOpts[job.Queue] = attr + klog.V(4).Infof("Added Queue <%s> attributes.", job.Queue) + } + + attr := cp.queueOpts[job.Queue] + for status, tasks := range job.TaskStatusIndex { + if api.AllocatedStatus(status) { + for _, t := range tasks { + attr.allocated.Add(t.Resreq) + attr.request.Add(t.Resreq) + } + } else if status == api.Pending { + for _, t := range tasks { + attr.request.Add(t.Resreq) + } + } + } + + if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue { + attr.inqueue.Add(job.GetMinResources()) + } + + // calculate inqueue resource for running jobs + // the judgement 'job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember' will work on cases such as the following condition: + // Considering a Spark job is completed(driver pod is completed) while the podgroup keeps running, the allocated resource will be reserved again if without the judgement. + if job.PodGroup.Status.Phase == scheduling.PodGroupRunning && + job.PodGroup.Spec.MinResources != nil && + int32(util.CalculateAllocatedTaskNum(job)) >= job.PodGroup.Spec.MinMember { + inqueued := util.GetInqueueResource(job, job.Allocated) + attr.inqueue.Add(inqueued) + } + attr.elastic.Add(job.GetElasticResources()) + klog.V(5).Infof("Queue %s allocated <%s> request <%s> inqueue <%s> elastic <%s>", + attr.name, attr.allocated.String(), attr.request.String(), attr.inqueue.String(), attr.elastic.String()) + } + + for _, attr := range cp.queueOpts { + if attr.realCapability != nil { + attr.deserved.MinDimensionResource(attr.realCapability, api.Infinity) + } + // When scalar resource not specified in deserved such as "pods", we should skip it and consider deserved resource as infinity. 
+ attr.deserved.MinDimensionResource(attr.request, api.Infinity) + + attr.deserved = helpers.Max(attr.deserved, attr.guarantee) + cp.updateShare(attr) + klog.V(4).Infof("The attributes of queue <%s> in capacity: deserved <%v>, realCapability <%v>, allocate <%v>, request <%v>, elastic <%v>, share <%0.2f>", + attr.name, attr.deserved, attr.realCapability, attr.allocated, attr.request, attr.elastic, attr.share) + } + + // Record metrics + for queueID, queueInfo := range ssn.Queues { + queue := ssn.Queues[queueID] + if attr, ok := cp.queueOpts[queueID]; ok { + metrics.UpdateQueueDeserved(attr.name, attr.deserved.MilliCPU, attr.deserved.Memory) + metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory) + metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory) + metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue) + metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending) + metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running) + metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown) + continue + } + deservedCPU, deservedMem := 0.0, 0.0 + if queue.Queue.Spec.Deserved != nil { + deservedCPU = float64(queue.Queue.Spec.Deserved.Cpu().MilliValue()) + deservedMem = float64(queue.Queue.Spec.Deserved.Memory().Value()) + } + metrics.UpdateQueueDeserved(queueInfo.Name, deservedCPU, deservedMem) + metrics.UpdateQueueAllocated(queueInfo.Name, 0, 0) + metrics.UpdateQueueRequest(queueInfo.Name, 0, 0) + metrics.UpdateQueuePodGroupInqueueCount(queueInfo.Name, 0) + metrics.UpdateQueuePodGroupPendingCount(queueInfo.Name, 0) + metrics.UpdateQueuePodGroupRunningCount(queueInfo.Name, 0) + metrics.UpdateQueuePodGroupUnknownCount(queueInfo.Name, 0) + } + + ssn.AddQueueOrderFn(cp.Name(), func(l, r interface{}) int { + lv := l.(*api.QueueInfo) + rv := r.(*api.QueueInfo) + + if cp.queueOpts[lv.UID].share == cp.queueOpts[rv.UID].share { + return 0 + } + + if cp.queueOpts[lv.UID].share < cp.queueOpts[rv.UID].share { + return -1 + } + + return 1 + }) + + ssn.AddReclaimableFn(cp.Name(), func(reclaimer *api.TaskInfo, reclaimees []*api.TaskInfo) ([]*api.TaskInfo, int) { + var victims []*api.TaskInfo + allocations := map[api.QueueID]*api.Resource{} + + for _, reclaimee := range reclaimees { + job := ssn.Jobs[reclaimee.Job] + attr := cp.queueOpts[job.Queue] + + if _, found := allocations[job.Queue]; !found { + allocations[job.Queue] = attr.allocated.Clone() + } + allocated := allocations[job.Queue] + if allocated.LessPartly(reclaimer.Resreq, api.Zero) { + klog.V(3).Infof("Failed to allocate resource for Task <%s/%s> in Queue <%s>, not enough resource.", + reclaimee.Namespace, reclaimee.Name, job.Queue) + continue + } + + exceptReclaimee := allocated.Clone().Sub(reclaimee.Resreq) + // When scalar resource not specified in deserved such as "pods", we should skip it and consider it as infinity, + // so the following first condition will be true and the current queue will not be reclaimed. 
+ if allocated.LessEqual(attr.deserved, api.Infinity) || !attr.guarantee.LessEqual(exceptReclaimee, api.Zero) { + continue + } + allocated.Sub(reclaimee.Resreq) + victims = append(victims, reclaimee) + } + klog.V(4).InfoS("Victims from capacity plugin", "victims", victims, "reclaimer", reclaimer) + return victims, util.Permit + }) + + ssn.AddPreemptiveFn(cp.Name(), func(obj interface{}) bool { + queue := obj.(*api.QueueInfo) + attr := cp.queueOpts[queue.UID] + + overused := attr.deserved.LessEqual(attr.allocated, api.Zero) + metrics.UpdateQueueOverused(attr.name, overused) + if overused { + klog.V(3).Infof("Queue <%v> can not reclaim, deserved <%v>, allocated <%v>, share <%v>", + queue.Name, attr.deserved, attr.allocated, attr.share) + } + + return !overused + }) + + ssn.AddAllocatableFn(cp.Name(), func(queue *api.QueueInfo, candidate *api.TaskInfo) bool { + attr := cp.queueOpts[queue.UID] + + free, _ := attr.realCapability.Diff(attr.allocated, api.Zero) + allocatable := candidate.Resreq.LessEqual(free, api.Zero) + if !allocatable { + klog.V(3).Infof("Queue <%v>: realCapability <%v>, allocated <%v>; Candidate <%v>: resource request <%v>", + queue.Name, attr.realCapability, attr.allocated, candidate.Name, candidate.Resreq) + } + + return allocatable + }) + + ssn.AddJobEnqueueableFn(cp.Name(), func(obj interface{}) int { + job := obj.(*api.JobInfo) + queueID := job.Queue + attr := cp.queueOpts[queueID] + queue := ssn.Queues[queueID] + // If no capability is set, always enqueue the job. + if attr.realCapability == nil { + klog.V(4).Infof("Capability of queue <%s> was not set, allow job <%s/%s> to Inqueue.", + queue.Name, job.Namespace, job.Name) + return util.Permit + } + + if job.PodGroup.Spec.MinResources == nil { + klog.V(4).Infof("job %s MinResources is null.", job.Name) + return util.Permit + } + minReq := job.GetMinResources() + + klog.V(5).Infof("job %s min resource <%s>, queue %s capability <%s> allocated <%s> inqueue <%s> elastic <%s>", + job.Name, minReq.String(), queue.Name, attr.realCapability.String(), attr.allocated.String(), attr.inqueue.String(), attr.elastic.String()) + // The queue resource quota limit has not reached + r := minReq.Add(attr.allocated).Add(attr.inqueue).Sub(attr.elastic) + rr := attr.realCapability.Clone() + + for name := range rr.ScalarResources { + if _, ok := r.ScalarResources[name]; !ok { + delete(rr.ScalarResources, name) + } + } + + inqueue := r.LessEqual(rr, api.Infinity) + klog.V(5).Infof("job %s inqueue %v", job.Name, inqueue) + if inqueue { + attr.inqueue.Add(job.GetMinResources()) + return util.Permit + } + ssn.RecordPodGroupEvent(job.PodGroup, v1.EventTypeNormal, string(scheduling.PodGroupUnschedulableType), "queue resource quota insufficient") + return util.Reject + }) + + // Register event handlers. 
+ ssn.AddEventHandler(&framework.EventHandler{ + AllocateFunc: func(event *framework.Event) { + job := ssn.Jobs[event.Task.Job] + attr := cp.queueOpts[job.Queue] + attr.allocated.Add(event.Task.Resreq) + metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory) + + cp.updateShare(attr) + + klog.V(4).Infof("Capacity AllocateFunc: task <%v/%v>, resreq <%v>, share <%v>", + event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share) + }, + DeallocateFunc: func(event *framework.Event) { + job := ssn.Jobs[event.Task.Job] + attr := cp.queueOpts[job.Queue] + attr.allocated.Sub(event.Task.Resreq) + metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory) + + cp.updateShare(attr) + + klog.V(4).Infof("Capacity EvictFunc: task <%v/%v>, resreq <%v>, share <%v>", + event.Task.Namespace, event.Task.Name, event.Task.Resreq, attr.share) + }, + }) +} + +func (cp *capacityPlugin) OnSessionClose(ssn *framework.Session) { + cp.totalResource = nil + cp.totalGuarantee = nil + cp.queueOpts = nil +} + +func (cp *capacityPlugin) updateShare(attr *queueAttr) { + res := float64(0) + + for _, rn := range attr.deserved.ResourceNames() { + share := helpers.Share(attr.allocated.Get(rn), attr.deserved.Get(rn)) + if share > res { + res = share + } + } + + attr.share = res + metrics.UpdateQueueShare(attr.name, attr.share) +} diff --git a/pkg/scheduler/plugins/capacity/capacity_test.go b/pkg/scheduler/plugins/capacity/capacity_test.go new file mode 100644 index 0000000000..ef1fcc156e --- /dev/null +++ b/pkg/scheduler/plugins/capacity/capacity_test.go @@ -0,0 +1,141 @@ +/* +Copyright 2024 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
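A minimal, self-contained sketch (plain Go with float maps, not the Volcano API) of the gating arithmetic the capacity plugin above applies: the Preemptive callback lets a queue reclaim from others only while its allocation is still below its deserved share, and the Reclaimable callback picks a task as a victim only if its queue currently exceeds deserved and would not drop below its guarantee after eviction. Queue names, resource names, and quantities here are illustrative.

package main

import "fmt"

type res map[string]float64

// lessEqual reports whether a <= b in every resource dimension of a.
func lessEqual(a, b res) bool {
	for k, v := range a {
		if v > b[k] {
			return false
		}
	}
	return true
}

// sub returns a - b per dimension of a.
func sub(a, b res) res {
	out := res{}
	for k, v := range a {
		out[k] = v - b[k]
	}
	return out
}

func main() {
	// Reclaimer side (hypothetical queue A): the preemptive gate treats a queue
	// as overused once allocated >= deserved; an overused queue may not reclaim.
	deservedA := res{"cpu": 2000, "memory": 4096}
	allocatedA := res{"cpu": 1500, "memory": 2048}
	overusedA := lessEqual(deservedA, allocatedA)
	fmt.Println("queue A may reclaim:", !overusedA) // true: still below deserved

	// Reclaimed side (hypothetical queue B): a task is only a victim if queue B
	// exceeds its deserved share AND evicting the task keeps it at or above guarantee.
	deservedB := res{"cpu": 1000, "memory": 2048}
	guaranteeB := res{"cpu": 500, "memory": 1024}
	allocatedB := res{"cpu": 1600, "memory": 3072}
	victimReq := res{"cpu": 400, "memory": 512}

	afterEvict := sub(allocatedB, victimReq)
	eligible := !lessEqual(allocatedB, deservedB) && lessEqual(guaranteeB, afterEvict)
	fmt.Println("task in queue B is an eligible victim:", eligible) // true
}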
+*/ + +package capacity + +import ( + "testing" + + corev1 "k8s.io/api/core/v1" + schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" + + "volcano.sh/volcano/cmd/scheduler/app/options" + "volcano.sh/volcano/pkg/scheduler/actions/allocate" + "volcano.sh/volcano/pkg/scheduler/actions/reclaim" + "volcano.sh/volcano/pkg/scheduler/api" + "volcano.sh/volcano/pkg/scheduler/conf" + "volcano.sh/volcano/pkg/scheduler/framework" + "volcano.sh/volcano/pkg/scheduler/plugins/predicates" + "volcano.sh/volcano/pkg/scheduler/uthelper" + "volcano.sh/volcano/pkg/scheduler/util" +) + +func Test_capacityPlugin_OnSessionOpen(t *testing.T) { + plugins := map[string]framework.PluginBuilder{PluginName: New, predicates.PluginName: predicates.New} + trueValue := true + actions := []framework.Action{allocate.New(), reclaim.New()} + options.Default() + + // nodes + n1 := util.BuildNode("n1", api.BuildResourceList("2", "4Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), map[string]string{"selector": "worker"}) + n2 := util.BuildNode("n2", api.BuildResourceList("2", "4Gi", []api.ScalarResource{{Name: "pods", Value: "10"}}...), map[string]string{}) + + // resources for test case 0 + // pod + p1 := util.BuildPod("ns1", "p1", "n1", corev1.PodRunning, api.BuildResourceList("1", "1Gi"), "pg1", make(map[string]string), make(map[string]string)) + p2 := util.BuildPod("ns1", "p2", "", corev1.PodPending, api.BuildResourceList("1", "1Gi"), "pg2", make(map[string]string), map[string]string{"selector": "worker"}) + // podgroup + pg1 := util.BuildPodGroup("pg1", "ns1", "q1", 1, nil, schedulingv1beta1.PodGroupRunning) + pg2 := util.BuildPodGroup("pg2", "ns1", "q1", 1, nil, schedulingv1beta1.PodGroupInqueue) + // queue + queue1 := util.BuildQueueWithResourcesQuantity("q1", nil, api.BuildResourceList("2", "2Gi")) + + // resources for test case 1 + // pod + p3 := util.BuildPod("ns1", "p3", "n1", corev1.PodRunning, api.BuildResourceList("1", "1Gi"), "pg3", make(map[string]string), make(map[string]string)) + p4 := util.BuildPod("ns1", "p4", "", corev1.PodPending, api.BuildResourceList("1", "1Gi"), "pg4", make(map[string]string), make(map[string]string)) + // podgroup + pg3 := util.BuildPodGroup("pg3", "ns1", "q2", 1, nil, schedulingv1beta1.PodGroupRunning) + pg4 := util.BuildPodGroup("pg4", "ns1", "q2", 1, nil, schedulingv1beta1.PodGroupInqueue) + // queue + queue2 := util.BuildQueueWithResourcesQuantity("q2", nil, api.BuildResourceList("1.5", "1.5Gi")) + + // resources for test case 2 + // pod + p5 := util.BuildPod("ns1", "p5", "n1", corev1.PodRunning, api.BuildResourceList("2", "4Gi"), "pg5", map[string]string{schedulingv1beta1.PodPreemptable: "false"}, make(map[string]string)) + p6 := util.BuildPod("ns1", "p6", "n2", corev1.PodRunning, api.BuildResourceList("2", "4Gi"), "pg5", make(map[string]string), make(map[string]string)) + p7 := util.BuildPod("ns1", "p7", "", corev1.PodPending, api.BuildResourceList("2", "4Gi"), "pg6", make(map[string]string), make(map[string]string)) + // podgroup + pg5 := util.BuildPodGroup("pg5", "ns1", "q3", 1, nil, schedulingv1beta1.PodGroupRunning) + pg6 := util.BuildPodGroup("pg6", "ns1", "q4", 1, nil, schedulingv1beta1.PodGroupInqueue) + // queue + queue3 := util.BuildQueueWithResourcesQuantity("q3", api.BuildResourceList("2", "4Gi"), nil) + queue4 := util.BuildQueueWithResourcesQuantity("q4", api.BuildResourceList("2", "4Gi"), nil) + + tests := []uthelper.TestCommonStruct{ + { + Name: "case0: Pod allocatable when queue has not exceed capability", + Plugins: plugins, + Pods: 
[]*corev1.Pod{p1, p2}, + Nodes: []*corev1.Node{n1, n2}, + PodGroups: []*schedulingv1beta1.PodGroup{pg1, pg2}, + Queues: []*schedulingv1beta1.Queue{queue1}, + Bind: map[string]string{ + "ns1/p2": "n1", + }, + BindsNum: 1, + }, + { + Name: "case1: Pod not allocatable when queue exceed queue capability", + Plugins: plugins, + Pods: []*corev1.Pod{p3, p4}, + Nodes: []*corev1.Node{n1, n2}, + PodGroups: []*schedulingv1beta1.PodGroup{pg3, pg4}, + Queues: []*schedulingv1beta1.Queue{queue2}, + BindsNum: 0, + }, + { + Name: "case2: Can reclaim from other queues when allocated < deserved", + Plugins: plugins, + Pods: []*corev1.Pod{p5, p6, p7}, + Nodes: []*corev1.Node{n1, n2}, + PodGroups: []*schedulingv1beta1.PodGroup{pg5, pg6}, + Queues: []*schedulingv1beta1.Queue{queue3, queue4}, + PipeLined: map[string][]string{ + "ns1/pg6": {"n2"}, + }, + Evicted: []string{"ns1/p6"}, + EvictNum: 1, + }, + } + + tiers := []conf.Tier{ + { + Plugins: []conf.PluginOption{ + { + Name: PluginName, + EnabledAllocatable: &trueValue, + EnablePreemptive: &trueValue, + EnabledReclaimable: &trueValue, + }, + { + Name: predicates.PluginName, + EnabledPredicate: &trueValue, + }, + }, + }, + } + for i, test := range tests { + t.Run(test.Name, func(t *testing.T) { + test.RegistSession(tiers, nil) + defer test.Close() + test.Run(actions) + if err := test.CheckAll(i); err != nil { + t.Fatal(err) + } + }) + } +} diff --git a/pkg/scheduler/plugins/defaults.go b/pkg/scheduler/plugins/defaults.go index c9ca3fad1c..0a719716f8 100644 --- a/pkg/scheduler/plugins/defaults.go +++ b/pkg/scheduler/plugins/defaults.go @@ -43,6 +43,9 @@ func ApplyPluginConfDefaults(option *conf.PluginOption) { if option.EnabledReclaimable == nil { option.EnabledReclaimable = &t } + if option.EnablePreemptive == nil { + option.EnablePreemptive = &t + } if option.EnabledQueueOrder == nil { option.EnabledQueueOrder = &t } diff --git a/pkg/scheduler/plugins/deviceshare/deviceshare.go b/pkg/scheduler/plugins/deviceshare/deviceshare.go new file mode 100644 index 0000000000..62b083cba7 --- /dev/null +++ b/pkg/scheduler/plugins/deviceshare/deviceshare.go @@ -0,0 +1,167 @@ +/* +Copyright 2024 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package deviceshare + +import ( + "context" + "fmt" + "math" + "reflect" + + v1 "k8s.io/api/core/v1" + "k8s.io/klog/v2" + k8sframework "k8s.io/kubernetes/pkg/scheduler/framework" + + "volcano.sh/volcano/pkg/scheduler/api" + "volcano.sh/volcano/pkg/scheduler/api/devices" + "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" + "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" + "volcano.sh/volcano/pkg/scheduler/framework" +) + +// PluginName indicates name of volcano scheduler plugin. 
+const ( + PluginName = "deviceshare" + // GPUSharingPredicate is the key for enabling GPU Sharing Predicate in YAML + GPUSharingPredicate = "deviceshare.GPUSharingEnable" + NodeLockEnable = "deviceshare.NodeLockEnable" + GPUNumberPredicate = "deviceshare.GPUNumberEnable" + + VGPUEnable = "deviceshare.VGPUEnable" + + SchedulePolicyArgument = "deviceshare.SchedulePolicy" + ScheduleWeight = "deviceshare.ScheduleWeight" +) + +type deviceSharePlugin struct { + // Arguments given for the plugin + pluginArguments framework.Arguments + schedulePolicy string + scheduleWeight int +} + +// New return priority plugin +func New(arguments framework.Arguments) framework.Plugin { + dsp := &deviceSharePlugin{pluginArguments: arguments, schedulePolicy: "", scheduleWeight: 0} + enablePredicate(dsp) + return dsp +} + +func (dp *deviceSharePlugin) Name() string { + return PluginName +} + +func enablePredicate(dsp *deviceSharePlugin) { + // Checks whether predicate.GPUSharingEnable is provided or not, if given, modifies the value in predicateEnable struct. + args := dsp.pluginArguments + args.GetBool(&gpushare.GpuSharingEnable, GPUSharingPredicate) + args.GetBool(&gpushare.GpuNumberEnable, GPUNumberPredicate) + args.GetBool(&gpushare.NodeLockEnable, NodeLockEnable) + args.GetBool(&vgpu.VGPUEnable, VGPUEnable) + + _, ok := args[SchedulePolicyArgument] + if ok { + dsp.schedulePolicy = args[SchedulePolicyArgument].(string) + } + args.GetInt(&dsp.scheduleWeight, ScheduleWeight) + + if gpushare.GpuSharingEnable && gpushare.GpuNumberEnable { + klog.Fatal("can not define true in both gpu sharing and gpu number") + } + if (gpushare.GpuSharingEnable || gpushare.GpuNumberEnable) && vgpu.VGPUEnable { + klog.Fatal("gpu-share and vgpu can't be used together") + } +} + +func createStatus(code int, reason string) *api.Status { + status := api.Status{ + Code: code, + Reason: reason, + } + return &status +} + +func getDeviceScore(ctx context.Context, pod *v1.Pod, node *api.NodeInfo, schedulePolicy string) (int64, *k8sframework.Status) { + s := float64(0) + for _, devices := range node.Others { + if devices.(api.Devices).HasDeviceRequest(pod) { + ns := devices.(api.Devices).ScoreNode(pod, schedulePolicy) + s += ns + } + } + klog.V(4).Infof("deviceScore for task %s/%s is: %v", pod.Namespace, pod.Name, s) + return int64(math.Floor(s + 0.5)), nil +} + +func (dp *deviceSharePlugin) OnSessionOpen(ssn *framework.Session) { + // Register event handlers to update task info in PodLister & nodeMap + ssn.AddPredicateFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) ([]*api.Status, error) { + predicateStatus := make([]*api.Status, 0) + // Check PredicateWithCache + for _, val := range api.RegisteredDevices { + if dev, ok := node.Others[val].(api.Devices); ok { + if reflect.ValueOf(dev).IsNil() { + // TODO When a pod requests a device of the current type, but the current node does not have such a device, an error is thrown + if dev == nil || dev.HasDeviceRequest(task.Pod) { + predicateStatus = append(predicateStatus, &api.Status{ + Code: devices.Unschedulable, + Reason: "node not initialized with device" + val, + }) + return predicateStatus, fmt.Errorf("node not initialized with device %s", val) + } + klog.V(4).Infof("pod %s/%s did not request device %s on %s, skipping it", task.Pod.Namespace, task.Pod.Name, val, node.Name) + continue + } + code, msg, err := dev.FilterNode(task.Pod, dp.schedulePolicy) + if err != nil { + predicateStatus = append(predicateStatus, createStatus(code, msg)) + return predicateStatus, err + } + 
filterNodeStatus := createStatus(code, msg) + if filterNodeStatus.Code != api.Success { + predicateStatus = append(predicateStatus, filterNodeStatus) + return predicateStatus, fmt.Errorf("plugin device filternode predicates failed %s", msg) + } + } else { + klog.Warningf("Devices %s assertion conversion failed, skip", val) + } + } + + klog.V(4).Infof("checkDevices predicates Task <%s/%s> on Node <%s>: fit ", + task.Namespace, task.Name, node.Name) + + return predicateStatus, nil + }) + + ssn.AddNodeOrderFn(dp.Name(), func(task *api.TaskInfo, node *api.NodeInfo) (float64, error) { + // DeviceScore + if len(dp.schedulePolicy) > 0 { + score, status := getDeviceScore(context.TODO(), task.Pod, node, dp.schedulePolicy) + if !status.IsSuccess() { + klog.Warningf("Node: %s, Calculate Device Score Failed because of Error: %v", node.Name, status.AsError()) + return 0, status.AsError() + } + + // TODO: we should use a seperate plugin for devices, and seperate them from predicates and nodeOrder plugin. + nodeScore := float64(score) * float64(dp.scheduleWeight) + klog.V(5).Infof("Node: %s, task<%s/%s> Device Score weight %d, score: %f", node.Name, task.Namespace, task.Name, dp.scheduleWeight, nodeScore) + } + return 0, nil + }) +} + +func (dp *deviceSharePlugin) OnSessionClose(ssn *framework.Session) {} diff --git a/pkg/scheduler/plugins/deviceshare/deviceshare_test.go b/pkg/scheduler/plugins/deviceshare/deviceshare_test.go new file mode 100644 index 0000000000..e3a2de1f7b --- /dev/null +++ b/pkg/scheduler/plugins/deviceshare/deviceshare_test.go @@ -0,0 +1,130 @@ +/* +Copyright 2024 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
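A back-of-the-envelope sketch of how the new nodeOrder callback above turns a device score into a node score: the numbers are taken from the vgpu test case that follows (30000 MiB device memory, 3000 MiB already used, a 1000 MiB request, and a hypothetical deviceshare.ScheduleWeight of 10), and the binpack formula mirrors the expectation asserted in that test rather than the vgpu implementation itself.

package main

import "fmt"

// binpackScore scores a device by how full its memory would be after adding the
// pod's request, on a 0-100 scale, matching the expectation in the test below.
func binpackScore(usedMem, requestMem, totalMem float64) float64 {
	return (usedMem + requestMem) * 100 / totalMem
}

func main() {
	deviceScore := binpackScore(3000, 1000, 30000) // ~13.3
	scheduleWeight := 10.0

	// The plugin's nodeOrder function multiplies the device score by the
	// configured weight before it contributes to node ordering.
	nodeScore := deviceScore * scheduleWeight
	fmt.Printf("device score %.2f -> node score %.2f\n", deviceScore, nodeScore)
}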
+*/ + +package deviceshare + +import ( + "testing" + + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + + "volcano.sh/volcano/pkg/scheduler/api" + "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" + "volcano.sh/volcano/pkg/scheduler/framework" + "volcano.sh/volcano/pkg/scheduler/util" +) + +func TestArguments(t *testing.T) { + framework.RegisterPluginBuilder(PluginName, New) + defer framework.CleanupPluginBuilders() + + arguments := framework.Arguments{ + "deviceshare.VGPUEnable": true, + "deviceshare.SchedulePolicy": "binpack", + "deviceshare.ScheduleWeight": 10, + } + + builder, ok := framework.GetPluginBuilder(PluginName) + + if !ok { + t.Fatalf("should have plugin named %s", PluginName) + } + + plugin := builder(arguments) + deviceshare, ok := plugin.(*deviceSharePlugin) + + if !ok { + t.Fatalf("plugin should be %T, but not %T", deviceshare, plugin) + } + + weight := deviceshare.scheduleWeight + + if weight != 10 { + t.Errorf("weight should be 10, but not %v", weight) + } + + if deviceshare.schedulePolicy != "binpack" { + t.Errorf("policy should be binpack, but not %s", deviceshare.schedulePolicy) + } +} + +func addResource(resourceList v1.ResourceList, name v1.ResourceName, need string) { + resourceList[name] = resource.MustParse(need) +} + +func TestVgpuScore(t *testing.T) { + gpuNode1 := vgpu.GPUDevices{ + Name: "node1", + Score: float64(0), + Device: make(map[int]*vgpu.GPUDevice), + } + gpuNode1.Device[0] = vgpu.NewGPUDevice(0, 30000) + gpuNode1.Device[0].Type = "NVIDIA" + gpuNode1.Device[0].Number = 10 + gpuNode1.Device[0].UsedNum = 1 + gpuNode1.Device[0].UsedMem = 3000 + + gpunumber := v1.ResourceName("volcano.sh/vgpu-number") + gpumemory := v1.ResourceName("volcano.sh/vgpu-memory") + + vgpu.VGPUEnable = true + + p1 := util.BuildPod("c1", "p3", "", v1.PodPending, api.BuildResourceList("2", "10Gi"), "pg1", make(map[string]string), make(map[string]string)) + addResource(p1.Spec.Containers[0].Resources.Requests, gpunumber, "1") + addResource(p1.Spec.Containers[0].Resources.Requests, gpumemory, "1000") + p1.Spec.Containers[0].Resources.Limits = make(v1.ResourceList) + addResource(p1.Spec.Containers[0].Resources.Limits, gpunumber, "1") + addResource(p1.Spec.Containers[0].Resources.Limits, gpumemory, "1000") + + canAccess, _, err := gpuNode1.FilterNode(p1, "binpack") + if err != nil || canAccess != 0 { + t.Errorf("binpack filter failed %s", err.Error()) + } + + score := gpuNode1.ScoreNode(p1, "binpack") + if score-float64(4000*100)/float64(30000) > 0.05 { + t.Errorf("score failed expected %f, get %f", float64(4000*100)/float64(30000), score) + } + + gpuNode2 := vgpu.GPUDevices{ + Name: "node2", + Score: float64(0), + Device: make(map[int]*vgpu.GPUDevice), + } + gpuNode2.Device[0] = vgpu.NewGPUDevice(0, 30000) + gpuNode2.Device[0].Type = "NVIDIA" + gpuNode2.Device[0].Number = 10 + gpuNode2.Device[0].UsedNum = 0 + gpuNode2.Device[0].UsedMem = 0 + p2 := util.BuildPod("c2", "p4", "", v1.PodPending, api.BuildResourceList("2", "10Gi"), "pg1", make(map[string]string), make(map[string]string)) + addResource(p2.Spec.Containers[0].Resources.Requests, gpunumber, "1") + addResource(p2.Spec.Containers[0].Resources.Requests, gpumemory, "1000") + p2.Spec.Containers[0].Resources.Limits = make(v1.ResourceList) + addResource(p2.Spec.Containers[0].Resources.Limits, gpunumber, "1") + addResource(p2.Spec.Containers[0].Resources.Limits, gpumemory, "1000") + + canAccess, _, err = gpuNode2.FilterNode(p2, "spread") + if err != nil || canAccess != 0 { + t.Errorf("binpack filter failed 
%s", err.Error()) + } + + score = gpuNode2.ScoreNode(p1, "spread") + if score-float64(100) > 0.05 { + t.Errorf("score failed expected %f, get %f", float64(4000*100)/float64(30000), score) + } + +} diff --git a/pkg/scheduler/plugins/drf/hdrf_test.go b/pkg/scheduler/plugins/drf/hdrf_test.go index cc398a1fae..e73d328a65 100644 --- a/pkg/scheduler/plugins/drf/hdrf_test.go +++ b/pkg/scheduler/plugins/drf/hdrf_test.go @@ -252,7 +252,7 @@ func TestHDRF(t *testing.T) { binder := &util.FakeBinder{ Binds: map[string]string{}, - Channel: make(chan string), + Channel: make(chan string, 300), } schedulerCache := &cache.SchedulerCache{ Nodes: make(map[string]*api.NodeInfo), diff --git a/pkg/scheduler/plugins/factory.go b/pkg/scheduler/plugins/factory.go index 2a2056cc3b..7e654bbf6f 100644 --- a/pkg/scheduler/plugins/factory.go +++ b/pkg/scheduler/plugins/factory.go @@ -19,8 +19,10 @@ package plugins import ( "volcano.sh/volcano/pkg/scheduler/framework" "volcano.sh/volcano/pkg/scheduler/plugins/binpack" + "volcano.sh/volcano/pkg/scheduler/plugins/capacity" "volcano.sh/volcano/pkg/scheduler/plugins/cdp" "volcano.sh/volcano/pkg/scheduler/plugins/conformance" + "volcano.sh/volcano/pkg/scheduler/plugins/deviceshare" "volcano.sh/volcano/pkg/scheduler/plugins/drf" "volcano.sh/volcano/pkg/scheduler/plugins/extender" "volcano.sh/volcano/pkg/scheduler/plugins/gang" @@ -44,6 +46,7 @@ func init() { // Plugins for Jobs framework.RegisterPluginBuilder(drf.PluginName, drf.New) framework.RegisterPluginBuilder(gang.PluginName, gang.New) + framework.RegisterPluginBuilder(deviceshare.PluginName, deviceshare.New) framework.RegisterPluginBuilder(predicates.PluginName, predicates.New) framework.RegisterPluginBuilder(priority.PluginName, priority.New) framework.RegisterPluginBuilder(nodeorder.PluginName, nodeorder.New) @@ -62,6 +65,7 @@ func init() { // Plugins for Queues framework.RegisterPluginBuilder(proportion.PluginName, proportion.New) + framework.RegisterPluginBuilder(capacity.PluginName, capacity.New) // Plugins for Extender framework.RegisterPluginBuilder(extender.PluginName, extender.New) diff --git a/pkg/scheduler/plugins/gang/gang.go b/pkg/scheduler/plugins/gang/gang.go index a1c88c01c9..f2f49c22c6 100644 --- a/pkg/scheduler/plugins/gang/gang.go +++ b/pkg/scheduler/plugins/gang/gang.go @@ -99,7 +99,7 @@ func (gp *gangPlugin) OnSessionOpen(ssn *framework.Session) { } } - klog.V(4).Infof("Victims from Gang plugins are %+v", victims) + klog.V(4).InfoS("Victims from Gang plugins", "victims", victims, "preemptor", preemptor) return victims, util.Permit } diff --git a/pkg/scheduler/plugins/predicates/main_test.go b/pkg/scheduler/plugins/predicates/main_test.go new file mode 100644 index 0000000000..130832dc2e --- /dev/null +++ b/pkg/scheduler/plugins/predicates/main_test.go @@ -0,0 +1,29 @@ +/* +Copyright 2024 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
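A sketch of the registration pattern that factory.go uses above for the new capacity and deviceshare plugins. The "noop" plugin and its package are hypothetical; only the framework identifiers (framework.Plugin, framework.Arguments, framework.Session, framework.RegisterPluginBuilder) come from the patch, and a real plugin would install its hooks (e.g. ssn.AddPreemptiveFn, ssn.AddAllocatableFn) in OnSessionOpen.

package noop

import (
	"volcano.sh/volcano/pkg/scheduler/framework"
)

// PluginName is the name used to enable this (hypothetical) plugin in the scheduler configuration.
const PluginName = "noop"

type noopPlugin struct {
	// Arguments given for the plugin.
	pluginArguments framework.Arguments
}

// New follows the framework.PluginBuilder signature expected by RegisterPluginBuilder.
func New(arguments framework.Arguments) framework.Plugin {
	return &noopPlugin{pluginArguments: arguments}
}

func (np *noopPlugin) Name() string { return PluginName }

// OnSessionOpen is where a real plugin registers its session callbacks.
func (np *noopPlugin) OnSessionOpen(ssn *framework.Session) {}

// OnSessionClose releases any per-session state.
func (np *noopPlugin) OnSessionClose(ssn *framework.Session) {}

func init() {
	// Mirrors the calls added to pkg/scheduler/plugins/factory.go in this patch.
	framework.RegisterPluginBuilder(PluginName, New)
}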
+*/ + +package predicates + +import ( + "os" + "testing" + + "volcano.sh/volcano/cmd/scheduler/app/options" +) + +func TestMain(m *testing.M) { + options.Default() + os.Exit(m.Run()) +} diff --git a/pkg/scheduler/plugins/predicates/predicates.go b/pkg/scheduler/plugins/predicates/predicates.go index a5a99b0a5c..8c59f6f629 100644 --- a/pkg/scheduler/plugins/predicates/predicates.go +++ b/pkg/scheduler/plugins/predicates/predicates.go @@ -39,9 +39,6 @@ import ( "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumezone" "volcano.sh/volcano/pkg/scheduler/api" - "volcano.sh/volcano/pkg/scheduler/api/devices" - "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/gpushare" - "volcano.sh/volcano/pkg/scheduler/api/devices/nvidia/vgpu" "volcano.sh/volcano/pkg/scheduler/framework" "volcano.sh/volcano/pkg/scheduler/plugins/util/k8s" ) @@ -71,13 +68,6 @@ const ( // PodTopologySpreadEnable is the key for enabling Pod Topology Spread Predicates in scheduler configmap PodTopologySpreadEnable = "predicate.PodTopologySpreadEnable" - // GPUSharingPredicate is the key for enabling GPU Sharing Predicate in YAML - GPUSharingPredicate = "predicate.GPUSharingEnable" - NodeLockEnable = "predicate.NodeLockEnable" - GPUNumberPredicate = "predicate.GPUNumberEnable" - - VGPUEnable = "predicate.VGPUEnable" - // CachePredicate control cache predicate feature CachePredicate = "predicate.CacheEnable" @@ -176,18 +166,6 @@ func enablePredicate(args framework.Arguments) predicateEnable { args.GetBool(&predicate.volumeZoneEnable, VolumeZoneEnable) args.GetBool(&predicate.podTopologySpreadEnable, PodTopologySpreadEnable) - // Checks whether predicate.GPUSharingEnable is provided or not, if given, modifies the value in predicateEnable struct. - args.GetBool(&gpushare.GpuSharingEnable, GPUSharingPredicate) - args.GetBool(&gpushare.GpuNumberEnable, GPUNumberPredicate) - args.GetBool(&gpushare.NodeLockEnable, NodeLockEnable) - args.GetBool(&vgpu.VGPUEnable, VGPUEnable) - - if gpushare.GpuSharingEnable && gpushare.GpuNumberEnable { - klog.Fatal("can not define true in both gpu sharing and gpu number") - } - if (gpushare.GpuSharingEnable || gpushare.GpuNumberEnable) && vgpu.VGPUEnable { - klog.Fatal("gpu-share and vgpu can't be used together") - } args.GetBool(&predicate.cacheEnable, CachePredicate) // Checks whether predicate.ProportionalEnable is provided or not, if given, modifies the value in predicateEnable struct. 
args.GetBool(&predicate.proportionalEnable, ProportionalPredicate) @@ -527,35 +505,6 @@ func (pp *predicatesPlugin) OnSessionOpen(ssn *framework.Session) { } } - for _, val := range api.RegisteredDevices { - if dev, ok := node.Others[val].(api.Devices); ok { - if dev == nil { - predicateStatus = append(predicateStatus, &api.Status{ - Code: devices.Unschedulable, - Reason: "node not initialized with device" + val, - }) - return predicateStatus, fmt.Errorf("node not initialized with device %s", val) - } - code, msg, err := dev.FilterNode(task.Pod) - filterNodeStatus := &api.Status{ - Code: code, - Reason: msg, - } - if err != nil { - return predicateStatus, err - } - if filterNodeStatus.Code != api.Success { - predicateStatus = append(predicateStatus, filterNodeStatus) - return predicateStatus, fmt.Errorf("plugin device filternode predicates failed %s", msg) - } - } else { - klog.Warningf("Devices %s assertion conversion failed, skip", val) - } - } - - klog.V(4).Infof("checkNodeGPUPredicate predicates Task <%s/%s> on Node <%s>: fit %v", - task.Namespace, task.Name, node.Name, fit) - if predicate.proportionalEnable { // Check ProportionalPredicate proportionalStatus, err := checkNodeResourceIsProportional(task, node, predicate.proportional) diff --git a/pkg/scheduler/plugins/predicates/predicates_test.go b/pkg/scheduler/plugins/predicates/predicates_test.go index 1cbb1e77b1..70bc1bff00 100644 --- a/pkg/scheduler/plugins/predicates/predicates_test.go +++ b/pkg/scheduler/plugins/predicates/predicates_test.go @@ -1,28 +1,20 @@ package predicates import ( - "reflect" "testing" - "github.com/agiledragon/gomonkey/v2" - "github.com/spf13/pflag" - apiv1 "k8s.io/api/core/v1" schedulingv1 "k8s.io/api/scheduling/v1" - "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/tools/record" schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - "volcano.sh/volcano/cmd/scheduler/app/options" - "volcano.sh/volcano/pkg/kube" "volcano.sh/volcano/pkg/scheduler/actions/allocate" "volcano.sh/volcano/pkg/scheduler/api" - "volcano.sh/volcano/pkg/scheduler/cache" "volcano.sh/volcano/pkg/scheduler/conf" "volcano.sh/volcano/pkg/scheduler/framework" "volcano.sh/volcano/pkg/scheduler/plugins/gang" "volcano.sh/volcano/pkg/scheduler/plugins/priority" + "volcano.sh/volcano/pkg/scheduler/uthelper" "volcano.sh/volcano/pkg/scheduler/util" ) @@ -48,31 +40,12 @@ func getWorkerAffinity() *apiv1.Affinity { } func TestEventHandler(t *testing.T) { - var tmp *cache.SchedulerCache - patches := gomonkey.ApplyMethod(reflect.TypeOf(tmp), "AddBindTask", func(scCache *cache.SchedulerCache, task *api.TaskInfo) error { - scCache.Binder.Bind(nil, []*api.TaskInfo{task}) - return nil - }) - defer patches.Reset() - - framework.RegisterPluginBuilder(PluginName, New) - framework.RegisterPluginBuilder(gang.PluginName, gang.New) - framework.RegisterPluginBuilder(priority.PluginName, priority.New) - options.ServerOpts = options.NewServerOption() - defer framework.CleanupPluginBuilders() - - option := options.NewServerOption() - option.AddFlags(pflag.CommandLine) - option.RegisterOptions() - - config, err := kube.BuildConfig(option.KubeClientOptions) - if err != nil { - return + plugins := map[string]framework.PluginBuilder{ + PluginName: New, + gang.PluginName: gang.New, + priority.PluginName: priority.New, } - sc := cache.New(config, option.SchedulerNames, option.DefaultQueue, option.NodeSelector, option.NodeWorkerThreads, nil) - schedulerCache := sc.(*cache.SchedulerCache) - // pending 
pods w1 := util.BuildPod("ns1", "worker-1", "", apiv1.PodPending, api.BuildResourceList("3", "3k"), "pg1", map[string]string{"role": "worker"}, map[string]string{"selector": "worker"}) w2 := util.BuildPod("ns1", "worker-2", "", apiv1.PodPending, api.BuildResourceList("5", "5k"), "pg1", map[string]string{"role": "worker"}, map[string]string{}) @@ -84,8 +57,6 @@ func TestEventHandler(t *testing.T) { // nodes n1 := util.BuildNode("node1", api.BuildResourceList("4", "4k", []api.ScalarResource{{Name: "pods", Value: "10"}}...), map[string]string{"selector": "worker"}) n2 := util.BuildNode("node2", api.BuildResourceList("3", "3k", []api.ScalarResource{{Name: "pods", Value: "10"}}...), map[string]string{}) - n1.Status.Allocatable["pods"] = resource.MustParse("15") - n2.Status.Allocatable["pods"] = resource.MustParse("15") n1.Labels["kubernetes.io/hostname"] = "node1" n2.Labels["kubernetes.io/hostname"] = "node2" @@ -93,88 +64,34 @@ func TestEventHandler(t *testing.T) { p1 := &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: "p1"}, Value: 1} p2 := &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: "p2"}, Value: 2} // podgroup - pg1 := &schedulingv1beta1.PodGroup{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: "ns1", - Name: "pg1", - }, - Spec: schedulingv1beta1.PodGroupSpec{ - Queue: "q1", - MinMember: int32(2), - PriorityClassName: p2.Name, - }, - } - pg2 := &schedulingv1beta1.PodGroup{ - ObjectMeta: metav1.ObjectMeta{ - Namespace: "ns1", - Name: "pg2", - }, - Spec: schedulingv1beta1.PodGroupSpec{ - Queue: "q1", - MinMember: int32(1), - PriorityClassName: p1.Name, - }, - } + pg1 := util.BuildPodGroupWithPrio("pg1", "ns1", "q1", 2, nil, schedulingv1beta1.PodGroupInqueue, p2.Name) + pg2 := util.BuildPodGroupWithPrio("pg2", "ns1", "q1", 1, nil, schedulingv1beta1.PodGroupInqueue, p1.Name) + // queue - queue1 := &schedulingv1beta1.Queue{ - ObjectMeta: metav1.ObjectMeta{ - Name: "q1", - }, - } + queue1 := util.BuildQueue("q1", 0, nil) // tests - tests := []struct { - name string - pods []*apiv1.Pod - nodes []*apiv1.Node - pcs []*schedulingv1.PriorityClass - pgs []*schedulingv1beta1.PodGroup - expected map[string]string - }{ + tests := []uthelper.TestCommonStruct{ { - name: "pod-deallocate", - pods: []*apiv1.Pod{w1, w2, w3}, - nodes: []*apiv1.Node{n1, n2}, - pcs: []*schedulingv1.PriorityClass{p1, p2}, - pgs: []*schedulingv1beta1.PodGroup{pg1, pg2}, - expected: map[string]string{ // podKey -> node + Name: "pod-deallocate", + Plugins: plugins, + Pods: []*apiv1.Pod{w1, w2, w3}, + Nodes: []*apiv1.Node{n1, n2}, + PriClass: []*schedulingv1.PriorityClass{p1, p2}, + PodGroups: []*schedulingv1beta1.PodGroup{pg1, pg2}, + Queues: []*schedulingv1beta1.Queue{queue1}, + Bind: map[string]string{ // podKey -> node "ns1/worker-3": "node1", }, + BindsNum: 1, }, } - for _, test := range tests { - // initialize schedulerCache - binder := &util.FakeBinder{ - Binds: map[string]string{}, - Channel: make(chan string), - } - recorder := record.NewFakeRecorder(100) - go func() { - for { - event := <-recorder.Events - t.Logf("%s: [Event] %s", test.name, event) - } - }() - for _, node := range test.nodes { - schedulerCache.AddOrUpdateNode(node) - } - for _, pod := range test.pods { - schedulerCache.AddPod(pod) - } - for _, pc := range test.pcs { - schedulerCache.PriorityClasses[pc.Name] = pc - } - for _, pg := range test.pgs { - pg.Status = schedulingv1beta1.PodGroupStatus{ - Phase: schedulingv1beta1.PodGroupInqueue, - } - schedulerCache.AddPodGroupV1beta1(pg) - } - schedulerCache.AddQueueV1beta1(queue1) - 
// session + for i, test := range tests { + // allocate + actions := []framework.Action{allocate.New()} trueValue := true - ssn := framework.OpenSession(schedulerCache, []conf.Tier{ + tiers := []conf.Tier{ { Plugins: []conf.PluginOption{ { @@ -192,15 +109,14 @@ func TestEventHandler(t *testing.T) { }, }, }, - }, nil) - // allocate - allocator := allocate.New() - allocator.Execute(ssn) - framework.CloseSession(ssn) - - t.Logf("expected: %#v, got: %#v", test.expected, binder.Binds) - if !reflect.DeepEqual(test.expected, binder.Binds) { - t.Errorf("expected: %v, got %v ", test.expected, binder.Binds) } + t.Run(test.Name, func(t *testing.T) { + test.RegistSession(tiers, nil) + defer test.Close() + test.Run(actions) + if err := test.CheckAll(i); err != nil { + t.Fatal(err) + } + }) } } diff --git a/pkg/scheduler/plugins/proportion/proportion.go b/pkg/scheduler/plugins/proportion/proportion.go index 5baf64a7f5..0f397a4744 100644 --- a/pkg/scheduler/plugins/proportion/proportion.go +++ b/pkg/scheduler/plugins/proportion/proportion.go @@ -121,7 +121,8 @@ func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) { if attr.capability == nil { attr.realCapability = realCapability } else { - attr.realCapability = helpers.Min(realCapability, attr.capability) + realCapability.MinDimensionResource(attr.capability, api.Infinity) + attr.realCapability = realCapability } pp.queueOpts[job.Queue] = attr klog.V(4).Infof("Added Queue <%s> attributes.", job.Queue) @@ -159,22 +160,25 @@ func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) { attr.name, attr.allocated.String(), attr.request.String(), attr.inqueue.String(), attr.elastic.String()) } + // Record metrics for queueID, queueInfo := range ssn.Queues { - if _, ok := pp.queueOpts[queueID]; !ok { - metrics.UpdateQueueAllocated(queueInfo.Name, 0, 0) + if attr, ok := pp.queueOpts[queueID]; ok { + metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory) + metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory) + metrics.UpdateQueueWeight(attr.name, attr.weight) + queue := ssn.Queues[attr.queueID] + metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue) + metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending) + metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running) + metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown) + continue } - } - - // Record metrics - for _, attr := range pp.queueOpts { - metrics.UpdateQueueAllocated(attr.name, attr.allocated.MilliCPU, attr.allocated.Memory) - metrics.UpdateQueueRequest(attr.name, attr.request.MilliCPU, attr.request.Memory) - metrics.UpdateQueueWeight(attr.name, attr.weight) - queue := ssn.Queues[attr.queueID] - metrics.UpdateQueuePodGroupInqueueCount(attr.name, queue.Queue.Status.Inqueue) - metrics.UpdateQueuePodGroupPendingCount(attr.name, queue.Queue.Status.Pending) - metrics.UpdateQueuePodGroupRunningCount(attr.name, queue.Queue.Status.Running) - metrics.UpdateQueuePodGroupUnknownCount(attr.name, queue.Queue.Status.Unknown) + metrics.UpdateQueueAllocated(queueInfo.Name, 0, 0) + metrics.UpdateQueueRequest(queueInfo.Name, 0, 0) + metrics.UpdateQueuePodGroupInqueueCount(queueInfo.Name, 0) + metrics.UpdateQueuePodGroupPendingCount(queueInfo.Name, 0) + metrics.UpdateQueuePodGroupRunningCount(queueInfo.Name, 0) + metrics.UpdateQueuePodGroupUnknownCount(queueInfo.Name, 0) } remaining := pp.totalResource.Clone() diff --git 
a/pkg/scheduler/plugins/proportion/proportion_test.go b/pkg/scheduler/plugins/proportion/proportion_test.go index ea773179a5..f66b9d4414 100644 --- a/pkg/scheduler/plugins/proportion/proportion_test.go +++ b/pkg/scheduler/plugins/proportion/proportion_test.go @@ -124,6 +124,7 @@ func TestProportion(t *testing.T) { w1 := util.BuildPod("ns1", "worker-1", "", apiv1.PodRunning, api.BuildResourceList("3", "3k"), "pg1", map[string]string{"role": "worker"}, map[string]string{"selector": "worker"}) w2 := util.BuildPod("ns1", "worker-2", "", apiv1.PodRunning, api.BuildResourceList("5", "5k"), "pg1", map[string]string{"role": "worker"}, map[string]string{}) w3 := util.BuildPod("ns1", "worker-3", "", apiv1.PodRunning, api.BuildResourceList("4", "4k"), "pg2", map[string]string{"role": "worker"}, map[string]string{}) + w4 := util.BuildPod("ns1", "rdma-demo", "", apiv1.PodRunning, api.BuildResourceList("1", "1k", []api.ScalarResource{{Name: "nvidia.com/gpu", Value: "1"}, {Name: "rdma/hca", Value: "1"}}...), "pg3", map[string]string{}, map[string]string{}) w1.Spec.Affinity = getWorkerAffinity() w2.Spec.Affinity = getWorkerAffinity() w3.Spec.Affinity = getWorkerAffinity() @@ -131,10 +132,13 @@ func TestProportion(t *testing.T) { // nodes n1 := util.BuildNode("node1", api.BuildResourceList("4", "4k", []api.ScalarResource{{Name: "pods", Value: "10"}}...), map[string]string{"selector": "worker"}) n2 := util.BuildNode("node2", api.BuildResourceList("3", "3k", []api.ScalarResource{{Name: "pods", Value: "10"}}...), map[string]string{}) + n3 := util.BuildNode("node3", api.BuildResourceList("4", "4k", []api.ScalarResource{{Name: "pods", Value: "10"}, {Name: "nvidia.com/gpu", Value: "8"}, {Name: "rdma/hca", Value: "1k"}}...), map[string]string{}) n1.Status.Allocatable["pods"] = resource.MustParse("15") n2.Status.Allocatable["pods"] = resource.MustParse("15") + n3.Status.Allocatable["pods"] = resource.MustParse("15") n1.Labels["kubernetes.io/hostname"] = "node1" n2.Labels["kubernetes.io/hostname"] = "node2" + n3.Labels["kubernetes.io/hostname"] = "node3" // priority p1 := &schedulingv1.PriorityClass{ObjectMeta: metav1.ObjectMeta{Name: "p1"}, Value: 1} @@ -162,6 +166,20 @@ func TestProportion(t *testing.T) { PriorityClassName: p1.Name, }, } + pgRes3 := api.BuildResourceList("1", "1k", []api.ScalarResource{{Name: "nvidia.com/gpu", Value: "1"}, {Name: "rdma/hca", Value: "1"}}...) 
+ pg3 := &schedulingv1beta1.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: "ns1", + Name: "pg3", + }, + Spec: schedulingv1beta1.PodGroupSpec{ + Queue: "q2", + MinMember: int32(1), + PriorityClassName: p1.Name, + MinResources: &pgRes3, + }, + } + // queue queue1 := &schedulingv1beta1.Queue{ ObjectMeta: metav1.ObjectMeta{ @@ -169,6 +187,16 @@ func TestProportion(t *testing.T) { }, } + // queue + queue2 := &schedulingv1beta1.Queue{ + ObjectMeta: metav1.ObjectMeta{ + Name: "q2", + }, + Spec: schedulingv1beta1.QueueSpec{ + Capability: api.BuildResourceList("2", "2k", []api.ScalarResource{{Name: "pods", Value: "10"}, {Name: "nvidia.com/gpu", Value: "4"}}...), + }, + } + // tests tests := []struct { name string @@ -188,6 +216,16 @@ func TestProportion(t *testing.T) { "ns1/worker-3": "node1", }, }, + { + name: "realcapability-test", + pods: []*apiv1.Pod{w1, w2, w3, w4}, + nodes: []*apiv1.Node{n1, n2, n3}, + pcs: []*schedulingv1.PriorityClass{p1, p2}, + pgs: []*schedulingv1beta1.PodGroup{pg1, pg2, pg3}, + expected: map[string]string{ // podKey -> node + "ns1/rdma-demo": "node3", + }, + }, } for _, test := range tests { @@ -232,6 +270,7 @@ func TestProportion(t *testing.T) { schedulerCache.AddPodGroupV1beta1(pg) } schedulerCache.AddQueueV1beta1(queue1) + schedulerCache.AddQueueV1beta1(queue2) // session trueValue := true diff --git a/pkg/scheduler/plugins/task-topology/topology.go b/pkg/scheduler/plugins/task-topology/topology.go index 5c05b66a51..268deaea84 100644 --- a/pkg/scheduler/plugins/task-topology/topology.go +++ b/pkg/scheduler/plugins/task-topology/topology.go @@ -247,10 +247,15 @@ func affinityCheck(job *api.JobInfo, affinity [][]string) error { var taskNumber = len(job.Tasks) var taskRef = make(map[string]bool, taskNumber) + var jobNamePrefix = job.Name + "-" for _, task := range job.Tasks { - tmpStrings := strings.Split(task.Name, "-") - if _, exist := taskRef[tmpStrings[len(tmpStrings)-2]]; !exist { - taskRef[tmpStrings[len(tmpStrings)-2]] = true + // the full task name looks like "${job name}-${task name}-${index}", + // so we can trim the jobNamePrefix and the indexSuffix to get the short task name. 
+ tmpTaskName := task.Name[:strings.LastIndex(task.Name, "-")] + tmpTaskName = strings.TrimPrefix(tmpTaskName, jobNamePrefix) + + if _, exist := taskRef[tmpTaskName]; !exist { + taskRef[tmpTaskName] = true } } diff --git a/pkg/scheduler/plugins/task-topology/topology_test.go b/pkg/scheduler/plugins/task-topology/topology_test.go index 2d4b0af608..6d304e6bd3 100644 --- a/pkg/scheduler/plugins/task-topology/topology_test.go +++ b/pkg/scheduler/plugins/task-topology/topology_test.go @@ -100,6 +100,72 @@ func Test_readTopologyFromPgAnnotations(t *testing.T) { }, err: nil, }, + { + description: "correct annotation with tasks whose names contain `-`", + job: &api.JobInfo{ + Name: "job1", + Namespace: "default", + Tasks: map[api.TaskID]*api.TaskInfo{ + "0": { + Name: "job1-ps-some-0", + }, + "1": { + Name: "job1-ps-some-1", + }, + "2": { + Name: "job1-worker-another-some-0", + }, + "3": { + Name: "job1-worker-another-some-1", + }, + "4": { + Name: "job1-chief-kk-0", + }, + "5": { + Name: "job1-evaluator-tt-0", + }, + }, + PodGroup: &api.PodGroup{ + PodGroup: scheduling.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + JobAffinityAnnotations: "ps-some,worker-another-some;ps-some,chief-kk", + JobAntiAffinityAnnotations: "ps-some;worker-another-some,chief-kk", + TaskOrderAnnotations: "ps-some,worker-another-some,chief-kk,evaluator-tt", + }, + }, + }, + }, + }, + topology: &TaskTopology{ + Affinity: [][]string{ + { + "ps-some", + "worker-another-some", + }, + { + "ps-some", + "chief-kk", + }, + }, + AntiAffinity: [][]string{ + { + "ps-some", + }, + { + "worker-another-some", + "chief-kk", + }, + }, + TaskOrder: []string{ + "ps-some", + "worker-another-some", + "chief-kk", + "evaluator-tt", + }, + }, + err: nil, + }, { description: "nil annotation", job: &api.JobInfo{ diff --git a/pkg/scheduler/plugins/util/util.go b/pkg/scheduler/plugins/util/util.go index 039b54ca89..8e3dab2916 100644 --- a/pkg/scheduler/plugins/util/util.go +++ b/pkg/scheduler/plugins/util/util.go @@ -106,12 +106,13 @@ func GetInqueueResource(job *api.JobInfo, allocated *api.Resource) *api.Resource continue } ignore := false - for _, ignoredDevice := range api.IgnoredDevicesList { + api.IgnoredDevicesList.Range(func(_ int, ignoredDevice string) bool { if len(ignoredDevice) > 0 && strings.Contains(rName.String(), ignoredDevice) { ignore = true - break + return false } - } + return true + }) if ignore { continue } diff --git a/pkg/scheduler/uthelper/helper.go b/pkg/scheduler/uthelper/helper.go new file mode 100644 index 0000000000..a3fb030ec6 --- /dev/null +++ b/pkg/scheduler/uthelper/helper.go @@ -0,0 +1,234 @@ +/* +Copyright 2024 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package uthelper + +import ( + "fmt" + "reflect" + "time" + + v1 "k8s.io/api/core/v1" + schedulingv1 "k8s.io/api/scheduling/v1" + + "volcano.sh/apis/pkg/apis/scheduling" + vcapisv1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" + "volcano.sh/volcano/pkg/scheduler/api" + "volcano.sh/volcano/pkg/scheduler/cache" + "volcano.sh/volcano/pkg/scheduler/conf" + "volcano.sh/volcano/pkg/scheduler/framework" + "volcano.sh/volcano/pkg/scheduler/util" +) + +// RegistPlugins plugins +func RegistPlugins(plugins map[string]framework.PluginBuilder) { + for name, plugin := range plugins { + framework.RegisterPluginBuilder(name, plugin) + } +} + +// TestCommonStruct is the most common used resource when do UT +// others can wrap it in a new struct +type TestCommonStruct struct { + Name string + Plugins map[string]framework.PluginBuilder // plugins for each case + Pods []*v1.Pod + Nodes []*v1.Node + PodGroups []*vcapisv1.PodGroup + Queues []*vcapisv1.Queue + PriClass []*schedulingv1.PriorityClass + Bind map[string]string // bind results: ns/podName -> nodeName + PipeLined map[string][]string // pipelined results: map[jobID][]{nodename} + Evicted []string // evicted pods list of ns/podName + Status map[api.JobID]scheduling.PodGroupPhase // final status + BindsNum int // binds events numbers + EvictNum int // evict events numbers, include preempted and reclaimed evict events + + // fake interface instance when check results need + stop chan struct{} + binder cache.Binder + evictor cache.Evictor + stsUpdator cache.StatusUpdater + volBinder cache.VolumeBinder + ssn *framework.Session // store opened session +} + +var _ Interface = &TestCommonStruct{} + +// RegistSession open session with tiers and configuration, and mock schedulerCache with self-defined FakeBinder and FakeEvictor +func (test *TestCommonStruct) RegistSession(tiers []conf.Tier, config []conf.Configuration) *framework.Session { + binder := &util.FakeBinder{ + Binds: map[string]string{}, + Channel: make(chan string), + } + evictor := &util.FakeEvictor{ + Channel: make(chan string), + } + stsUpdator := &util.FakeStatusUpdater{} + test.binder = binder + test.evictor = evictor + test.stop = make(chan struct{}) + // Create scheduler cache with self-defined binder and evictor + schedulerCache := cache.NewCustomMockSchedulerCache("utmock-scheduler", binder, evictor, stsUpdator, nil, nil, nil) + test.stsUpdator = schedulerCache.StatusUpdater + test.volBinder = schedulerCache.VolumeBinder + + for _, node := range test.Nodes { + schedulerCache.AddOrUpdateNode(node) + } + for _, pod := range test.Pods { + schedulerCache.AddPod(pod) + } + for _, pg := range test.PodGroups { + schedulerCache.AddPodGroupV1beta1(pg) + } + for _, queue := range test.Queues { + schedulerCache.AddQueueV1beta1(queue) + } + for _, pc := range test.PriClass { + schedulerCache.AddPriorityClass(pc) + } + + RegistPlugins(test.Plugins) + ssn := framework.OpenSession(schedulerCache, tiers, config) + test.ssn = ssn + schedulerCache.Run(test.stop) + return ssn +} + +// Run choose to run passed in actions; if no actions provided, will panic +func (test *TestCommonStruct) Run(actions []framework.Action) { + if len(actions) == 0 { + panic("no actions provided, please specify a list of actions to execute") + } + for _, action := range actions { + action.Initialize() + action.Execute(test.ssn) + action.UnInitialize() + } +} + +// Close do release resource and clean up +func (test *TestCommonStruct) Close() { + framework.CloseSession(test.ssn) + framework.CleanupPluginBuilders() + 
close(test.stop) +} + +// CheckAll checks all required statuses +func (test *TestCommonStruct) CheckAll(caseIndex int) (err error) { + if err = test.CheckBind(caseIndex); err != nil { + return + } + if err = test.CheckEvict(caseIndex); err != nil { + return + } + if err = test.CheckPipelined(caseIndex); err != nil { + return + } + return test.CheckPGStatus(caseIndex) +} + +// CheckBind checks the expected bind results +func (test *TestCommonStruct) CheckBind(caseIndex int) error { + binder := test.binder.(*util.FakeBinder) + for i := 0; i < test.BindsNum; i++ { + select { + case <-binder.Channel: + case <-time.After(300 * time.Millisecond): + return fmt.Errorf("Failed to get Bind request in case %d(%s).", caseIndex, test.Name) + } + } + + if len(test.Bind) != len(binder.Binds) { + return fmt.Errorf("case %d(%s) check bind: \nwant: %v, \ngot %v ", caseIndex, test.Name, test.Bind, binder.Binds) + } + for key, value := range test.Bind { + got := binder.Binds[key] + if value != got { + return fmt.Errorf("case %d(%s) check bind: \nwant: %v->%v\n got: %v->%v ", caseIndex, test.Name, key, value, key, got) + } + } + return nil +} + +// CheckEvict checks the evicted results +func (test *TestCommonStruct) CheckEvict(caseIndex int) error { + evictor := test.evictor.(*util.FakeEvictor) + for i := 0; i < test.EvictNum; i++ { + select { + case <-evictor.Channel: + case <-time.After(300 * time.Millisecond): + return fmt.Errorf("Failed to get Evict request in case %d(%s).", caseIndex, test.Name) + } + } + + evicts := evictor.Evicts() + if len(test.Evicted) != len(evicts) { + return fmt.Errorf("case %d(%s) check evict: \nwant: %v, \ngot %v ", caseIndex, test.Name, test.Evicted, evicts) + } + + expect := map[string]int{} // expected eviction counts + got := map[string]int{} + for _, v := range test.Evicted { + expect[v]++ + } + for _, v := range evicts { + got[v]++ + } + + if !reflect.DeepEqual(expect, got) { + return fmt.Errorf("case %d(%s) check evict: \nwant: %v\n got: %v ", caseIndex, test.Name, expect, got) + } + return nil +} + +// CheckPGStatus checks the jobs' podgroup statuses +func (test *TestCommonStruct) CheckPGStatus(caseIndex int) error { + ssn := test.ssn + for jobID, phase := range test.Status { + job := ssn.Jobs[jobID] + if job == nil { + return fmt.Errorf("case %d(%s) check podgroup status, job <%v> doesn't exist in session", caseIndex, test.Name, jobID) + } + got := job.PodGroup.Status.Phase + if phase != got { + return fmt.Errorf("case %d(%s) check podgroup <%v> status:\n want: %v, got: %v", caseIndex, test.Name, jobID, phase, got) + } + } + return nil +} + +// CheckPipelined checks the pipelined results +func (test *TestCommonStruct) CheckPipelined(caseIndex int) error { + ssn := test.ssn + for jobID, nodes := range test.PipeLined { + job := ssn.Jobs[api.JobID(jobID)] + if job == nil { + return fmt.Errorf("case %d(%s) check pipeline, job <%v> doesn't exist in session", caseIndex, test.Name, jobID) + } + pipeLined := job.TaskStatusIndex[api.Pipelined] + if len(pipeLined) == 0 { + return fmt.Errorf("case %d(%s) check pipeline, want pipelined job: %v, actually no tasks were pipelined to nodes %v", caseIndex, test.Name, jobID, nodes) + } + for _, task := range pipeLined { + if !Contains(nodes, task.NodeName) { + return fmt.Errorf("case %d(%s) check pipeline: actual: %v->%v, want: %v->%v", caseIndex, test.Name, task.Name, task.NodeName, task.Name, nodes) + } + } + } + return nil +} diff --git a/pkg/scheduler/uthelper/interface.go b/pkg/scheduler/uthelper/interface.go new file mode 100644 index 0000000000..b67d8c1761 ---
/dev/null +++ b/pkg/scheduler/uthelper/interface.go @@ -0,0 +1,42 @@ +/* +Copyright 2024 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package uthelper + +import ( + "volcano.sh/volcano/pkg/scheduler/conf" + "volcano.sh/volcano/pkg/scheduler/framework" +) + +// Interface is the UT framework interface +type Interface interface { + // Run executes the actions + Run(actions []framework.Action) + // RegistSession initializes the session + RegistSession(tiers []conf.Tier, config []conf.Configuration) *framework.Session + // Close releases the session and does cleanup + Close() + // CheckAll runs all checks + CheckAll(caseIndex int) (err error) + // CheckBind checks the bind results of the allocate action + CheckBind(caseIndex int) error + // CheckEvict checks the evict results of the preempt and reclaim actions + CheckEvict(caseIndex int) error + // CheckPipelined checks the pipelined results + CheckPipelined(caseIndex int) error + // CheckPGStatus checks the jobs' podgroup statuses + CheckPGStatus(caseIndex int) error +} diff --git a/pkg/scheduler/uthelper/util.go b/pkg/scheduler/uthelper/util.go new file mode 100644 index 0000000000..e767fb7d2d --- /dev/null +++ b/pkg/scheduler/uthelper/util.go @@ -0,0 +1,27 @@ +/* +Copyright 2024 The Volcano Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+*/ + +package uthelper + +// Contains reports whether key exists in the given slice of elements +func Contains[T comparable](elements []T, key T) bool { + for _, v := range elements { + if v == key { + return true + } + } + return false +} diff --git a/pkg/scheduler/util/test_utils.go b/pkg/scheduler/util/test_utils.go index 70f208ed33..a11a0c10f8 100644 --- a/pkg/scheduler/util/test_utils.go +++ b/pkg/scheduler/util/test_utils.go @@ -23,6 +23,7 @@ import ( "time" v1 "k8s.io/api/core/v1" + schedulingv1 "k8s.io/api/scheduling/v1" storagev1 "k8s.io/api/storage/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -32,7 +33,6 @@ import ( "k8s.io/klog/v2" schedulingv1beta1 "volcano.sh/apis/pkg/apis/scheduling/v1beta1" - "volcano.sh/volcano/pkg/scheduler/api" volumescheduling "volcano.sh/volcano/pkg/scheduler/capabilities/volumebinding" ) @@ -179,61 +179,14 @@ func BuildDynamicPVC(namespace, name string, req v1.ResourceList) (*v1.Persisten // BuildBestEffortPod builds a BestEffort pod object func BuildBestEffortPod(namespace, name, nodeName string, p v1.PodPhase, groupName string, labels map[string]string, selector map[string]string) *v1.Pod { - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: types.UID(fmt.Sprintf("%v-%v", namespace, name)), - Name: name, - Namespace: namespace, - Labels: labels, - Annotations: map[string]string{ - schedulingv1beta1.KubeGroupNameAnnotationKey: groupName, - }, - }, - Status: v1.PodStatus{ - Phase: p, - }, - Spec: v1.PodSpec{ - NodeName: nodeName, - NodeSelector: selector, - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: v1.ResourceList{}, - }, - }, - }, - }, - } + return BuildPod(namespace, name, nodeName, p, v1.ResourceList{}, groupName, labels, selector) } // BuildPodWithPriority builds a pod object with priority func BuildPodWithPriority(namespace, name, nodeName string, p v1.PodPhase, req v1.ResourceList, groupName string, labels map[string]string, selector map[string]string, priority *int32) *v1.Pod { - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - UID: types.UID(fmt.Sprintf("%v-%v", namespace, name)), - Name: name, - Namespace: namespace, - Labels: labels, - Annotations: map[string]string{ - schedulingv1beta1.KubeGroupNameAnnotationKey: groupName, - }, - }, - Status: v1.PodStatus{ - Phase: p, - }, - Spec: v1.PodSpec{ - NodeName: nodeName, - NodeSelector: selector, - Priority: priority, - Containers: []v1.Container{ - { - Resources: v1.ResourceRequirements{ - Requests: req, - }, - }, - }, - }, - } + pod := BuildPod(namespace, name, nodeName, p, req, groupName, labels, selector) + pod.Spec.Priority = priority + return pod } // BuildPodGroup return podgroup with base spec and phase status @@ -276,17 +229,46 @@ func BuildQueue(qname string, weight int32, cap v1.ResourceList) *schedulingv1be } } +// BuildQueueWithAnnos returns a Queue with annotations +func BuildQueueWithAnnos(qname string, weight int32, cap v1.ResourceList, annos map[string]string) *schedulingv1beta1.Queue { + queue := BuildQueue(qname, weight, cap) + queue.ObjectMeta.Annotations = annos + return queue +} + +// BuildQueueWithResourcesQuantity returns a queue with the given deserved and capability resource quantities.
+func BuildQueueWithResourcesQuantity(qname string, deserved, cap v1.ResourceList) *schedulingv1beta1.Queue { + queue := BuildQueue(qname, 1, cap) + queue.Spec.Deserved = deserved + return queue +} + +// ////// built-in resources ////// +// BuildPriorityClass returns a PriorityClass with the given name and value +func BuildPriorityClass(name string, value int32) *schedulingv1.PriorityClass { + return &schedulingv1.PriorityClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + }, + Value: value, + } +} + // FakeBinder is used as fake binder type FakeBinder struct { + sync.Mutex Binds map[string]string Channel chan string } // Bind used by fake binder struct to bind pods func (fb *FakeBinder) Bind(kubeClient kubernetes.Interface, tasks []*api.TaskInfo) ([]*api.TaskInfo, error) { + fb.Lock() + defer fb.Unlock() for _, p := range tasks { key := fmt.Sprintf("%v/%v", p.Namespace, p.Name) fb.Binds[key] = p.NodeName + fb.Channel <- key // notify the channel so that callers can wait for the asynchronous bind to finish } return nil, nil @@ -336,6 +318,11 @@ func (ftsu *FakeStatusUpdater) UpdatePodGroup(pg *api.PodGroup) (*api.PodGroup, return nil, nil } +// UpdateQueueStatus performs a fake no-op update +func (ftsu *FakeStatusUpdater) UpdateQueueStatus(queue *api.QueueInfo) error { + return nil +} + // FakeVolumeBinder is used as fake volume binder type FakeVolumeBinder struct { volumeBinder volumescheduling.SchedulerVolumeBinder diff --git a/pkg/scheduler/util_test.go b/pkg/scheduler/util_test.go index 2a0c953637..1aa645103d 100644 --- a/pkg/scheduler/util_test.go +++ b/pkg/scheduler/util_test.go @@ -50,6 +50,7 @@ tiers: EnabledJobPipelined: &trueValue, EnabledTaskOrder: &trueValue, EnabledPreemptable: &trueValue, + EnablePreemptive: &trueValue, EnabledReclaimable: &trueValue, EnabledQueueOrder: &trueValue, EnabledPredicate: &trueValue, @@ -70,6 +71,7 @@ tiers: EnabledJobPipelined: &trueValue, EnabledTaskOrder: &trueValue, EnabledPreemptable: &trueValue, + EnablePreemptive: &trueValue, EnabledReclaimable: &trueValue, EnabledQueueOrder: &trueValue, EnabledPredicate: &trueValue, @@ -90,6 +92,7 @@ tiers: EnabledJobPipelined: &trueValue, EnabledTaskOrder: &trueValue, EnabledPreemptable: &trueValue, + EnablePreemptive: &trueValue, EnabledReclaimable: &trueValue, EnabledQueueOrder: &trueValue, EnabledPredicate: &trueValue, @@ -114,6 +117,7 @@ tiers: EnabledJobPipelined: &trueValue, EnabledTaskOrder: &trueValue, EnabledPreemptable: &trueValue, + EnablePreemptive: &trueValue, EnabledReclaimable: &trueValue, EnabledQueueOrder: &trueValue, EnabledPredicate: &trueValue, @@ -134,6 +138,7 @@ tiers: EnabledJobPipelined: &trueValue, EnabledTaskOrder: &trueValue, EnabledPreemptable: &trueValue, + EnablePreemptive: &trueValue, EnabledReclaimable: &trueValue, EnabledQueueOrder: &trueValue, EnabledPredicate: &trueValue, @@ -154,6 +159,7 @@ tiers: EnabledJobPipelined: &trueValue, EnabledTaskOrder: &trueValue, EnabledPreemptable: &trueValue, + EnablePreemptive: &trueValue, EnabledReclaimable: &trueValue, EnabledQueueOrder: &trueValue, EnabledPredicate: &trueValue, @@ -174,6 +180,7 @@ tiers: EnabledJobPipelined: &trueValue, EnabledTaskOrder: &trueValue, EnabledPreemptable: &trueValue, + EnablePreemptive: &trueValue, EnabledReclaimable: &trueValue, EnabledQueueOrder: &trueValue, EnabledPredicate: &trueValue, diff --git a/test/e2e/schedulingaction/reclaim.go b/test/e2e/schedulingaction/reclaim.go index 23c1927705..d5025c3f20 100644 --- a/test/e2e/schedulingaction/reclaim.go +++ b/test/e2e/schedulingaction/reclaim.go @@ -23,7 +23,9 @@ import ( . "github.com/onsi/ginkgo/v2" .
"github.com/onsi/gomega" + "gopkg.in/yaml.v2" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" @@ -535,6 +537,117 @@ var _ = Describe("Reclaim E2E Test", func() { }) + // Reclaim for capacity plugin. + It("Capacity Reclaim Case 11: Multi reclaimed queue", func() { + // First replace proportion with capacity plugin. + cmc := e2eutil.NewConfigMapCase("volcano-system", "integration-scheduler-configmap") + cmc.ChangeBy(func(data map[string]string) (changed bool, changedBefore map[string]string) { + vcScheConfStr, ok := data["volcano-scheduler-ci.conf"] + Expect(ok).To(BeTrue()) + + schedulerConf := &e2eutil.SchedulerConfiguration{} + err := yaml.Unmarshal([]byte(vcScheConfStr), schedulerConf) + Expect(err).NotTo(HaveOccurred()) + for _, tier := range schedulerConf.Tiers { + for i, plugin := range tier.Plugins { + if plugin.Name == "proportion" { + tier.Plugins[i].Name = "capacity" + break + } + } + } + + newVCScheConfBytes, err := yaml.Marshal(schedulerConf) + Expect(err).NotTo(HaveOccurred()) + + changed = true + changedBefore = make(map[string]string) + changedBefore["volcano-scheduler-ci.conf"] = vcScheConfStr + data["volcano-scheduler-ci.conf"] = string(newVCScheConfBytes) + return + }) + defer cmc.UndoChanged() + + q1 := "reclaim-q1" + q2 := "reclaim-q2" + q3 := "reclaim-q3" + q4 := "reclaim-q4" + ctx := e2eutil.InitTestContext(e2eutil.Options{ + Queues: []string{q1, q2, q3, q4}, + NodesNumLimit: 4, + DeservedResource: map[string]v1.ResourceList{ + q1: {v1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(1024*1024*1024, resource.BinarySI)}, + q2: {v1.ResourceCPU: *resource.NewQuantity(1, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(1024*1024*1024, resource.BinarySI)}, + q3: {v1.ResourceCPU: *resource.NewQuantity(2, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(2*1024*1024*1024, resource.BinarySI)}, + q4: {v1.ResourceCPU: *resource.NewQuantity(4, resource.DecimalSI), v1.ResourceMemory: *resource.NewQuantity(4*1024*1024*1024, resource.BinarySI)}, + }, + NodesResourceLimit: e2eutil.CPU2Mem2, + PriorityClasses: map[string]int32{ + "low-priority": 10, + "high-priority": 10000, + }, + }) + + defer e2eutil.CleanupTestContext(ctx) + + By("Setup initial jobs") + + spec := &e2eutil.JobSpec{ + Tasks: []e2eutil.TaskSpec{ + { + Img: e2eutil.DefaultNginxImage, + Req: e2eutil.CPU1Mem1, + Min: 1, + Rep: 4, + Labels: map[string]string{schedulingv1beta1.PodPreemptable: "true"}, + }, + }, + } + + spec.Name = "reclaim-j1" + spec.Queue = q1 + spec.Pri = "low-priority" + job1 := e2eutil.CreateJob(ctx, spec) + err := e2eutil.WaitJobReady(ctx, job1) + Expect(err).NotTo(HaveOccurred()) + + spec.Name = "reclaim-j2" + spec.Queue = q2 + spec.Pri = "low-priority" + job2 := e2eutil.CreateJob(ctx, spec) + err = e2eutil.WaitJobReady(ctx, job2) + Expect(err).NotTo(HaveOccurred()) + + err = WaitQueueStatus(ctx, "Running", 1, q1) + Expect(err).NotTo(HaveOccurred(), "Error waiting for queue1 running") + + err = WaitQueueStatus(ctx, "Running", 1, q2) + Expect(err).NotTo(HaveOccurred(), "Error waiting for queue2 running") + + By("Create coming jobs") + + _, err = CreateReclaimJob(ctx, e2eutil.CPU2Mem2, "reclaim-j3", q3, "high-priority", "", true) + Expect(err).NotTo(HaveOccurred(), "Wait for job3 failed") + + _, err = CreateReclaimJob(ctx, e2eutil.CPU2Mem2, "reclaim-j4", q4, "high-priority", "", true) + Expect(err).NotTo(HaveOccurred(), "Wait for 
job4 failed") + + _, err = CreateReclaimJob(ctx, e2eutil.CPU1Mem1, "reclaim-j5", q4, "high-priority", "", true) + Expect(err).NotTo(HaveOccurred(), "Wait for job5 failed") + + _, err = CreateReclaimJob(ctx, e2eutil.CPU1Mem1, "reclaim-j6", q4, "high-priority", "", true) + Expect(err).NotTo(HaveOccurred(), "Wait for job6 failed") + + By("Make sure all job running") + + err = WaitQueueStatus(ctx, "Running", 1, q3) + Expect(err).NotTo(HaveOccurred(), "Error waiting for queue3 running") + + err = WaitQueueStatus(ctx, "Running", 3, q4) + Expect(err).NotTo(HaveOccurred(), "Error waiting for queue4 running") + + }) + It("Reclaim", func() { Skip("skip: the case has some problem") q1, q2 := "reclaim-q1", "reclaim-q2" diff --git a/test/e2e/stress/queue.go b/test/e2e/stress/queue.go index cda028240f..ad8db87831 100644 --- a/test/e2e/stress/queue.go +++ b/test/e2e/stress/queue.go @@ -53,7 +53,7 @@ var _ = ginkgo.Describe("[Stress] Queue Test", func() { defer wg.Done() queueName := fmt.Sprintf("queue-%d", index) - e2eutil.CreateQueue(ctx, queueName) + e2eutil.CreateQueue(ctx, queueName, nil) err := e2eutil.WaitQueueStatus(func() (bool, error) { queue, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), queueName, metav1.GetOptions{}) gomega.Expect(err).NotTo(gomega.HaveOccurred()) diff --git a/test/e2e/util/queue.go b/test/e2e/util/queue.go index 87e955c1cf..f936e69f98 100644 --- a/test/e2e/util/queue.go +++ b/test/e2e/util/queue.go @@ -58,7 +58,7 @@ func CreateQueueWithQueueSpec(ctx *TestContext, queueSpec *QueueSpec) { } // CreateQueue creates Queue with the specified name -func CreateQueue(ctx *TestContext, q string) { +func CreateQueue(ctx *TestContext, q string, deservedResource v1.ResourceList) { _, err := ctx.Vcclient.SchedulingV1beta1().Queues().Get(context.TODO(), q, metav1.GetOptions{}) if err != nil { _, err := ctx.Vcclient.SchedulingV1beta1().Queues().Create(context.TODO(), &schedulingv1beta1.Queue{ @@ -66,7 +66,8 @@ func CreateQueue(ctx *TestContext, q string) { Name: q, }, Spec: schedulingv1beta1.QueueSpec{ - Weight: 1, + Weight: 1, + Deserved: deservedResource, }, }, metav1.CreateOptions{}) Expect(err).NotTo(HaveOccurred(), "failed to create queue %s", q) @@ -78,7 +79,7 @@ func CreateQueues(ctx *TestContext) { By("Creating Queues") for _, queue := range ctx.Queues { - CreateQueue(ctx, queue) + CreateQueue(ctx, queue, ctx.DeservedResource[queue]) } // wait for all queues state open diff --git a/test/e2e/util/util.go b/test/e2e/util/util.go index ed0e3d2d5b..34398bbc57 100644 --- a/test/e2e/util/util.go +++ b/test/e2e/util/util.go @@ -120,6 +120,7 @@ type TestContext struct { Namespace string Queues []string + DeservedResource map[string]v1.ResourceList PriorityClasses map[string]int32 UsingPlaceHolder bool } @@ -127,6 +128,7 @@ type TestContext struct { type Options struct { Namespace string Queues []string + DeservedResource map[string]v1.ResourceList PriorityClasses map[string]int32 NodesNumLimit int NodesResourceLimit v1.ResourceList @@ -144,6 +146,7 @@ func InitTestContext(o Options) *TestContext { ctx := &TestContext{ Namespace: o.Namespace, Queues: o.Queues, + DeservedResource: o.DeservedResource, PriorityClasses: o.PriorityClasses, Vcclient: VcClient, Kubeclient: KubeClient,