Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support mgpu feature for volcano #3030

Open
wants to merge 5 commits into
base: release-1.8
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
module volcano.sh/volcano

go 1.20
go 1.19

require (
github.com/agiledragon/gomonkey/v2 v2.1.0
Expand Down
6 changes: 5 additions & 1 deletion pkg/scheduler/api/devices/nvidia/gpushare/device_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,10 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices {
return &gpudevices
}

// DeepCopy returns a copy of the GPUDevices for snapshotting.
//
// NOTE(review): despite its name, this does NOT copy anything — it returns
// the receiver itself, so the "copy" aliases the original GPUDevices and any
// mutation through one is visible through the other. If the scheduler's
// snapshot/cache layer relies on this method for isolation, it must be
// replaced with a real deep copy of the GPUDevices fields. TODO: confirm
// the intended semantics with the Devices interface contract.
func (gs *GPUDevices) DeepCopy() interface{} {
	return gs
}

// GetIgnoredDevices return device names which wish vc-scheduler to ignore
func (gs *GPUDevices) GetIgnoredDevices() []string {
return []string{""}
Expand Down Expand Up @@ -148,7 +152,7 @@ func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) erro
return nil
}

func (gs *GPUDevices) FilterNode(pod *v1.Pod) (int, string, error) {
func (gs *GPUDevices) FilterNode(kubeClient kubernetes.Interface, pod *v1.Pod) (int, string, error) {
klog.V(4).Infoln("DeviceSharing:Into FitInPod", pod.Name)
if GpuSharingEnable {
fit, err := checkNodeGPUSharingPredicate(pod, gs)
Expand Down
110 changes: 110 additions & 0 deletions pkg/scheduler/api/devices/nvidia/mgpu/constants.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package mgpu

import v1 "k8s.io/api/core/v1"

/*
actions: "reclaim, allocate, backfill, preempt"
tiers:
- plugins:
- name: priority
- name: gang
- name: conformance
- plugins:
- name: drf
- name: predicates
arguments:
predicate.MGPUEnable: true
predicate.MGPUPolicy: binpack
predicate.MGPUWeightOfCore: 20
predicate.MGPUScheduleMode: index
predicate.MGPUMaxContainersPerCard: 16
- name: proportion
- name: nodeorder
*/

var (
	// MGPUEnable toggles the mgpu scheduling logic; per the example scheduler
	// configuration above, it is set from the predicates plugin argument
	// "predicate.MGPUEnable".
	MGPUEnable bool
	// GlobalConfig holds the package-wide mgpu scheduling configuration.
	// It is pre-populated with the package defaults and presumably
	// overwritten from the plugin arguments listed in the example
	// configuration above — confirm against the plugin's argument parsing.
	GlobalConfig = &MGPUConfig{
		Policy:                      Binpack,
		MaxContainersNumPolicyValid: DefaultMaxContainersNumPolicyValid,
		WeightOfCore:                DefaultWeightOfCore,
		ScheduleMode:                DefaultScheduleMode,
		MaxContainersPerCard:        DefaultMaxContainersPerCard,
	}
)

// MGPUConfig groups the tunables for mgpu (shared-GPU) scheduling.
type MGPUConfig struct {
	// Policy is the card-level scoring policy, e.g. binpack/spread.
	Policy string
	// MaxContainersNumPolicyValid means that when the relevant count is
	// bigger than this number, the GPU Policy is invalid (presumably the
	// number of GPU containers — confirm against the code that reads it).
	MaxContainersNumPolicyValid int
	// WeightOfCore is the weight of core, e.g. 20.
	WeightOfCore int
	// ScheduleMode is the schedule mode, e.g. index/id.
	ScheduleMode string
	// MaxContainersPerCard is the number of max containers per card, e.g. 16.
	MaxContainersPerCard int
}

// Keys of the predicates-plugin arguments that configure mgpu scheduling
// (see the example scheduler configuration above), plus their defaults.
const (
	// Policy is the argument key for the card-level scoring policy.
	Policy = "predicate.MGPUPolicy"
	// MaxContainersNumPolicyValid is the argument key for MGPUConfig.MaxContainersNumPolicyValid.
	MaxContainersNumPolicyValid = "predicate.MGPUMaxContainersNumPolicyValid"
	// WeightOfCore is the argument key for the core weight.
	WeightOfCore = "predicate.MGPUWeightOfCore"
	// ScheduleMode is the argument key for the schedule mode.
	ScheduleMode = "predicate.MGPUScheduleMode"
	// MaxContainersPerCard is the argument key for the max containers per card.
	MaxContainersPerCard = "predicate.MGPUMaxContainersPerCard"
	// DefaultWeightOfCore is the default core weight.
	DefaultWeightOfCore = 20
	// DefaultMaxContainersNumPolicyValid is the default MaxContainersNumPolicyValid.
	DefaultMaxContainersNumPolicyValid = 5
	// DefaultMaxContainersPerCard is the default max containers per card.
	DefaultMaxContainersPerCard = 16
	// DefaultScheduleMode is the default schedule mode ("index").
	DefaultScheduleMode = "index"
)

// ModeType names a schedule mode (e.g. "index"/"id").
//
// NOTE(review): ModeType and PolicyType are declared but MGPUConfig's
// ScheduleMode and Policy fields are plain string — consider using these
// types for those fields, or removing the types.
type ModeType string

// PolicyType names a card-level scoring policy (e.g. "binpack"/"spread").
type PolicyType string

// Supported values for the card-level scoring policy (MGPUConfig.Policy).
const (
	// Binpack is the "binpack" card-level scoring policy value.
	Binpack = "binpack"
	// Spread is the "spread" card-level scoring policy value.
	Spread = "spread"
)

const (
	// GPUCoreEachCard is the core capacity of one physical card
	// (100, presumably percent of a whole card — confirm against usage).
	GPUCoreEachCard = 100
	// NotNeedGPU is a sentinel meaning no GPU is requested.
	NotNeedGPU = -1
	// NotNeedRate is a sentinel meaning no rate is requested.
	NotNeedRate = -2
	// NotNeedMultipleGPU is a sentinel meaning multiple-GPU mode is not requested.
	NotNeedMultipleGPU = -3
	// GPUTypeMGPU is the "mgpu" node resource type (see VKELabelNodeResourceType).
	GPUTypeMGPU = "mgpu"
	// GPUTypeNvidiaGPU is the "nvidia" node resource type (see VKELabelNodeResourceType).
	GPUTypeNvidiaGPU = "nvidia"
	// DefaultComputePolicy is the default mgpu compute policy ("fixed-share").
	DefaultComputePolicy = "fixed-share"
	// NativeBurstSharePolicy is the "native-burst-share" compute policy.
	NativeBurstSharePolicy = "native-burst-share"
	// DefaultGPUCount is the default number of GPU cards requested.
	DefaultGPUCount = 1

	// VKEAnnotationMGPUAssumed is the scheduled result for an mgpu pod.
	VKEAnnotationMGPUAssumed = "vke.volcengine.com/assumed"
	// VKEAnnotationMGPUContainer is the format for the gpu index of a
	// specific container; the %s placeholder is the container name.
	VKEAnnotationMGPUContainer = "vke.volcengine.com/gpu-index-container-%s"
	// VKELabelNodeResourceType is the label key of the node resource type;
	// the value is like nvidia/mgpu.
	VKELabelNodeResourceType = "vke.node.gpu.schedule"
	// VKEAnnotationMGPUComputePolicy is the annotation for the mgpu compute policy on vke.
	VKEAnnotationMGPUComputePolicy = "vke.volcengine.com/mgpu-compute-policy"
	// VKEAnnotationContainerMultipleGPU is the annotation for mgpu container multiple mode on vke.
	VKEAnnotationContainerMultipleGPU = "vke.volcengine.com/container-multiple-gpu"
	// VKEResourceMGPUCore is the mgpu-core resource name on vke.
	VKEResourceMGPUCore v1.ResourceName = "vke.volcengine.com/mgpu-core"
	// VKEResourceMGPUMemory is the mgpu-memory resource name on vke.
	VKEResourceMGPUMemory v1.ResourceName = "vke.volcengine.com/mgpu-memory"

	// DeviceName is the identifier this device plugin registers under.
	DeviceName = "mgpu"
)
Loading