diff --git a/go.mod b/go.mod index 8219ec5be04..d801ffecc59 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module volcano.sh/volcano -go 1.20 +go 1.19 require ( github.com/agiledragon/gomonkey/v2 v2.1.0 diff --git a/pkg/scheduler/api/devices/nvidia/mgpu/constants.go b/pkg/scheduler/api/devices/nvidia/mgpu/constants.go index 100b95402a9..f00b94dc9c8 100644 --- a/pkg/scheduler/api/devices/nvidia/mgpu/constants.go +++ b/pkg/scheduler/api/devices/nvidia/mgpu/constants.go @@ -82,8 +82,8 @@ const ( NotNeedMultipleGPU = -3 // GPUTypeMGPU is GPUTypeMGPU GPUTypeMGPU = "mgpu" - // GPUTypePGPU is GPUTypePGPU - GPUTypePGPU = "nvidia" + // GPUTypeNvidiaGPU is GPUTypeNvidiaGPU + GPUTypeNvidiaGPU = "nvidia" // DefaultComputePolicy is DefaultComputePolicy DefaultComputePolicy = "fixed-share" // NativeBurstSharePolicy is NativeBurstSharePolicy diff --git a/pkg/scheduler/api/devices/nvidia/mgpu/device_info.go b/pkg/scheduler/api/devices/nvidia/mgpu/device_info.go index 5b00e09f5ef..78490d251c2 100644 --- a/pkg/scheduler/api/devices/nvidia/mgpu/device_info.go +++ b/pkg/scheduler/api/devices/nvidia/mgpu/device_info.go @@ -71,15 +71,18 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices { if node == nil { return nil } + value, ok := node.Labels[VKELabelNodeResourceType] + if !ok || value == GPUTypeNvidiaGPU { + return nil + } rater := getRater() - devices := decodeNodeDevices(name, node, rater) - return devices + return decodeNodeDevices(name, node, rater) } // AddResource adds the pod to GPU pool if it is assigned func (gs *GPUDevices) AddResource(pod *v1.Pod) { - if !isSharedMGPUPod(pod) { + if !isSharedMGPUPod(pod) || (pod.Status.Phase != v1.PodRunning) { return } klog.V(3).Infof("Start to add pod %s/%s", pod.Namespace, pod.Name) @@ -105,11 +108,10 @@ func (gs *GPUDevices) SubResource(pod *v1.Pod) { } klog.Infof("Start to forget pod %s/%s", pod.Namespace, pod.Name) podName := getPodNamespaceName(pod) - option := NewGPUOptionFromPod(pod, VKEResourceMGPUCore, VKEResourceMGPUMemory) - //option, ok := gs.Pod2OptionMap[podName] - //if !ok { - // return - //} + option, ok := gs.Pod2OptionMap[podName] + if !ok { + return + } if option.Allocated != nil && option.Allocated[0] == nil { return } @@ -117,9 +119,9 @@ func (gs *GPUDevices) SubResource(pod *v1.Pod) { klog.Infof("Cancel pod %s/%s option %+v on %+v", pod.Namespace, pod.Name, option, gs.GPUs.ToString()) } gs.GPUs.Cancel(pod, option) - //if klog.V(3).Enabled() { - klog.Infof("After Cancel, Current GPU allocation of node %s: %+v", gs.Name, gs.GPUs.ToString()) - //} + if klog.V(3).Enabled() { + klog.Infof("After Cancel, Current GPU allocation of node %s: %+v", gs.Name, gs.GPUs.ToString()) + } delete(gs.Pod2OptionMap, podName) } @@ -154,23 +156,8 @@ func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) err if klog.V(5).Enabled() { klog.Infof("GPU Devices: %+v\n", gs.GPUs.ToString()) } - var ( - req GPURequest - cmReq ContainerMultiGPURequest - ) - req, cmReq = NewGPURequest(pod, VKEResourceMGPUCore, VKEResourceMGPUMemory) - // Check container specific number whether exceed the node's GPU number - gpuCountList, _ := ExtraMultipleGPUCountList(pod) - if len(gpuCountList) > 0 { - for i, count := range gpuCountList { - if count > len(gs.GPUs) { - return fmt.Errorf("request multiple GPU count %d is exceed the allocatable GPU number, container index: %d", count, i+1) - } - } - } - // Achieve the option of GPU for pod's containers - option, err := gs.GPUs.Trade(gs.Rater, req, pod, cmReq) + option, err := tradeForResourceOption(pod, gs) if err != nil { return err } @@ -196,7 +183,11 @@ func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) erro podName := GetPodNamespaceName(pod) option, ok := gs.Pod2OptionMap[podName] if !ok { - return fmt.Errorf("") + // remove patched GPU annotations of the pod + if err := removePatchGPUInfo(kubeClient, pod); err != nil { + return fmt.Errorf("remove patched mgpu annotations failed: %v", err) + } + return fmt.Errorf("not found pod option from cache when ReleaseToPod") } if klog.V(3).Enabled() { klog.Infof("Cancel pod %s/%s option %+v on %+v", pod.Namespace, pod.Name, option, gs.GPUs.ToString()) @@ -289,7 +280,7 @@ func (g GPUs) Transact(pod *v1.Pod, option *GPUOption) error { // judge whether the only needed card can satisfy the request of container i if !g[option.Allocated[i][0]].CanAllocate(option.Request[i]) { g.cancelAdded(addedDict, pod, option) - klog.Errorf("Fail to trade option %+v on %+v because the GPU's residual memory or core can't satisfy the container", option, g.ToString()) + //klog.Errorf("Fail to trade option %+v on %+v because the GPU's residual memory or core can't satisfy the container", option, g.ToString()) return fmt.Errorf("can't trade option %+v on %+v because the GPU's residual memory or core can't satisfy the container", option, g) } g[option.Allocated[i][0]].Add(option.Request[i]) @@ -676,6 +667,19 @@ func patchGPUInfo(kubeClient kubernetes.Interface, pod *v1.Pod, option *GPUOptio return Patch(kubeClient, pod, podCopy) } +func removePatchGPUInfo(kubeClient kubernetes.Interface, pod *v1.Pod) error { + podCopy := pod.DeepCopy() + if podCopy.Annotations == nil { + podCopy.Annotations = make(map[string]string) + } + for _, c := range podCopy.Spec.Containers { + delete(podCopy.Annotations, fmt.Sprintf(VKEAnnotationMGPUContainer, c.Name)) + } + delete(podCopy.Annotations, VKEAnnotationMGPUAssumed) + + return Patch(kubeClient, pod, podCopy) +} + // IsMultipleGPUPod return true when there is one pod required mgpu-core, mgpu-mem and multiple gpu count. func IsMultipleGPUPod(pod *v1.Pod) bool { gpuCountList, err := ExtraMultipleGPUCountList(pod) diff --git a/pkg/scheduler/api/devices/nvidia/mgpu/utils.go b/pkg/scheduler/api/devices/nvidia/mgpu/utils.go index 98d3f483f80..2fd9e0cf89c 100644 --- a/pkg/scheduler/api/devices/nvidia/mgpu/utils.go +++ b/pkg/scheduler/api/devices/nvidia/mgpu/utils.go @@ -48,17 +48,17 @@ func checkMGPUResourceInPod(pod *v1.Pod) bool { } func getRater() Rater { - //var rater Rater - //switch GlobalConfig.Policy { - //case Binpack: - // rater = &GPUBinpack{} - //case Spread: - // rater = &GPUSpread{} - //default: - // klog.Errorf("priority algorithm is not supported: %s", GlobalConfig.Policy) - // return nil - //} - return &GPUBinpack{} + var rater Rater + switch GlobalConfig.Policy { + case Binpack: + rater = &GPUBinpack{} + case Spread: + rater = &GPUSpread{} + default: + klog.Errorf("priority algorithm is not supported: %s", GlobalConfig.Policy) + return nil + } + return rater } func decodeNodeDevices(name string, node *v1.Node, rater Rater) *GPUDevices { @@ -102,7 +102,7 @@ func decodeNodeDevices(name string, node *v1.Node, rater Rater) *GPUDevices { }) } default: - klog.Errorf("invalid resource value of %s", VKELabelNodeResourceType) + klog.Errorf("invalid resource value of %s on node %s", VKELabelNodeResourceType, node.Name) return nil } } @@ -123,17 +123,30 @@ func checkNodeMGPUSharingPredicate(pod *v1.Pod, gssnap *GPUDevices) (bool, error if !isFullCardGPUPod(pod) && getPodComputePolicy(pod) != gs.NodePolicy { return false, fmt.Errorf("compute policy not match normal mgpu") } + _, err := tradeForResourceOption(pod, gs) + if err != nil { + return false, err + } + + return true, nil +} + +func tradeForResourceOption(pod *v1.Pod, gs *GPUDevices) (option *GPUOption, err error) { + var ( + req GPURequest + cmReq ContainerMultiGPURequest + ) + req, cmReq = NewGPURequest(pod, VKEResourceMGPUCore, VKEResourceMGPUMemory) // Check container specific number whether exceed the node's GPU number gpuCountList, _ := ExtraMultipleGPUCountList(pod) if len(gpuCountList) > 0 { for i, count := range gpuCountList { if count > len(gs.GPUs) { - return false, fmt.Errorf("request multiple GPU count %d is exceed the allocatable GPU number, container index: %d", count, i+1) + return nil, fmt.Errorf("request multiple GPU count %d is exceed the allocatable GPU number, container index: %d", count, i+1) } } } - - return true, nil + return gs.GPUs.Trade(gs.Rater, req, pod, cmReq) } func getPodNamespaceName(pod *v1.Pod) string {