fix Filter and Allocate in mgpu device

Signed-off-by: jiangchenxi.530 <jiangchenxi.530@bytedance.com>
volcano-sh · Oct 16, 2023 · b8fa22b · b8fa22b
1 parent 4698ac2
commit b8fa22b
Show file tree

Hide file tree

Showing 4 changed files with 64 additions and 47 deletions.
diff --git a/go.mod b/go.mod
@@ -1,6 +1,6 @@
 module volcano.sh/volcano
 
-go 1.20
+go 1.19
 
 require (
 	github.com/agiledragon/gomonkey/v2 v2.1.0

diff --git a/pkg/scheduler/api/devices/nvidia/mgpu/constants.go b/pkg/scheduler/api/devices/nvidia/mgpu/constants.go
@@ -82,8 +82,8 @@ const (
 	NotNeedMultipleGPU = -3
 	// GPUTypeMGPU is GPUTypeMGPU
 	GPUTypeMGPU = "mgpu"
-	// GPUTypePGPU is GPUTypePGPU
-	GPUTypePGPU = "nvidia"
+	// GPUTypeNvidiaGPU is GPUTypeNvidiaGPU
+	GPUTypeNvidiaGPU = "nvidia"
 	// DefaultComputePolicy is DefaultComputePolicy
 	DefaultComputePolicy = "fixed-share"
 	// NativeBurstSharePolicy is NativeBurstSharePolicy

diff --git a/pkg/scheduler/api/devices/nvidia/mgpu/device_info.go b/pkg/scheduler/api/devices/nvidia/mgpu/device_info.go
@@ -71,15 +71,18 @@ func NewGPUDevices(name string, node *v1.Node) *GPUDevices {
 	if node == nil {
 		return nil
 	}
+	value, ok := node.Labels[VKELabelNodeResourceType]
+	if !ok || value == GPUTypeNvidiaGPU {
+		return nil
+	}
 	rater := getRater()
-	devices := decodeNodeDevices(name, node, rater)
 
-	return devices
+	return decodeNodeDevices(name, node, rater)
 }
 
 // AddResource adds the pod to GPU pool if it is assigned
 func (gs *GPUDevices) AddResource(pod *v1.Pod) {
-	if !isSharedMGPUPod(pod) {
+	if !isSharedMGPUPod(pod) || (pod.Status.Phase != v1.PodRunning) {
 		return
 	}
 	klog.V(3).Infof("Start to add pod %s/%s", pod.Namespace, pod.Name)
@@ -105,21 +108,20 @@ func (gs *GPUDevices) SubResource(pod *v1.Pod) {
 	}
 	klog.Infof("Start to forget pod %s/%s", pod.Namespace, pod.Name)
 	podName := getPodNamespaceName(pod)
-	option := NewGPUOptionFromPod(pod, VKEResourceMGPUCore, VKEResourceMGPUMemory)
-	//option, ok := gs.Pod2OptionMap[podName]
-	//if !ok {
-	//	return
-	//}
+	option, ok := gs.Pod2OptionMap[podName]
+	if !ok {
+		return
+	}
 	if option.Allocated != nil && option.Allocated[0] == nil {
 		return
 	}
 	if klog.V(3).Enabled() {
 		klog.Infof("Cancel pod %s/%s option %+v on %+v", pod.Namespace, pod.Name, option, gs.GPUs.ToString())
 	}
 	gs.GPUs.Cancel(pod, option)
-	//if klog.V(3).Enabled() {
-	klog.Infof("After Cancel, Current GPU allocation of node %s: %+v", gs.Name, gs.GPUs.ToString())
-	//}
+	if klog.V(3).Enabled() {
+		klog.Infof("After Cancel, Current GPU allocation of node %s: %+v", gs.Name, gs.GPUs.ToString())
+	}
 	delete(gs.Pod2OptionMap, podName)
 }
 
@@ -154,23 +156,8 @@ func (gs *GPUDevices) Allocate(kubeClient kubernetes.Interface, pod *v1.Pod) err
 		if klog.V(5).Enabled() {
 			klog.Infof("GPU Devices: %+v\n", gs.GPUs.ToString())
 		}
-		var (
-			req   GPURequest
-			cmReq ContainerMultiGPURequest
-		)
-		req, cmReq = NewGPURequest(pod, VKEResourceMGPUCore, VKEResourceMGPUMemory)
-		// Check container specific number whether exceed the node's GPU number
-		gpuCountList, _ := ExtraMultipleGPUCountList(pod)
-		if len(gpuCountList) > 0 {
-			for i, count := range gpuCountList {
-				if count > len(gs.GPUs) {
-					return fmt.Errorf("request multiple GPU count %d is exceed the allocatable GPU number, container index: %d", count, i+1)
-				}
-			}
-		}
-
 		// Achieve the option of GPU for pod's containers
-		option, err := gs.GPUs.Trade(gs.Rater, req, pod, cmReq)
+		option, err := tradeForResourceOption(pod, gs)
 		if err != nil {
 			return err
 		}
@@ -196,7 +183,11 @@ func (gs *GPUDevices) Release(kubeClient kubernetes.Interface, pod *v1.Pod) erro
 	podName := GetPodNamespaceName(pod)
 	option, ok := gs.Pod2OptionMap[podName]
 	if !ok {
-		return fmt.Errorf("")
+		// remove patched GPU annotations of the pod
+		if err := removePatchGPUInfo(kubeClient, pod); err != nil {
+			return fmt.Errorf("remove patched mgpu annotations failed: %v", err)
+		}
+		return fmt.Errorf("not found pod option from cache when ReleaseToPod")
 	}
 	if klog.V(3).Enabled() {
 		klog.Infof("Cancel pod %s/%s option %+v on %+v", pod.Namespace, pod.Name, option, gs.GPUs.ToString())
@@ -289,7 +280,7 @@ func (g GPUs) Transact(pod *v1.Pod, option *GPUOption) error {
 					// judge whether the only needed card can satisfy the request of container i
 					if !g[option.Allocated[i][0]].CanAllocate(option.Request[i]) {
 						g.cancelAdded(addedDict, pod, option)
-						klog.Errorf("Fail to trade option %+v on %+v because the GPU's residual memory or core can't satisfy the container", option, g.ToString())
+						//klog.Errorf("Fail to trade option %+v on %+v because the GPU's residual memory or core can't satisfy the container", option, g.ToString())
 						return fmt.Errorf("can't trade option %+v on %+v because the GPU's residual memory or core can't satisfy the container", option, g)
 					}
 					g[option.Allocated[i][0]].Add(option.Request[i])
@@ -676,6 +667,19 @@ func patchGPUInfo(kubeClient kubernetes.Interface, pod *v1.Pod, option *GPUOptio
 	return Patch(kubeClient, pod, podCopy)
 }
 
+func removePatchGPUInfo(kubeClient kubernetes.Interface, pod *v1.Pod) error {
+	podCopy := pod.DeepCopy()
+	if podCopy.Annotations == nil {
+		podCopy.Annotations = make(map[string]string)
+	}
+	for _, c := range podCopy.Spec.Containers {
+		delete(podCopy.Annotations, fmt.Sprintf(VKEAnnotationMGPUContainer, c.Name))
+	}
+	delete(podCopy.Annotations, VKEAnnotationMGPUAssumed)
+
+	return Patch(kubeClient, pod, podCopy)
+}
+
 // IsMultipleGPUPod return true when there is one pod required mgpu-core, mgpu-mem and multiple gpu count.
 func IsMultipleGPUPod(pod *v1.Pod) bool {
 	gpuCountList, err := ExtraMultipleGPUCountList(pod)

diff --git a/pkg/scheduler/api/devices/nvidia/mgpu/utils.go b/pkg/scheduler/api/devices/nvidia/mgpu/utils.go
@@ -48,17 +48,17 @@ func checkMGPUResourceInPod(pod *v1.Pod) bool {
 }
 
 func getRater() Rater {
-	//var rater Rater
-	//switch GlobalConfig.Policy {
-	//case Binpack:
-	//	rater = &GPUBinpack{}
-	//case Spread:
-	//	rater = &GPUSpread{}
-	//default:
-	//	klog.Errorf("priority algorithm is not supported: %s", GlobalConfig.Policy)
-	//	return nil
-	//}
-	return &GPUBinpack{}
+	var rater Rater
+	switch GlobalConfig.Policy {
+	case Binpack:
+		rater = &GPUBinpack{}
+	case Spread:
+		rater = &GPUSpread{}
+	default:
+		klog.Errorf("priority algorithm is not supported: %s", GlobalConfig.Policy)
+		return nil
+	}
+	return rater
 }
 
 func decodeNodeDevices(name string, node *v1.Node, rater Rater) *GPUDevices {
@@ -102,7 +102,7 @@ func decodeNodeDevices(name string, node *v1.Node, rater Rater) *GPUDevices {
 				})
 			}
 		default:
-			klog.Errorf("invalid resource value of %s", VKELabelNodeResourceType)
+			klog.Errorf("invalid resource value of %s on node %s", VKELabelNodeResourceType, node.Name)
 			return nil
 		}
 	}
@@ -123,17 +123,30 @@ func checkNodeMGPUSharingPredicate(pod *v1.Pod, gssnap *GPUDevices) (bool, error
 	if !isFullCardGPUPod(pod) && getPodComputePolicy(pod) != gs.NodePolicy {
 		return false, fmt.Errorf("compute policy not match normal mgpu")
 	}
+	_, err := tradeForResourceOption(pod, gs)
+	if err != nil {
+		return false, err
+	}
+
+	return true, nil
+}
+
+func tradeForResourceOption(pod *v1.Pod, gs *GPUDevices) (option *GPUOption, err error) {
+	var (
+		req   GPURequest
+		cmReq ContainerMultiGPURequest
+	)
+	req, cmReq = NewGPURequest(pod, VKEResourceMGPUCore, VKEResourceMGPUMemory)
 	// Check container specific number whether exceed the node's GPU number
 	gpuCountList, _ := ExtraMultipleGPUCountList(pod)
 	if len(gpuCountList) > 0 {
 		for i, count := range gpuCountList {
 			if count > len(gs.GPUs) {
-				return false, fmt.Errorf("request multiple GPU count %d is exceed the allocatable GPU number, container index: %d", count, i+1)
+				return nil, fmt.Errorf("request multiple GPU count %d is exceed the allocatable GPU number, container index: %d", count, i+1)
 			}
 		}
 	}
-
-	return true, nil
+	return gs.GPUs.Trade(gs.Rater, req, pod, cmReq)
 }
 
 func getPodNamespaceName(pod *v1.Pod) string {