[cherry-pick] cherry pick recently fix to v1.5 #2057

Merged 2 commits on Mar 7, 2022.
pkg/controllers/podgroup/pg_controller_handler.go: 3 additions & 0 deletions
@@ -94,6 +94,9 @@ func (pg *pgcontroller) createNormalPodPGIfNotExist(pod *v1.Pod) error {
			PriorityClassName: pod.Spec.PriorityClassName,
			MinResources: calcPGMinResources(pod),
		},
+		Status: scheduling.PodGroupStatus{
+			Phase: scheduling.PodGroupPending,
+		},
	}
	if queueName, ok := pod.Annotations[scheduling.QueueNameAnnotationKey]; ok {
		obj.Spec.Queue = queueName
pkg/scheduler/plugins/numaaware/numaaware.go: 3 additions & 2 deletions
@@ -231,15 +231,16 @@ func getNodeNumaNumForTask(nodeInfo []*api.NodeInfo, resAssignMap map[string]api
	workqueue.ParallelizeUntil(context.TODO(), 16, len(nodeInfo), func(index int) {
		node := nodeInfo[index]
		assignCpus := resAssignMap[node.Name][string(v1.ResourceCPU)]
+
		mx.Lock()
		defer mx.Unlock()
-		nodeNumaNumMap[node.Name] = int64(getNumaNodeCntForcpuID(assignCpus, node.NumaSchedulerInfo.CPUDetail))
+		nodeNumaNumMap[node.Name] = int64(getNumaNodeCntForCPUID(assignCpus, node.NumaSchedulerInfo.CPUDetail))
	})

	return nodeNumaNumMap
}

-func getNumaNodeCntForcpuID(cpus cpuset.CPUSet, cpuDetails topology.CPUDetails) int {
+func getNumaNodeCntForCPUID(cpus cpuset.CPUSet, cpuDetails topology.CPUDetails) int {
	mask, _ := bitmask.NewBitMask()
	s := cpus.ToSlice()

pkg/scheduler/plugins/overcommit/overcommit.go: 12 additions & 1 deletion
@@ -88,10 +88,21 @@ func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) {
	}
	op.idleResource = total.Clone().Multi(op.overCommitFactor).Sub(used)

-	// calculate inqueue job resources
	for _, job := range ssn.Jobs {
+		// calculate inqueue job resources
		if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue && job.PodGroup.Spec.MinResources != nil {
			op.inqueueResource.Add(api.NewResource(*job.PodGroup.Spec.MinResources))
+			continue
		}
+		// calculate inqueue resource for running jobs
+		// the check 'job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember' covers cases such as the following:
+		// a Spark job whose driver pod has completed while the podgroup stays Running; without the check, the already-allocated resources would be reserved again.
+		if job.PodGroup.Status.Phase == scheduling.PodGroupRunning &&
+			job.PodGroup.Spec.MinResources != nil &&
+			job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember {
+			allocated := util.GetAllocatedResource(job)
+			inqueued := util.GetInqueueResource(job, allocated)
+			op.inqueueResource.Add(inqueued)
+		}
	}

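To make the intent of the new branch concrete, here is a minimal standalone sketch in plain Go. The type, function names, and figures are hypothetical and do not use the Volcano api/scheduling types; it only illustrates the decision the plugin now makes: an Inqueue podgroup reserves its full MinResources, while a Running podgroup reserves only the unallocated remainder, and only when at least MinMember pods are still running.

package main

import "fmt"

// podGroup is a hypothetical, stripped-down stand-in for the fields the plugin inspects.
type podGroup struct {
	phase     string  // scheduling phase: "Inqueue" or "Running"
	minMember int32   // PodGroupSpec.MinMember
	running   int32   // PodGroupStatus.Running
	minCPU    float64 // MinResources, reduced to a single CPU figure for brevity
}

// inqueueShare mirrors the branch added above: an Inqueue podgroup reserves its full
// MinResources, while a Running podgroup that still has at least MinMember pods running
// reserves only the part of MinResources that is not yet allocated (never negative).
func inqueueShare(pg podGroup, allocatedCPU float64) float64 {
	switch {
	case pg.phase == "Inqueue":
		return pg.minCPU
	case pg.phase == "Running" && pg.running >= pg.minMember:
		if reserved := pg.minCPU - allocatedCPU; reserved > 0 {
			return reserved
		}
	}
	return 0
}

func main() {
	// A Running podgroup that still meets MinMember: reserve the CPU not yet allocated.
	active := podGroup{phase: "Running", minMember: 3, running: 3, minCPU: 3}
	fmt.Println(inqueueShare(active, 2)) // 1

	// A Spark-style podgroup whose driver pod completed: Running drops below MinMember,
	// so nothing is reserved for it again.
	sparkDone := podGroup{phase: "Running", minMember: 3, running: 2, minCPU: 3}
	fmt.Println(inqueueShare(sparkDone, 2)) // 0
}

The sketch prints 1 and then 0: the podgroup that still meets MinMember reserves its missing CPU, while the Spark-style podgroup whose driver finished is skipped.
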
pkg/scheduler/plugins/proportion/proportion.go: 11 additions & 0 deletions
@@ -140,6 +140,17 @@ func (pp *proportionPlugin) OnSessionOpen(ssn *framework.Session) {
		if job.PodGroup.Status.Phase == scheduling.PodGroupInqueue {
			attr.inqueue.Add(job.GetMinResources())
		}
+
+		// calculate inqueue resource for running jobs
+		// the check 'job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember' covers cases such as the following:
+		// a Spark job whose driver pod has completed while the podgroup stays Running; without the check, the already-allocated resources would be reserved again.
+		if job.PodGroup.Status.Phase == scheduling.PodGroupRunning &&
+			job.PodGroup.Spec.MinResources != nil &&
+			job.PodGroup.Status.Running >= job.PodGroup.Spec.MinMember {
+			allocated := util.GetAllocatedResource(job)
+			inqueued := util.GetInqueueResource(job, allocated)
+			attr.inqueue.Add(inqueued)
+		}
	}

	// Record metrics
pkg/scheduler/plugins/util/util.go: 45 additions & 0 deletions
@@ -298,3 +298,48 @@ func NormalizeScore(maxPriority int64, reverse bool, scores map[string]int64) {
		scores[key] = score
	}
}
+
+// GetAllocatedResource returns the resource already allocated to the tasks of the given job
+func GetAllocatedResource(job *api.JobInfo) *api.Resource {
+	allocated := &api.Resource{}
+	for status, tasks := range job.TaskStatusIndex {
+		if api.AllocatedStatus(status) {
+			for _, t := range tasks {
+				allocated.Add(t.Resreq)
+			}
+		}
+	}
+	return allocated
+}
+
+// GetInqueueResource returns the resource still to be reserved for a running job whose pods have not all been allocated resources yet
+func GetInqueueResource(job *api.JobInfo, allocated *api.Resource) *api.Resource {
+	inqueue := &api.Resource{}
+	for rName, rQuantity := range *job.PodGroup.Spec.MinResources {
+		switch rName {
+		case v1.ResourceCPU:
+			reservedCPU := float64(rQuantity.Value()) - allocated.MilliCPU
+			if reservedCPU > 0 {
+				inqueue.MilliCPU = reservedCPU
+			}
+		case v1.ResourceMemory:
+			reservedMemory := float64(rQuantity.Value()) - allocated.Memory
+			if reservedMemory > 0 {
+				inqueue.Memory = reservedMemory
+			}
+		default:
+			if inqueue.ScalarResources == nil {
+				inqueue.ScalarResources = make(map[v1.ResourceName]float64)
+			}
+			if allocatedMount, ok := allocated.ScalarResources[rName]; !ok {
+				inqueue.ScalarResources[rName] = float64(rQuantity.Value())
+			} else {
+				reservedScalarRes := float64(rQuantity.Value()) - allocatedMount
+				if reservedScalarRes > 0 {
+					inqueue.ScalarResources[rName] = reservedScalarRes
+				}
+			}
+		}
+	}
+	return inqueue
+}
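For orientation, here is a minimal standalone sketch of the per-resource arithmetic GetInqueueResource performs. The helper name and the figures are hypothetical, and plain Go maps stand in for the api.Resource type: each quantity in MinResources is reduced by what GetAllocatedResource already counted, and only positive remainders are kept.

package main

import "fmt"

// reserveRemainder keeps, for every resource named in minRes, the part that the already-allocated
// amount does not cover; non-positive remainders are dropped. This mirrors the clamp-at-zero
// subtraction GetInqueueResource does per resource name (CPU, memory, and scalar resources).
func reserveRemainder(minRes, allocated map[string]float64) map[string]float64 {
	inqueue := map[string]float64{}
	for name, want := range minRes {
		if remaining := want - allocated[name]; remaining > 0 {
			inqueue[name] = remaining
		}
	}
	return inqueue
}

func main() {
	// Hypothetical figures: a podgroup asking for 4 CPUs, 8Gi of memory and 2 GPUs,
	// with part of that already allocated to its running tasks.
	minRes := map[string]float64{"cpu": 4, "memory": 8 << 30, "nvidia.com/gpu": 2}
	allocated := map[string]float64{"cpu": 2.5, "memory": 8 << 30, "nvidia.com/gpu": 1}
	fmt.Println(reserveRemainder(minRes, allocated)) // cpu:1.5 and nvidia.com/gpu:1; memory is fully covered
}

The real helpers apply the same idea to the api.Resource fields (MilliCPU, Memory, ScalarResources), with the requested quantities taken from PodGroup.Spec.MinResources.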