From 0f7626f3de6317ae6e639a5e377a7d9b8535d87a Mon Sep 17 00:00:00 2001
From: guoqinwill
Date: Fri, 15 Mar 2024 17:07:34 +0800
Subject: [PATCH] fix panic when the gpu is faulty

Signed-off-by: guoqinwill
---
 pkg/scheduler/api/resource_info.go             | 9 +++++++++
 pkg/scheduler/plugins/overcommit/overcommit.go | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/pkg/scheduler/api/resource_info.go b/pkg/scheduler/api/resource_info.go
index e8e64a42313..fa26f93ffc7 100644
--- a/pkg/scheduler/api/resource_info.go
+++ b/pkg/scheduler/api/resource_info.go
@@ -260,6 +260,15 @@ func (r *Resource) Sub(rr *Resource) *Resource {
 	return r.sub(rr)
 }
 
+// WithoutAssertSub subtracts rr from r without asserting sufficiency: if rr is not less-equal to r, the offending resource names are logged via klog.Errorf and the subtraction is still performed.
+func (r *Resource) WithoutAssertSub(rr *Resource) *Resource {
+	ok, resources := rr.LessEqualWithResourcesName(r, Zero)
+	if !ok {
+		klog.Errorf("resources <%v> are not sufficient to do operation: <%v> sub <%v>", resources, r, rr)
+	}
+	return r.sub(rr)
+}
+
 // sub subtracts two Resource objects.
 func (r *Resource) sub(rr *Resource) *Resource {
 	r.MilliCPU -= rr.MilliCPU
diff --git a/pkg/scheduler/plugins/overcommit/overcommit.go b/pkg/scheduler/plugins/overcommit/overcommit.go
index bcaf2747641..407a2154249 100644
--- a/pkg/scheduler/plugins/overcommit/overcommit.go
+++ b/pkg/scheduler/plugins/overcommit/overcommit.go
@@ -88,7 +88,7 @@ func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) {
 	for _, node := range ssn.Nodes {
 		used.Add(node.Used)
 	}
-	op.idleResource = op.totalResource.Clone().Multi(op.overCommitFactor).Sub(used)
+	op.idleResource = op.totalResource.Clone().Multi(op.overCommitFactor).WithoutAssertSub(used)
 
 	for _, job := range ssn.Jobs {
 		// calculate inqueue job resources