Skip to content

Commit

Permalink
fix panic when the gpu is faulty
Browse files Browse the repository at this point in the history
Signed-off-by: guoqinwill <guoqinwill@163.com>
  • Loading branch information
guoqinwill committed Mar 18, 2024
1 parent f0d99bf commit 0f7626f
Show file tree
Hide file tree
Showing 2 changed files with 10 additions and 1 deletion.
9 changes: 9 additions & 0 deletions pkg/scheduler/api/resource_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,15 @@ func (r *Resource) Sub(rr *Resource) *Resource {
return r.sub(rr)
}

// WithoutAssertSub subtracts rr from r and returns the result of r.sub(rr).
// Before subtracting it checks rr against r via LessEqualWithResourcesName;
// when that check fails, the reported resource names are logged as an error
// and the subtraction is still carried out.
func (r *Resource) WithoutAssertSub(rr *Resource) *Resource {
	if sufficient, lacking := rr.LessEqualWithResourcesName(r, Zero); !sufficient {
		// Log only: the subtraction below proceeds even when the check fails.
		klog.Errorf("resources <%v> are not sufficient to do operation: <%v> sub <%v>", lacking, r, rr)
	}
	return r.sub(rr)
}

// sub subtracts two Resource objects.
func (r *Resource) sub(rr *Resource) *Resource {
r.MilliCPU -= rr.MilliCPU
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/plugins/overcommit/overcommit.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) {
for _, node := range ssn.Nodes {
used.Add(node.Used)
}
op.idleResource = op.totalResource.Clone().Multi(op.overCommitFactor).Sub(used)
op.idleResource = op.totalResource.Clone().Multi(op.overCommitFactor).WithoutAssertSub(used)

for _, job := range ssn.Jobs {
// calculate inqueue job resources
Expand Down

0 comments on commit 0f7626f

Please sign in to comment.