Skip to content

Commit

Permalink
fix panic when the gpu is faulty
Browse files Browse the repository at this point in the history
Signed-off-by: guoqinwill <guoqinwill@163.com>
  • Loading branch information
guoqinwill committed Mar 18, 2024
1 parent f0d99bf commit c2c4ef8
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 1 deletion.
8 changes: 8 additions & 0 deletions pkg/scheduler/api/resource_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,6 +260,14 @@ func (r *Resource) Sub(rr *Resource) *Resource {
return r.sub(rr)
}

// WithoutAssertSub subtracts rr from r without asserting that r is sufficient.
//
// When rr.LessEqual(r, Zero) reports false, the shortfall is logged through
// klog.Errorf and the subtraction is still performed. The value returned is
// whatever the single call to sub returns.
func (r *Resource) WithoutAssertSub(rr *Resource) *Resource {
	if rr.LessEqual(r, Zero) {
		return r.sub(rr)
	}

	// sub modifies its receiver, so it must run exactly once. Calling it both
	// as a log argument and again for the return value would subtract rr twice.
	// Snapshot the minuend first so the log shows the pre-subtraction value.
	// NOTE(review): assumes Clone returns an independent copy of r — confirm.
	before := r.Clone()
	result := r.sub(rr)
	klog.Errorf("resource is not sufficient to do operation: <%v> sub <%v> , the subtraction result is <%v>", before, rr, result)
	return result
}

// sub subtracts two Resource objects.
func (r *Resource) sub(rr *Resource) *Resource {
r.MilliCPU -= rr.MilliCPU
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/plugins/overcommit/overcommit.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) {
for _, node := range ssn.Nodes {
used.Add(node.Used)
}
op.idleResource = op.totalResource.Clone().Multi(op.overCommitFactor).Sub(used)
op.idleResource = op.totalResource.Clone().Multi(op.overCommitFactor).WithoutAssertSub(used)

for _, job := range ssn.Jobs {
// calculate inqueue job resources
Expand Down

0 comments on commit c2c4ef8

Please sign in to comment.