Skip to content

Commit

Permalink
Fix panic when the GPU is faulty
Browse files Browse the repository at this point in the history
Signed-off-by: guoqinwill <guoqinwill@163.com>
  • Loading branch information
guoqinwill committed Mar 20, 2024
1 parent 0f7626f commit 42de699
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 3 deletions.
5 changes: 3 additions & 2 deletions pkg/scheduler/api/resource_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -260,8 +260,9 @@ func (r *Resource) Sub(rr *Resource) *Resource {
return r.sub(rr)
}

// WithoutAssertSub subtracts two Resource objects without assertion.
func (r *Resource) WithoutAssertSub(rr *Resource) *Resource {
// SubWithoutAssert subtracts two Resource objects without assertion.
// It was added because some resource subtractions allow negative results, while others do not.
func (r *Resource) SubWithoutAssert(rr *Resource) *Resource {
ok, resources := rr.LessEqualWithResourcesName(r, Zero)
if !ok {
klog.Errorf("resources <%v> are not sufficient to do operation: <%v> sub <%v>", resources, r, rr)
Expand Down
2 changes: 1 addition & 1 deletion pkg/scheduler/plugins/overcommit/overcommit.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ func (op *overcommitPlugin) OnSessionOpen(ssn *framework.Session) {
for _, node := range ssn.Nodes {
used.Add(node.Used)
}
op.idleResource = op.totalResource.Clone().Multi(op.overCommitFactor).WithoutAssertSub(used)
op.idleResource = op.totalResource.Clone().Multi(op.overCommitFactor).SubWithoutAssert(used)

for _, job := range ssn.Jobs {
// calculate inqueue job resources
Expand Down

0 comments on commit 42de699

Please sign in to comment.