Skip to content

Commit

Permalink
fix podgroup retry
Browse files (browse the repository at this point in the history)
Signed-off-by: guoqin <gq411will@163.com>
Signed-off-by: g00673948 <guoqin10@huawei.com>
  • Loading branch information
guoqinwill committed Mar 29, 2024
1 parent 339314e commit 2832dfe
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 4 deletions.
1 change: 1 addition & 0 deletions pkg/controllers/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
"k8s.io/klog/v2"

"volcano.sh/apis/pkg/apis/batch/v1alpha1"

"volcano.sh/volcano/pkg/controllers/apis"
)

Expand Down
4 changes: 3 additions & 1 deletion pkg/scheduler/api/job_info.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,7 +296,8 @@ type NodeResourceMap map[string]*Resource

// JobInfo will have all info of a Job
type JobInfo struct {
UID JobID
UID JobID
PgUID types.UID

Name string
Namespace string
Expand Down Expand Up @@ -396,6 +397,7 @@ func (ji *JobInfo) SetPodGroup(pg *PodGroup) {
}
ji.TaskMinAvailableTotal = taskMinAvailableTotal

ji.PgUID = pg.UID
ji.PodGroup = pg
}

Expand Down
17 changes: 14 additions & 3 deletions pkg/scheduler/cache/cache.go
Original file line number Diff line number Diff line change
Expand Up @@ -1027,9 +1027,20 @@ func (sc *SchedulerCache) processCleanupJob() {
defer sc.Mutex.Unlock()

if schedulingapi.JobTerminated(job) {
delete(sc.Jobs, job.UID)
metrics.DeleteJobMetrics(job.Name, string(job.Queue), job.Namespace)
klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name)
oldJob, found := sc.Jobs[job.UID]
if !found {
klog.V(3).Infof("Failed to find Job <%v:%v/%v>, ignore it", job.UID, job.Namespace, job.Name)
sc.DeletedJobs.Forget(obj)
return
}
newPgVersion := oldJob.PgUID
oldPgVersion := job.PgUID
klog.V(5).Infof("Just add pguid:%v, try to delete pguid:%v", newPgVersion, oldPgVersion)
if oldPgVersion == newPgVersion {
delete(sc.Jobs, job.UID)
metrics.DeleteJobMetrics(job.Name, string(job.Queue), job.Namespace)
klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name)
}
sc.DeletedJobs.Forget(obj)
} else {
// Retry
Expand Down

0 comments on commit 2832dfe

Please sign in to comment.