From 2832dfe1229fd115742f7cf66c023b2d26407929 Mon Sep 17 00:00:00 2001 From: guoqin Date: Wed, 27 Mar 2024 15:07:32 +0800 Subject: [PATCH] fix podgroup retry Signed-off-by: guoqin Signed-off-by: g00673948 --- pkg/controllers/cache/cache.go | 1 + pkg/scheduler/api/job_info.go | 4 +++- pkg/scheduler/cache/cache.go | 17 ++++++++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/pkg/controllers/cache/cache.go b/pkg/controllers/cache/cache.go index 7e8819b7c5..e3ff8e3b9a 100644 --- a/pkg/controllers/cache/cache.go +++ b/pkg/controllers/cache/cache.go @@ -29,6 +29,7 @@ import ( "k8s.io/klog/v2" "volcano.sh/apis/pkg/apis/batch/v1alpha1" + "volcano.sh/volcano/pkg/controllers/apis" ) diff --git a/pkg/scheduler/api/job_info.go b/pkg/scheduler/api/job_info.go index 7436aba99b..be12040780 100644 --- a/pkg/scheduler/api/job_info.go +++ b/pkg/scheduler/api/job_info.go @@ -296,7 +296,8 @@ type NodeResourceMap map[string]*Resource // JobInfo will have all info of a Job type JobInfo struct { - UID JobID + UID JobID + PgUID types.UID Name string Namespace string @@ -396,6 +397,7 @@ func (ji *JobInfo) SetPodGroup(pg *PodGroup) { } ji.TaskMinAvailableTotal = taskMinAvailableTotal + ji.PgUID = pg.UID ji.PodGroup = pg } diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go index 9b29639020..ec6d1de8eb 100644 --- a/pkg/scheduler/cache/cache.go +++ b/pkg/scheduler/cache/cache.go @@ -1027,9 +1027,20 @@ func (sc *SchedulerCache) processCleanupJob() { defer sc.Mutex.Unlock() if schedulingapi.JobTerminated(job) { - delete(sc.Jobs, job.UID) - metrics.DeleteJobMetrics(job.Name, string(job.Queue), job.Namespace) - klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name) + oldJob, found := sc.Jobs[job.UID] + if !found { + klog.V(3).Infof("Failed to find Job <%v:%v/%v>, ignore it", job.UID, job.Namespace, job.Name) + sc.DeletedJobs.Forget(obj) + return + } + newPgVersion := oldJob.PgUID + oldPgVersion := job.PgUID + klog.V(5).Infof("Just add pguid:%v, try to delete pguid:%v", newPgVersion, oldPgVersion) + if oldPgVersion == newPgVersion { + delete(sc.Jobs, job.UID) + metrics.DeleteJobMetrics(job.Name, string(job.Queue), job.Namespace) + klog.V(3).Infof("Job <%v:%v/%v> was deleted.", job.UID, job.Namespace, job.Name) + } sc.DeletedJobs.Forget(obj) } else { // Retry