From 6ab70aa42d3af5badbbd1b987f8438a8821426f6 Mon Sep 17 00:00:00 2001
From: cheerfun
Date: Mon, 5 Aug 2024 19:36:13 +0800
Subject: [PATCH] fix: Can not sync job status correctly when upgrading from v1.5

v1.5 changed the naming logics of pod group by adding UID into the name: #2140, and there is also another fix regarding handling the already created pod group without UID in create or update: #2400. But a similar fix does not exist in the syncJob function.

Fixes #3640
---
 pkg/controllers/job/helpers/helpers.go | 10 ++++++
 pkg/controllers/job/job_controller_actions.go | 33 ++++++++++++++-----
 2 files changed, 34 insertions(+), 9 deletions(-)

diff --git a/pkg/controllers/job/helpers/helpers.go b/pkg/controllers/job/helpers/helpers.go
index 9f0168ae51..6b9549a7a4 100644
--- a/pkg/controllers/job/helpers/helpers.go
+++ b/pkg/controllers/job/helpers/helpers.go
@@ -146,3 +146,13 @@ func GetPodsNameUnderTask(taskName string, job *batch.Job) []string {
 	}
 	return res
 }
+
+// GetRelatedPodGroupName returns the name of podgroup related to a job
+func GetRelatedPodGroupName(job *batch.Job) string {
+	return fmt.Sprintf("%s-%s", job.Name, string(job.UID))
+}
+
+// GetRelatedPodGroupLegacyName returns the legacy name of podgroup related to a job
+func GetRelatedPodGroupLegacyName(job *batch.Job) string {
+	return job.Name
+}
diff --git a/pkg/controllers/job/job_controller_actions.go b/pkg/controllers/job/job_controller_actions.go
index 0c7ffd8c14..190ac4fcdc 100644
--- a/pkg/controllers/job/job_controller_actions.go
+++ b/pkg/controllers/job/job_controller_actions.go
@@ -152,11 +152,18 @@ func (cc *jobcontroller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.Pha
 	}
 
 	// Delete PodGroup
-	pgName := job.Name + "-" + string(job.UID)
-	if err := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Delete(context.TODO(), pgName, metav1.DeleteOptions{}); err != nil {
+	pgName := jobhelpers.GetRelatedPodGroupName(job)
+	pgIface := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace)
+	if err := pgIface.Delete(context.TODO(), pgName, metav1.DeleteOptions{}); err != nil {
 		if !apierrors.IsNotFound(err) {
-			klog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
-				job.Namespace, job.Name, err)
+			klog.Errorf("Failed to delete PodGroup of Job %s/%s: %v", job.Namespace, job.Name, err)
+			return err
+		}
+
+		pgLegacyName := jobhelpers.GetRelatedPodGroupLegacyName(job)
+		err := pgIface.Delete(context.TODO(), pgLegacyName, metav1.DeleteOptions{})
+		if err != nil && !apierrors.IsNotFound(err) {
+			klog.Errorf("Failed to delete legacy PodGroup of Job %s/%s: %v", job.Namespace, job.Name, err)
 			return err
 		}
 	}
@@ -281,8 +288,14 @@ func (cc *jobcontroller) syncJob(jobInfo *apis.JobInfo, updateStatus state.Updat
 	}
 
 	var syncTask bool
-	pgName := job.Name + "-" + string(job.UID)
-	if pg, _ := cc.pgLister.PodGroups(job.Namespace).Get(pgName); pg != nil {
+	pgLister := cc.pgLister.PodGroups(job.Namespace)
+	pgName := jobhelpers.GetRelatedPodGroupName(job)
+	pg, err := pgLister.Get(pgName)
+	if err != nil && apierrors.IsNotFound(err) {
+		pgLegacyName := jobhelpers.GetRelatedPodGroupLegacyName(job)
+		pg, _ = pgLister.Get(pgLegacyName)
+	}
+	if pg != nil {
 		if pg.Status.Phase != "" && pg.Status.Phase != scheduling.PodGroupPending {
 			syncTask = true
 		}
@@ -662,10 +675,11 @@ func (cc *jobcontroller) createPVC(job *batch.Job, vcName string, volumeClaim *v
 
 func (cc *jobcontroller) createOrUpdatePodGroup(job *batch.Job) error {
 	// If PodGroup does not exist, create one for Job.
-	pgName := job.Name + "-" + string(job.UID)
+	pgLister := cc.pgLister.PodGroups(job.Namespace)
 	var pg *scheduling.PodGroup
 	var err error
-	pg, err = cc.pgLister.PodGroups(job.Namespace).Get(pgName)
+	pgName := jobhelpers.GetRelatedPodGroupName(job)
+	pg, err = pgLister.Get(pgName)
 	if err != nil {
 		if !apierrors.IsNotFound(err) {
 			klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v",
@@ -673,7 +687,8 @@ func (cc *jobcontroller) createOrUpdatePodGroup(job *batch.Job) error {
 			return err
 		}
 		// try to get old pg if new pg not exist
-		pg, err = cc.pgLister.PodGroups(job.Namespace).Get(job.Name)
+		pgLegacyName := jobhelpers.GetRelatedPodGroupLegacyName(job)
+		pg, err = pgLister.Get(pgLegacyName)
 		if err != nil {
 			if !apierrors.IsNotFound(err) {
 				klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v",