Skip to content

Commit

Permalink
fix: Cannot sync job status correctly when upgrading from v1.5
Browse files Browse the repository at this point in the history
v1.5 changed the naming logic of pod groups by adding the job UID to the name (volcano-sh#2140), and a later fix made create-or-update handle pod groups that were already created without the UID (volcano-sh#2400). However, a similar fix does not exist in the syncJob function.

Fixes volcano-sh#3640
  • Loading branch information
QingyaFan committed Aug 5, 2024
1 parent 0fa8102 commit 6ab70aa
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 9 deletions.
10 changes: 10 additions & 0 deletions pkg/controllers/job/helpers/helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,13 @@ func GetPodsNameUnderTask(taskName string, job *batch.Job) []string {
}
return res
}

// GetRelatedPodGroupName builds the podgroup name associated with a job,
// formatted as "<job name>-<job UID>".
func GetRelatedPodGroupName(job *batch.Job) string {
	name, uid := job.Name, string(job.UID)
	return fmt.Sprintf("%s-%s", name, uid)
}

// GetRelatedPodGroupLegacyName builds the legacy podgroup name associated
// with a job. The legacy name is the job name itself, with no UID suffix.
func GetRelatedPodGroupLegacyName(job *batch.Job) string {
	legacyName := job.Name
	return legacyName
}
33 changes: 24 additions & 9 deletions pkg/controllers/job/job_controller_actions.go
Original file line number Diff line number Diff line change
Expand Up @@ -152,11 +152,18 @@ func (cc *jobcontroller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.Pha
}

// Delete PodGroup
pgName := job.Name + "-" + string(job.UID)
if err := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace).Delete(context.TODO(), pgName, metav1.DeleteOptions{}); err != nil {
pgName := jobhelpers.GetRelatedPodGroupName(job)
pgIface := cc.vcClient.SchedulingV1beta1().PodGroups(job.Namespace)
if err := pgIface.Delete(context.TODO(), pgName, metav1.DeleteOptions{}); err != nil {
if !apierrors.IsNotFound(err) {
klog.Errorf("Failed to delete PodGroup of Job %v/%v: %v",
job.Namespace, job.Name, err)
klog.Errorf("Failed to delete PodGroup of Job %s/%s: %v", job.Namespace, job.Name, err)
return err
}

pgLegacyName := jobhelpers.GetRelatedPodGroupLegacyName(job)
err := pgIface.Delete(context.TODO(), pgLegacyName, metav1.DeleteOptions{})
if err != nil && !apierrors.IsNotFound(err) {
klog.Errorf("Failed to delete legacy PodGroup of Job %s/%s: %v", job.Namespace, job.Name, err)
return err
}
}
Expand Down Expand Up @@ -281,8 +288,14 @@ func (cc *jobcontroller) syncJob(jobInfo *apis.JobInfo, updateStatus state.Updat
}

var syncTask bool
pgName := job.Name + "-" + string(job.UID)
if pg, _ := cc.pgLister.PodGroups(job.Namespace).Get(pgName); pg != nil {
pgLister := cc.pgLister.PodGroups(job.Namespace)
pgName := jobhelpers.GetRelatedPodGroupName(job)
pg, err := pgLister.Get(pgName)
if err != nil && apierrors.IsNotFound(err) {
pgLegacyName := jobhelpers.GetRelatedPodGroupLegacyName(job)
pg, _ = pgLister.Get(pgLegacyName)
}
if pg != nil {
if pg.Status.Phase != "" && pg.Status.Phase != scheduling.PodGroupPending {
syncTask = true
}
Expand Down Expand Up @@ -662,18 +675,20 @@ func (cc *jobcontroller) createPVC(job *batch.Job, vcName string, volumeClaim *v

func (cc *jobcontroller) createOrUpdatePodGroup(job *batch.Job) error {
// If PodGroup does not exist, create one for Job.
pgName := job.Name + "-" + string(job.UID)
pgLister := cc.pgLister.PodGroups(job.Namespace)
var pg *scheduling.PodGroup
var err error
pg, err = cc.pgLister.PodGroups(job.Namespace).Get(pgName)
pgName := jobhelpers.GetRelatedPodGroupName(job)
pg, err = pgLister.Get(pgName)
if err != nil {
if !apierrors.IsNotFound(err) {
klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v",
job.Namespace, job.Name, err)
return err
}
// try to get old pg if new pg not exist
pg, err = cc.pgLister.PodGroups(job.Namespace).Get(job.Name)
pgLegacyName := jobhelpers.GetRelatedPodGroupLegacyName(job)
pg, err = pgLister.Get(pgLegacyName)
if err != nil {
if !apierrors.IsNotFound(err) {
klog.Errorf("Failed to get PodGroup for Job <%s/%s>: %v",
Expand Down

0 comments on commit 6ab70aa

Please sign in to comment.