Fix: All pods still exist when restart count exceeds max retry #1997

Merged: 1 commit, Jan 30, 2022
pkg/controllers/job/job_controller_actions.go (12 changes: 9 additions & 3 deletions)
```diff
@@ -69,13 +69,19 @@ func (cc *jobcontroller) killJob(jobInfo *apis.JobInfo, podRetainPhase state.Pha
 
 	maxRetry := job.Spec.MaxRetry
 	lastRetry := false
-	if job.Status.RetryCount > maxRetry-1 {
+	if job.Status.RetryCount >= maxRetry-1 {
 		lastRetry = true
 	}
 
-	_, retain := podRetainPhase[pod.Status.Phase]
+	// Only retain the Failed and Succeeded pods at the last retry.
+	// If it is not the last retry, kill pod as defined in `podRetainPhase`.
+	retainPhase := podRetainPhase
+	if lastRetry {
+		retainPhase = state.PodRetainPhaseSoft
+	}
+	_, retain := retainPhase[pod.Status.Phase]
 
-	if !retain && !lastRetry {
+	if !retain {
 		err := cc.deleteJobPod(job.Name, pod)
 		if err == nil {
 			terminating++
```
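To see the decision in isolation, here is a minimal, self-contained sketch of the fixed retain logic. `PhaseMap`, `podRetainPhaseSoft`, and `shouldDelete` are hypothetical stand-ins for volcano's `state.PhaseMap`, `state.PodRetainPhaseSoft`, and the inline code in `killJob`; that the soft set retains Succeeded and Failed pods is an assumption taken from the diff's own comment.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// PhaseMap is a hypothetical stand-in for state.PhaseMap: the set of pod
// phases that killJob retains instead of deleting.
type PhaseMap map[v1.PodPhase]struct{}

// podRetainPhaseSoft stands in for state.PodRetainPhaseSoft; the diff's
// comment implies it retains Succeeded and Failed pods (assumption).
var podRetainPhaseSoft = PhaseMap{
	v1.PodSucceeded: {},
	v1.PodFailed:    {},
}

// shouldDelete reproduces the fixed decision: on the last retry the soft
// retain set takes over, so Failed/Succeeded pods survive while running
// pods are still killed.
func shouldDelete(retryCount, maxRetry int32, phase v1.PodPhase, podRetainPhase PhaseMap) bool {
	lastRetry := retryCount >= maxRetry-1
	retainPhase := podRetainPhase
	if lastRetry {
		retainPhase = podRetainPhaseSoft
	}
	_, retain := retainPhase[phase]
	return !retain
}

func main() {
	none := PhaseMap{} // retain nothing outside the last retry

	// Last retry (retryCount=1, maxRetry=2): keep the failed pod, kill the running one.
	fmt.Println(shouldDelete(1, 2, v1.PodFailed, none))  // false: retained
	fmt.Println(shouldDelete(1, 2, v1.PodRunning, none)) // true: deleted

	// Earlier retries: anything outside podRetainPhase is deleted, failed pods included.
	fmt.Println(shouldDelete(0, 2, v1.PodFailed, none)) // true: deleted
}
```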
test/e2e/jobp/job_restart.go (150 changes: 150 additions & 0 deletions, new file)
@@ -0,0 +1,150 @@
```go
/*
Copyright 2021 The Volcano Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package jobp

import (
	"context"

	. "github.com/onsi/ginkgo"
	. "github.com/onsi/gomega"
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	vcbatch "volcano.sh/apis/pkg/apis/batch/v1alpha1"
	"volcano.sh/apis/pkg/apis/bus/v1alpha1"
	jobctl "volcano.sh/volcano/pkg/controllers/job"

	e2eutil "volcano.sh/volcano/test/e2e/util"
)

var _ = Describe("Test job restart", func() {
	It("Retain failed pod on last retry", func() {
		ctx := e2eutil.InitTestContext(e2eutil.Options{})
		defer e2eutil.CleanupTestContext(ctx)

		jobName := "last-retry-job"
		By("create job")
		var minSuccess int32 = 2
		job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
			Name:       jobName,
			MinSuccess: &minSuccess,
			Min:        2,
			Policies: []vcbatch.LifecyclePolicy{
				{Event: v1alpha1.PodEvictedEvent, Action: v1alpha1.RestartJobAction},
				{Event: v1alpha1.PodFailedEvent, Action: v1alpha1.RestartJobAction},
			},
			MaxRetry: 2,
			Tasks: []e2eutil.TaskSpec{
				{
					Name:          "running-task",
					Img:           e2eutil.DefaultBusyBoxImage,
					Min:           1,
					Rep:           1,
					Command:       "sleep 1000",
					RestartPolicy: v1.RestartPolicyNever,
					MaxRetry:      1,
				},
				{
					Name:          "failed-task",
					Img:           e2eutil.DefaultBusyBoxImage,
					Min:           1,
					Rep:           1,
					Command:       "sh fake.sh",
					RestartPolicy: v1.RestartPolicyNever,
					MaxRetry:      1,
				},
			},
		})

		// wait for the job to fail after exhausting its retries
		err := e2eutil.WaitJobStates(ctx, job, []vcbatch.JobPhase{vcbatch.Failed}, e2eutil.FiveMinute)
		Expect(err).NotTo(HaveOccurred())

		// check the job's retry count
		curjob, err := e2eutil.VcClient.BatchV1alpha1().Jobs(job.Namespace).Get(context.TODO(), jobName, metav1.GetOptions{})
		Expect(err).NotTo(HaveOccurred())
		Expect(curjob.Status.RetryCount).Should(Equal(int32(2)))

		// wait for the running pod to be deleted
		err = e2eutil.WaitPodGone(ctx, jobctl.MakePodName(jobName, "running-task", 0), job.Namespace)
		Expect(err).NotTo(HaveOccurred())

		// the failed pod should still exist
		pod, err := e2eutil.KubeClient.CoreV1().Pods(job.Namespace).Get(context.TODO(), jobctl.MakePodName(jobName, "failed-task", 0), metav1.GetOptions{})
		Expect(err).NotTo(HaveOccurred())
		Expect(pod.DeletionTimestamp).Should(BeNil())
	})

It("Retain succeeded pod when job complete", func() {
ctx := e2eutil.InitTestContext(e2eutil.Options{})
defer e2eutil.CleanupTestContext(ctx)

jobName := "complete-retry-job"
By("create job")
var minSuccess int32 = 2
job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
Name: jobName,
MinSuccess: &minSuccess,
Min: 2,
Policies: []vcbatch.LifecyclePolicy{
{Event: v1alpha1.PodEvictedEvent, Action: v1alpha1.RestartJobAction},
{Event: v1alpha1.PodFailedEvent, Action: v1alpha1.RestartJobAction},
},
MaxRetry: 1,
Tasks: []e2eutil.TaskSpec{
{
Name: "running-task",
Img: e2eutil.DefaultBusyBoxImage,
Min: 1,
Rep: 1,
Command: "sleep 1000",
RestartPolicy: v1.RestartPolicyNever,
MaxRetry: 1,
},
{
Name: "succeeded-task",
Img: e2eutil.DefaultBusyBoxImage,
Policies: []vcbatch.LifecyclePolicy{
{Event: v1alpha1.TaskCompletedEvent, Action: v1alpha1.CompleteJobAction},
},
Min: 1,
Rep: 1,
Command: "sleep 1",
RestartPolicy: v1.RestartPolicyNever,
MaxRetry: 1,
},
},
})

// wait job failed
err := e2eutil.WaitJobStates(ctx, job, []vcbatch.JobPhase{vcbatch.Completed}, e2eutil.FiveMinute)
Expect(err).NotTo(HaveOccurred())

// check job restart count
curjob, err := e2eutil.VcClient.BatchV1alpha1().Jobs(job.Namespace).Get(context.TODO(), jobName, metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
Expect(curjob.Status.RetryCount).Should(Equal(int32(0)))

// wait running pod deleted
err = e2eutil.WaitPodGone(ctx, jobctl.MakePodName(jobName, "running-task", 0), job.Namespace)
Expect(err).NotTo(HaveOccurred())

// succeeded pod should be existing
pod, err := e2eutil.KubeClient.CoreV1().Pods(job.Namespace).Get(context.TODO(), jobctl.MakePodName(jobName, "succeeded-task", 0), metav1.GetOptions{})
Expect(err).NotTo(HaveOccurred())
Expect(pod.DeletionTimestamp).Should(BeNil())
})
})
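Both tests locate pods with `jobctl.MakePodName`. As a reading aid, here is a sketch of the naming scheme it is assumed to follow, which is what makes the lookups above line up with the tasks defined in the specs:

```go
// makePodName mirrors jobctl.MakePodName under the assumption that volcano
// names pods "<job name>-<task name>-<index>".
func makePodName(jobName, taskName string, index int) string {
	return fmt.Sprintf("%s-%s-%d", jobName, taskName, index)
}

// makePodName("last-retry-job", "failed-task", 0) == "last-retry-job-failed-task-0"
```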
test/e2e/util/job.go (3 changes: 3 additions & 0 deletions)
```diff
@@ -69,6 +69,8 @@ type JobSpec struct {
 	// ttl seconds after job finished
 	Ttl        *int32
 	MinSuccess *int32
+	// job max retry
+	MaxRetry int32
 }
 
 func Namespace(context *TestContext, job *JobSpec) string {
@@ -197,6 +199,7 @@ func CreateJobInner(ctx *TestContext, jobSpec *JobSpec) (*batchv1alpha1.Job, err
 			Plugins:                 jobSpec.Plugins,
 			TTLSecondsAfterFinished: jobSpec.Ttl,
 			MinSuccess:              jobSpec.MinSuccess,
+			MaxRetry:                jobSpec.MaxRetry,
 		},
 	}
 
```
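With `MaxRetry` now plumbed through `CreateJobInner`, an e2e spec can cap job restarts directly. A minimal usage sketch modeled on the tests above (the job name, policy, and task values are illustrative):

```go
job := e2eutil.CreateJob(ctx, &e2eutil.JobSpec{
	Name:     "max-retry-demo", // illustrative name
	Min:      1,
	MaxRetry: 2, // the new field: allow at most two restarts before the job fails
	Policies: []vcbatch.LifecyclePolicy{
		{Event: v1alpha1.PodFailedEvent, Action: v1alpha1.RestartJobAction},
	},
	Tasks: []e2eutil.TaskSpec{
		{
			Name:    "main",
			Img:     e2eutil.DefaultBusyBoxImage,
			Min:     1,
			Rep:     1,
			Command: "sh fake.sh", // fails immediately, so the job restarts until MaxRetry is hit
		},
	},
})
```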