From 1fef5711aeb6ccf0e99f58f7db59d6b6e2011f49 Mon Sep 17 00:00:00 2001 From: Jaylon McShan Date: Tue, 1 Oct 2024 09:58:47 -0500 Subject: [PATCH 1/4] Handle Jobs with ttl_seconds_after_finished = 0 correctly --- kubernetes/resource_kubernetes_job_v1.go | 50 +++++++++++++++- kubernetes/resource_kubernetes_job_v1_test.go | 59 +++++++++++++++++++ 2 files changed, 107 insertions(+), 2 deletions(-) diff --git a/kubernetes/resource_kubernetes_job_v1.go b/kubernetes/resource_kubernetes_job_v1.go index 572635b340..caa7bad7dd 100644 --- a/kubernetes/resource_kubernetes_job_v1.go +++ b/kubernetes/resource_kubernetes_job_v1.go @@ -28,6 +28,7 @@ func resourceKubernetesJobV1() *schema.Resource { ReadContext: resourceKubernetesJobV1Read, UpdateContext: resourceKubernetesJobV1Update, DeleteContext: resourceKubernetesJobV1Delete, + CustomizeDiff: resourceKubernetesJobV1CustomizeDiff, Importer: &schema.ResourceImporter{ StateContext: schema.ImportStatePassthroughContext, }, @@ -48,6 +49,42 @@ func resourceKubernetesJobV1() *schema.Resource { } } +func resourceKubernetesJobV1CustomizeDiff(ctx context.Context, d *schema.ResourceDiff, meta interface{}) error { + if d.Id() == "" { + log.Printf("[DEBUG] Resource ID is empty, resource not created yet.") + return nil + } + + conn, err := meta.(KubeClientsets).MainClientset() + if err != nil { + return err + } + + namespace, name, err := idParts(d.Id()) + if err != nil { + return err + } + + ttlAttr := d.Get("spec.0.ttl_seconds_after_finished") + ttlSeconds, ok := ttlAttr.(int) + if !ok || ttlSeconds != 0 { + return nil + } + + // getting the job + _, err = conn.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if errors.IsNotFound(err) { + // Suppress diff + d.Clear("spec") + d.Clear("metadata") + return nil + } + return err + } + return nil +} + func resourceKubernetesJobV1Schema() map[string]*schema.Schema { return map[string]*schema.Schema{ "metadata": jobMetadataSchema(), @@ -118,8 +155,17 @@ func resourceKubernetesJobV1Read(ctx context.Context, d *schema.ResourceData, me return diag.FromErr(err) } if !exists { - d.SetId("") - return diag.Diagnostics{} + // Check if ttl_seconds_after_finished is set + if ttl, ok := d.GetOk("spec.0.ttl_seconds_after_finished"); ok { + // ttl_seconds_after_finished is set, Job is deleted due to TTL + // We don't need to remove the resource from the state + log.Printf("[INFO] Job %s has been deleted by Kubernetes due to TTL (ttl_seconds_after_finished = %v), keeping resource in state", d.Id(), ttl) + return diag.Diagnostics{} + } else { + // ttl_seconds_after_finished is not set, remove the resource from the state + d.SetId("") + return diag.Diagnostics{} + } } conn, err := meta.(KubeClientsets).MainClientset() if err != nil { diff --git a/kubernetes/resource_kubernetes_job_v1_test.go b/kubernetes/resource_kubernetes_job_v1_test.go index 396777de9b..5d6d2393fd 100644 --- a/kubernetes/resource_kubernetes_job_v1_test.go +++ b/kubernetes/resource_kubernetes_job_v1_test.go @@ -237,6 +237,40 @@ func TestAccKubernetesJobV1_ttl_seconds_after_finished(t *testing.T) { }) } +func TestAccKubernetesJobV1_customizeDiff_ttlZero(t *testing.T) { + var conf batchv1.Job + name := fmt.Sprintf("tf-acc-test-%s", acctest.RandString(10)) + imageName := busyboxImage + resourceName := "kubernetes_job_v1.test" + + resource.ParallelTest(t, resource.TestCase{ + PreCheck: func() { + testAccPreCheck(t) + skipIfClusterVersionLessThan(t, "1.21.0") + }, + ProviderFactories: testAccProviderFactories, + Steps: []resource.TestStep{ + // Step 1: Create the Job + { + Config: testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName), + Check: resource.ComposeAggregateTestCheckFunc( + testAccCheckKubernetesJobV1Exists(resourceName, &conf), + resource.TestCheckResourceAttr(resourceName, "spec.0.ttl_seconds_after_finished", "0"), + ), + }, + // Step 2: Wait for the Job to complete and be deleted + { + PreConfig: func() { + time.Sleep(70 * time.Second) + }, + Config: testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName), + PlanOnly: true, + ExpectNonEmptyPlan: false, + }, + }, + }) +} + func testAccCheckJobV1Waited(minDuration time.Duration) func(*terraform.State) error { // NOTE this works because this function is called when setting up the test // and the function it returns is called after the resource has been created @@ -516,3 +550,28 @@ func testAccKubernetesJobV1Config_modified(name, imageName string) string { wait_for_completion = false }`, name, imageName) } + +func testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName string) string { + return fmt.Sprintf(` +resource "kubernetes_job_v1" "test" { + metadata { + name = "%s" + } + spec { + ttl_seconds_after_finished = 0 + template { + metadata {} + spec { + container { + name = "wait-test" + image = "%s" + command = ["sleep", "60"] + } + restart_policy = "Never" + } + } + } + wait_for_completion = false +} +`, name, imageName) +} From 49c28bdf799e61b4b57f751629009f80a3412be9 Mon Sep 17 00:00:00 2001 From: Jaylon McShan Date: Tue, 1 Oct 2024 10:12:31 -0500 Subject: [PATCH 2/4] Adding changelog --- .changelog/2596.txt | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .changelog/2596.txt diff --git a/.changelog/2596.txt b/.changelog/2596.txt new file mode 100644 index 0000000000..50e5816094 --- /dev/null +++ b/.changelog/2596.txt @@ -0,0 +1,3 @@ +```release-note:enhancement +Properly handle Kubernetes Jobs with ttl_seconds_after_finished = 0 to prevent unnecessary recreation. +``` \ No newline at end of file From ff6c89b85347ce00d43efdd2904d9488af96e4e6 Mon Sep 17 00:00:00 2001 From: Jaylon McShan Date: Fri, 11 Oct 2024 11:24:39 -0500 Subject: [PATCH 3/4] Fixed edge case to where we update ttl from 0 to another value --- kubernetes/resource_kubernetes_job_v1.go | 89 ++++++++++++++++--- kubernetes/resource_kubernetes_job_v1_test.go | 67 ++++++++++++++ 2 files changed, 143 insertions(+), 13 deletions(-) diff --git a/kubernetes/resource_kubernetes_job_v1.go b/kubernetes/resource_kubernetes_job_v1.go index caa7bad7dd..ae6fbd46d7 100644 --- a/kubernetes/resource_kubernetes_job_v1.go +++ b/kubernetes/resource_kubernetes_job_v1.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "log" + "strconv" "time" "github.com/hashicorp/terraform-plugin-sdk/v2/diag" @@ -16,6 +17,7 @@ import ( batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" pkgApi "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/kubernetes" @@ -55,6 +57,27 @@ func resourceKubernetesJobV1CustomizeDiff(ctx context.Context, d *schema.Resourc return nil } + // Retrieve old and new TTL values as strings + oldTTLRaw, newTTLRaw := d.GetChange("spec.0.ttl_seconds_after_finished") + + var oldTTLStr, newTTLStr string + + if oldTTLRaw != nil { + oldTTLStr, _ = oldTTLRaw.(string) + } + if newTTLRaw != nil { + newTTLStr, _ = newTTLRaw.(string) + } + + oldTTLInt, err := strconv.Atoi(oldTTLStr) + if err != nil { + oldTTLInt = 0 + } + newTTLInt, err := strconv.Atoi(newTTLStr) + if err != nil { + newTTLInt = 0 + } + conn, err := meta.(KubeClientsets).MainClientset() if err != nil { return err @@ -65,23 +88,40 @@ func resourceKubernetesJobV1CustomizeDiff(ctx context.Context, d *schema.Resourc return err } - ttlAttr := d.Get("spec.0.ttl_seconds_after_finished") - ttlSeconds, ok := ttlAttr.(int) - if !ok || ttlSeconds != 0 { - return nil - } - - // getting the job + // Check if the Job exists _, err = conn.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{}) if err != nil { - if errors.IsNotFound(err) { - // Suppress diff - d.Clear("spec") - d.Clear("metadata") + if apierrors.IsNotFound(err) { + // Job is missing + if oldTTLInt == 0 { + if newTTLInt == 0 { + // TTL remains 0; suppress diff to prevent recreation + log.Printf("[DEBUG] Job %s not found and ttl_seconds_after_finished remains 0; suppressing diff", d.Id()) + d.Clear("spec") + d.Clear("metadata") + return nil + } else { + // TTL changed from 0 to non-zero; force recreation + log.Printf("[DEBUG] Job %s not found and ttl_seconds_after_finished changed from 0 to %d; forcing recreation", d.Id(), newTTLInt) + d.ForceNew("spec.0.ttl_seconds_after_finished") + return nil + } + } else { + return nil + } + } else { + return err + } + } else { + // Job exists + if oldTTLInt == 0 && newTTLInt != 0 { + // TTL changing from 0 to non-zero; force recreation + log.Printf("[DEBUG] Job %s exists and ttl_seconds_after_finished changed from 0 to %d; forcing recreation", d.Id(), newTTLInt) + d.ForceNew("spec.0.ttl_seconds_after_finished") return nil } - return err } + return nil } @@ -219,6 +259,30 @@ func resourceKubernetesJobV1Update(ctx context.Context, d *schema.ResourceData, return diag.FromErr(err) } + // Attempt to get the Job + _, err = conn.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{}) + if err != nil { + if apierrors.IsNotFound(err) { + // Job is missing; cannot update + ttlAttr := d.Get("spec.0.ttl_seconds_after_finished") + ttlStr, _ := ttlAttr.(string) + ttlInt, err := strconv.Atoi(ttlStr) + if err != nil { + ttlInt = 0 + } + if ttlInt == 0 { + // Job was deleted due to TTL = 0; nothing to update + log.Printf("[INFO] Job %s not found but ttl_seconds_after_finished = 0; nothing to update", d.Id()) + return nil + } else { + // Job was deleted unexpectedly; return an error + return diag.Errorf("Job %s not found; cannot update because it has been deleted", d.Id()) + } + } + return diag.Errorf("Error retrieving Job: %s", err) + } + + // Proceed with the update as usual ops := patchMetadata("metadata.0.", "/metadata/", d) if d.HasChange("spec") { @@ -250,7 +314,6 @@ func resourceKubernetesJobV1Update(ctx context.Context, d *schema.ResourceData, } return resourceKubernetesJobV1Read(ctx, d, meta) } - func resourceKubernetesJobV1Delete(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics { conn, err := meta.(KubeClientsets).MainClientset() if err != nil { diff --git a/kubernetes/resource_kubernetes_job_v1_test.go b/kubernetes/resource_kubernetes_job_v1_test.go index 5d6d2393fd..ff98d94930 100644 --- a/kubernetes/resource_kubernetes_job_v1_test.go +++ b/kubernetes/resource_kubernetes_job_v1_test.go @@ -271,6 +271,48 @@ func TestAccKubernetesJobV1_customizeDiff_ttlZero(t *testing.T) { }) } +func TestAccKubernetesJobV1_updateTTLFromZero(t *testing.T) { + var conf batchv1.Job + name := fmt.Sprintf("tf-acc-test-%s", acctest.RandString(10)) + imageName := busyboxImage + resourceName := "kubernetes_job_v1.test" + + resource.ParallelTest(t, resource.TestCase{ + PreCheck: func() { + testAccPreCheck(t) + skipIfClusterVersionLessThan(t, "1.21.0") + }, + ProviderFactories: testAccProviderFactories, + Steps: []resource.TestStep{ + // Step 1: Create the Job with ttl_seconds_after_finished = 0 + { + Config: testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName), + Check: resource.ComposeAggregateTestCheckFunc( + testAccCheckKubernetesJobV1Exists(resourceName, &conf), + resource.TestCheckResourceAttr(resourceName, "spec.0.ttl_seconds_after_finished", "0"), + ), + }, + // Step 2: Wait for the Job to complete and be deleted + { + PreConfig: func() { + time.Sleep(70 * time.Second) + }, + Config: testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName), + PlanOnly: true, + ExpectNonEmptyPlan: false, + }, + // Step 3: Update the Job to ttl_seconds_after_finished = 5 + { + Config: testAccKubernetesJobV1Config_customizeDiff_ttlFive(name, imageName), + Check: resource.ComposeAggregateTestCheckFunc( + testAccCheckKubernetesJobV1Exists(resourceName, &conf), + resource.TestCheckResourceAttr(resourceName, "spec.0.ttl_seconds_after_finished", "5"), + ), + }, + }, + }) +} + func testAccCheckJobV1Waited(minDuration time.Duration) func(*terraform.State) error { // NOTE this works because this function is called when setting up the test // and the function it returns is called after the resource has been created @@ -575,3 +617,28 @@ resource "kubernetes_job_v1" "test" { } `, name, imageName) } + +func testAccKubernetesJobV1Config_customizeDiff_ttlFive(name, imageName string) string { + return fmt.Sprintf(` +resource "kubernetes_job_v1" "test" { + metadata { + name = "%s" + } + spec { + ttl_seconds_after_finished = 5 + template { + metadata {} + spec { + container { + name = "wait-test" + image = "%s" + command = ["sleep", "60"] + } + restart_policy = "Never" + } + } + } + wait_for_completion = false +} +`, name, imageName) +} From bc239a2e6eee8dc33f0283143e01a7e79ee12c12 Mon Sep 17 00:00:00 2001 From: Jaylon McShan Date: Fri, 18 Oct 2024 09:26:51 -0500 Subject: [PATCH 4/4] Handle TTL value changes for Job recreation in Kubernetes --- kubernetes/resource_kubernetes_job_v1.go | 43 ++++++++++++------------ 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/kubernetes/resource_kubernetes_job_v1.go b/kubernetes/resource_kubernetes_job_v1.go index ae6fbd46d7..abccf49816 100644 --- a/kubernetes/resource_kubernetes_job_v1.go +++ b/kubernetes/resource_kubernetes_job_v1.go @@ -93,30 +93,28 @@ func resourceKubernetesJobV1CustomizeDiff(ctx context.Context, d *schema.Resourc if err != nil { if apierrors.IsNotFound(err) { // Job is missing - if oldTTLInt == 0 { - if newTTLInt == 0 { - // TTL remains 0; suppress diff to prevent recreation - log.Printf("[DEBUG] Job %s not found and ttl_seconds_after_finished remains 0; suppressing diff", d.Id()) - d.Clear("spec") - d.Clear("metadata") + if oldTTLInt >= 0 { + if oldTTLInt != newTTLInt { + // TTL value changed; force recreation + log.Printf("[DEBUG] Job %s not found and ttl_seconds_after_finished changed from %d to %d; forcing recreation", d.Id(), oldTTLInt, newTTLInt) + d.ForceNew("spec.0.ttl_seconds_after_finished") return nil } else { - // TTL changed from 0 to non-zero; force recreation - log.Printf("[DEBUG] Job %s not found and ttl_seconds_after_finished changed from 0 to %d; forcing recreation", d.Id(), newTTLInt) - d.ForceNew("spec.0.ttl_seconds_after_finished") + // TTL remains the same; suppress diff + log.Printf("[DEBUG] Job %s not found and ttl_seconds_after_finished remains %d; suppressing diff", d.Id(), oldTTLInt) + d.Clear("spec") + d.Clear("metadata") return nil } - } else { - return nil } } else { return err } } else { - // Job exists - if oldTTLInt == 0 && newTTLInt != 0 { - // TTL changing from 0 to non-zero; force recreation - log.Printf("[DEBUG] Job %s exists and ttl_seconds_after_finished changed from 0 to %d; forcing recreation", d.Id(), newTTLInt) + // Job exists, check if TTL changed + if oldTTLInt != newTTLInt { + // TTL changed; force recreation + log.Printf("[DEBUG] Job %s exists and ttl_seconds_after_finished changed from %d to %d; forcing recreation", d.Id(), oldTTLInt, newTTLInt) d.ForceNew("spec.0.ttl_seconds_after_finished") return nil } @@ -263,21 +261,22 @@ func resourceKubernetesJobV1Update(ctx context.Context, d *schema.ResourceData, _, err = conn.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{}) if err != nil { if apierrors.IsNotFound(err) { - // Job is missing; cannot update + // Job is missing; check TTL ttlAttr := d.Get("spec.0.ttl_seconds_after_finished") ttlStr, _ := ttlAttr.(string) ttlInt, err := strconv.Atoi(ttlStr) if err != nil { ttlInt = 0 } - if ttlInt == 0 { - // Job was deleted due to TTL = 0; nothing to update - log.Printf("[INFO] Job %s not found but ttl_seconds_after_finished = 0; nothing to update", d.Id()) + + if ttlInt >= 0 { + // Job was deleted due to TTL nothing to update + log.Printf("[INFO] Job %s not found but ttl_seconds_after_finished = %v; nothing to update", d.Id(), ttlInt) return nil - } else { - // Job was deleted unexpectedly; return an error - return diag.Errorf("Job %s not found; cannot update because it has been deleted", d.Id()) } + + // Job was deleted unexpectedly; return an error + return diag.Errorf("Job %s not found; cannot update because it has been deleted", d.Id()) } return diag.Errorf("Error retrieving Job: %s", err) }