Handle Jobs with ttl_seconds_after_finished = 0 correctly #2596

Open · wants to merge 4 commits into base: main
3 changes: 3 additions & 0 deletions .changelog/2596.txt
@@ -0,0 +1,3 @@
```release-note:enhancement
Properly handle Kubernetes Jobs with ttl_seconds_after_finished = 0 to prevent unnecessary recreation.
```
114 changes: 111 additions & 3 deletions kubernetes/resource_kubernetes_job_v1.go
@@ -7,6 +7,7 @@ import (
"context"
"fmt"
"log"
"strconv"
"time"

"github.com/hashicorp/terraform-plugin-sdk/v2/diag"
@@ -16,6 +17,7 @@ import (
batchv1 "k8s.io/api/batch/v1"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
pkgApi "k8s.io/apimachinery/pkg/types"
"k8s.io/client-go/kubernetes"
@@ -28,6 +30,7 @@ func resourceKubernetesJobV1() *schema.Resource {
ReadContext: resourceKubernetesJobV1Read,
UpdateContext: resourceKubernetesJobV1Update,
DeleteContext: resourceKubernetesJobV1Delete,
CustomizeDiff: resourceKubernetesJobV1CustomizeDiff,
Importer: &schema.ResourceImporter{
StateContext: schema.ImportStatePassthroughContext,
},
@@ -48,6 +51,78 @@
}
}

func resourceKubernetesJobV1CustomizeDiff(ctx context.Context, d *schema.ResourceDiff, meta interface{}) error {
if d.Id() == "" {
log.Printf("[DEBUG] Resource ID is empty, resource not created yet.")
return nil
}

// Retrieve old and new TTL values as strings
oldTTLRaw, newTTLRaw := d.GetChange("spec.0.ttl_seconds_after_finished")

var oldTTLStr, newTTLStr string

if oldTTLRaw != nil {
oldTTLStr, _ = oldTTLRaw.(string)
}
if newTTLRaw != nil {
newTTLStr, _ = newTTLRaw.(string)
}

oldTTLInt, err := strconv.Atoi(oldTTLStr)
if err != nil {
oldTTLInt = 0
}
newTTLInt, err := strconv.Atoi(newTTLStr)
if err != nil {
newTTLInt = 0
}

conn, err := meta.(KubeClientsets).MainClientset()
if err != nil {
return err
}

namespace, name, err := idParts(d.Id())
if err != nil {
return err
}

// Check if the Job exists
_, err = conn.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
// Job is missing
if oldTTLInt >= 0 {
if oldTTLInt != newTTLInt {
// TTL value changed; force recreation
log.Printf("[DEBUG] Job %s not found and ttl_seconds_after_finished changed from %d to %d; forcing recreation", d.Id(), oldTTLInt, newTTLInt)
d.ForceNew("spec.0.ttl_seconds_after_finished")
return nil
} else {
// TTL remains the same; suppress diff
log.Printf("[DEBUG] Job %s not found and ttl_seconds_after_finished remains %d; suppressing diff", d.Id(), oldTTLInt)
d.Clear("spec")
d.Clear("metadata")
return nil
}
}
} else {
return err
}
} else {
// Job exists, check if TTL changed
if oldTTLInt != newTTLInt {
// TTL changed; force recreation
log.Printf("[DEBUG] Job %s exists and ttl_seconds_after_finished changed from %d to %d; forcing recreation", d.Id(), oldTTLInt, newTTLInt)
d.ForceNew("spec.0.ttl_seconds_after_finished")
return nil
}
}

return nil
}

func resourceKubernetesJobV1Schema() map[string]*schema.Schema {
return map[string]*schema.Schema{
"metadata": jobMetadataSchema(),
@@ -118,8 +193,17 @@ func resourceKubernetesJobV1Read(ctx context.Context, d *schema.ResourceData, me
return diag.FromErr(err)
}
if !exists {
d.SetId("")
return diag.Diagnostics{}
// Check if ttl_seconds_after_finished is set
if ttl, ok := d.GetOk("spec.0.ttl_seconds_after_finished"); ok {
// ttl_seconds_after_finished is set, Job is deleted due to TTL
// We don't need to remove the resource from the state
log.Printf("[INFO] Job %s has been deleted by Kubernetes due to TTL (ttl_seconds_after_finished = %v), keeping resource in state", d.Id(), ttl)
return diag.Diagnostics{}
} else {
// ttl_seconds_after_finished is not set, remove the resource from the state
d.SetId("")
return diag.Diagnostics{}
}
}
conn, err := meta.(KubeClientsets).MainClientset()
if err != nil {
@@ -173,6 +257,31 @@ func resourceKubernetesJobV1Update(ctx context.Context, d *schema.ResourceData,
return diag.FromErr(err)
}

// Attempt to get the Job
_, err = conn.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
if err != nil {
if apierrors.IsNotFound(err) {
// Job is missing; check TTL
ttlAttr := d.Get("spec.0.ttl_seconds_after_finished")
ttlStr, _ := ttlAttr.(string)
ttlInt, err := strconv.Atoi(ttlStr)
if err != nil {
ttlInt = 0
}

if ttlInt >= 0 {
// Job was deleted due to TTL; nothing to update
log.Printf("[INFO] Job %s not found but ttl_seconds_after_finished = %v; nothing to update", d.Id(), ttlInt)
return nil
}

// Job was deleted unexpectedly; return an error
return diag.Errorf("Job %s not found; cannot update because it has been deleted", d.Id())
}
return diag.Errorf("Error retrieving Job: %s", err)
}

// Proceed with the update as usual
ops := patchMetadata("metadata.0.", "/metadata/", d)

if d.HasChange("spec") {
@@ -204,7 +313,6 @@ func resourceKubernetesJobV1Update(ctx context.Context, d *schema.ResourceData,
}
return resourceKubernetesJobV1Read(ctx, d, meta)
}

func resourceKubernetesJobV1Delete(ctx context.Context, d *schema.ResourceData, meta interface{}) diag.Diagnostics {
conn, err := meta.(KubeClientsets).MainClientset()
if err != nil {
126 changes: 126 additions & 0 deletions kubernetes/resource_kubernetes_job_v1_test.go
@@ -237,6 +237,82 @@ func TestAccKubernetesJobV1_ttl_seconds_after_finished(t *testing.T) {
})
}

func TestAccKubernetesJobV1_customizeDiff_ttlZero(t *testing.T) {
var conf batchv1.Job
name := fmt.Sprintf("tf-acc-test-%s", acctest.RandString(10))
imageName := busyboxImage
resourceName := "kubernetes_job_v1.test"

resource.ParallelTest(t, resource.TestCase{
PreCheck: func() {
testAccPreCheck(t)
skipIfClusterVersionLessThan(t, "1.21.0")
},
ProviderFactories: testAccProviderFactories,
Steps: []resource.TestStep{
// Step 1: Create the Job
{
Config: testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName),
Check: resource.ComposeAggregateTestCheckFunc(
testAccCheckKubernetesJobV1Exists(resourceName, &conf),
resource.TestCheckResourceAttr(resourceName, "spec.0.ttl_seconds_after_finished", "0"),
),
},
// Step 2: Wait for the Job to complete and be deleted
{
PreConfig: func() {
time.Sleep(70 * time.Second)
},
Config: testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName),
PlanOnly: true,
ExpectNonEmptyPlan: false,
},
},
})
}

func TestAccKubernetesJobV1_updateTTLFromZero(t *testing.T) {
var conf batchv1.Job
name := fmt.Sprintf("tf-acc-test-%s", acctest.RandString(10))
imageName := busyboxImage
resourceName := "kubernetes_job_v1.test"

resource.ParallelTest(t, resource.TestCase{
PreCheck: func() {
testAccPreCheck(t)
skipIfClusterVersionLessThan(t, "1.21.0")
},
ProviderFactories: testAccProviderFactories,
Steps: []resource.TestStep{
// Step 1: Create the Job with ttl_seconds_after_finished = 0
{
Config: testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName),
Check: resource.ComposeAggregateTestCheckFunc(
testAccCheckKubernetesJobV1Exists(resourceName, &conf),
resource.TestCheckResourceAttr(resourceName, "spec.0.ttl_seconds_after_finished", "0"),
),
},
// Step 2: Wait for the Job to complete and be deleted
{
PreConfig: func() {
time.Sleep(70 * time.Second)
},
Config: testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName),
PlanOnly: true,
ExpectNonEmptyPlan: false,
},
// Step 3: Update the Job to ttl_seconds_after_finished = 5
{
Config: testAccKubernetesJobV1Config_customizeDiff_ttlFive(name, imageName),
Check: resource.ComposeAggregateTestCheckFunc(
testAccCheckKubernetesJobV1Exists(resourceName, &conf),
resource.TestCheckResourceAttr(resourceName, "spec.0.ttl_seconds_after_finished", "5"),
),
},
},
})
}

func testAccCheckJobV1Waited(minDuration time.Duration) func(*terraform.State) error {
// NOTE this works because this function is called when setting up the test
// and the function it returns is called after the resource has been created
@@ -516,3 +592,53 @@ func testAccKubernetesJobV1Config_modified(name, imageName string) string {
wait_for_completion = false
}`, name, imageName)
}

func testAccKubernetesJobV1Config_customizeDiff_ttlZero(name, imageName string) string {
Review comment (Contributor):
When applying the Terraform config manually, it works as expected: the apply goes through with no diff.

I did, however, attempt to update ttl_seconds_after_finished from 0 to 5 and got the following error:

(base) ┌─(~/Dev/Scratch/ttl_test)────────────────────────────────────(mau@mau-JKDT676NCP:s017)─┐
└─(10:16:32)──> tfa                                                       ──(Thu,Oct10)─┘
kubernetes_job_v1.test: Refreshing state... [id=default/ttl-test]

Terraform used the selected providers to generate the following execution plan. Resource
actions are indicated with the following symbols:
  ~ update in-place

Terraform will perform the following actions:

  # kubernetes_job_v1.test will be updated in-place
  ~ resource "kubernetes_job_v1" "test" {
        id                  = "default/ttl-test"
        # (1 unchanged attribute hidden)

      ~ spec {
          ~ ttl_seconds_after_finished = "0" -> "5"
            # (8 unchanged attributes hidden)

            # (2 unchanged blocks hidden)
        }

        # (1 unchanged block hidden)
    }

Plan: 0 to add, 1 to change, 0 to destroy.
kubernetes_job_v1.test: Modifying... [id=default/ttl-test]
╷
│ Error: Failed to update Job! API error: jobs.batch "ttl-test" not found
│ 
│   with kubernetes_job_v1.test,
│   on main.tf line 1, in resource "kubernetes_job_v1" "test":
│    1: resource "kubernetes_job_v1" "test" {

From my understanding, we want to prevent this error from occurring when the TTL is set to 0, and that has been achieved. But in doing so, we are now unable to update a Job that already exists in state when it has a TTL of 0.

We'll want to consider how to solve this: as it stands, users would have to destroy and recreate every Job with ttl_seconds_after_finished = 0 in order to apply an update to a Job already in state.

Reply (Contributor, PR author):
Correct. Initially the issue was that when ttl_seconds_after_finished = 0, the Job is deleted as soon as it finishes, and we don't want Terraform to recreate it on the next apply.

I modified the update function: first I attempt to get the Job; if a NotFound error occurs, I check the previous ttl_seconds_after_finished and whether it is 0. If so, I set the resource ID to "" to remove it from state, and my understanding was that Terraform would then recreate it.

But when attempting that solution, I get the error `produced an unexpected new value: Root object was present, but now absent`. I believe that's because, during an update, Terraform expects the resource to remain in state unless it is explicitly destroyed by the config.

I will give it some more thought today. That said, do you have another idea in mind? Is this TTL approach viable, given that solving one issue surfaces another edge case?
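For reference, here is a rough sketch of that attempted approach (paraphrased, not the code in this PR; conn, namespace, name, and d are assumed to be the values already in scope inside resourceKubernetesJobV1Update):

```go
// Hypothetical sketch of the abandoned approach: drop the resource from state
// inside Update when the Job has already been removed by its TTL controller.
_, err = conn.BatchV1().Jobs(namespace).Get(ctx, name, metav1.GetOptions{})
if apierrors.IsNotFound(err) {
	ttlStr, _ := d.Get("spec.0.ttl_seconds_after_finished").(string)
	if ttlInt, convErr := strconv.Atoi(ttlStr); convErr == nil && ttlInt == 0 {
		// Clearing the ID during Update is what produces
		// "Root object was present, but now absent":
		// Terraform expects Update to keep the resource in state.
		d.SetId("")
		return nil
	}
}
```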

return fmt.Sprintf(`
resource "kubernetes_job_v1" "test" {
metadata {
name = "%s"
}
spec {
ttl_seconds_after_finished = 0
template {
metadata {}
spec {
container {
name = "wait-test"
image = "%s"
command = ["sleep", "60"]
}
restart_policy = "Never"
}
}
}
wait_for_completion = false
}
`, name, imageName)
}

func testAccKubernetesJobV1Config_customizeDiff_ttlFive(name, imageName string) string {
return fmt.Sprintf(`
resource "kubernetes_job_v1" "test" {
metadata {
name = "%s"
}
spec {
ttl_seconds_after_finished = 5
template {
metadata {}
spec {
container {
name = "wait-test"
image = "%s"
command = ["sleep", "60"]
}
restart_policy = "Never"
}
}
}
wait_for_completion = false
}
`, name, imageName)
}