Skip to content

Commit

Permalink
[JENKINS-73224] Do not try to create a pod which already exists (#1561)
Browse files Browse the repository at this point in the history
  • Loading branch information
amuniz authored May 29, 2024
1 parent 306198d commit c06f753
Show file tree
Hide file tree
Showing 9 changed files with 183 additions and 50 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@
import java.util.List;
import java.util.Map;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.stream.Collectors;
Expand All @@ -68,7 +67,7 @@ public class KubernetesLauncher extends JNLPLauncher {

private static final Logger LOGGER = Logger.getLogger(KubernetesLauncher.class.getName());

private final AtomicBoolean launched = new AtomicBoolean(false);
private volatile boolean launched = false;

/**
* Provisioning exception if any.
Expand All @@ -87,7 +86,7 @@ public KubernetesLauncher() {

@Override
public boolean isLaunchSupported() {
return !launched.get();
return !launched;
}

@Override
Expand All @@ -104,7 +103,7 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) {
if (node == null) {
throw new IllegalStateException("Node has been removed, cannot launch " + computer.getName());
}
if (launched.get()) {
if (launched) {
LOGGER.log(INFO, "Agent has already been launched, activating: {0}", node.getNodeName());
computer.setAcceptingTasks(true);
return;
Expand All @@ -129,51 +128,71 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) {
.orElse(null);
node.setNamespace(namespace);

LOGGER.log(FINE, () -> "Creating Pod: " + cloudName + " " + namespace + "/" + podName);
try {
pod = client.pods().inNamespace(namespace).create(pod);
} catch (KubernetesClientException e) {
Metrics.metricRegistry().counter(MetricNames.CREATION_FAILED).inc();
int httpCode = e.getCode();
if (400 <= httpCode && httpCode < 500) { // 4xx
if (httpCode == 403 && e.getMessage().contains("is forbidden: exceeded quota")) {
node.getRunListener()
.getLogger()
.printf(
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota exceeded. %n%s%nRetrying...%n%n",
cloudName, namespace, pod.getMetadata().getName(), e.getMessage());
} else if (httpCode == 409
&& e.getMessage().contains("Operation cannot be fulfilled on resourcequotas")) {
// See: https://github.com/kubernetes/kubernetes/issues/67761 ; A retry usually works.
node.getRunListener()
.getLogger()
.printf(
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota update conflict. %n%s%nRetrying...%n%n",
cloudName, namespace, pod.getMetadata().getName(), e.getMessage());
// if the controller was interrupted after creating the pod but before it connected back, then
// the pod might already exist and the creating logic must be skipped.
Pod existingPod =
client.pods().inNamespace(namespace).withName(podName).get();
if (existingPod == null) {
LOGGER.log(FINE, () -> "Creating Pod: " + cloudName + " " + namespace + "/" + podName);
try {
pod = client.pods().inNamespace(namespace).create(pod);
} catch (KubernetesClientException e) {
Metrics.metricRegistry()
.counter(MetricNames.CREATION_FAILED)
.inc();
int httpCode = e.getCode();
if (400 <= httpCode && httpCode < 500) { // 4xx
if (httpCode == 403 && e.getMessage().contains("is forbidden: exceeded quota")) {
node.getRunListener()
.getLogger()
.printf(
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota exceeded. %n%s%nRetrying...%n%n",
cloudName,
namespace,
pod.getMetadata().getName(),
e.getMessage());
} else if (httpCode == 409
&& e.getMessage().contains("Operation cannot be fulfilled on resourcequotas")) {
// See: https://github.com/kubernetes/kubernetes/issues/67761 ; A retry usually works.
node.getRunListener()
.getLogger()
.printf(
"WARNING: Unable to create pod: %s %s/%s because kubernetes resource quota update conflict. %n%s%nRetrying...%n%n",
cloudName,
namespace,
pod.getMetadata().getName(),
e.getMessage());
} else {
node.getRunListener()
.getLogger()
.printf(
"ERROR: Unable to create pod %s %s/%s.%n%s%n",
cloudName,
namespace,
pod.getMetadata().getName(),
e.getMessage());
PodUtils.cancelQueueItemFor(pod, e.getMessage());
}
} else if (500 <= httpCode && httpCode < 600) { // 5xx
LOGGER.log(FINE, "Kubernetes returned HTTP code {0} {1}. Retrying...", new Object[] {
e.getCode(), e.getStatus()
});
} else {
node.getRunListener()
.getLogger()
.printf(
"ERROR: Unable to create pod %s %s/%s.%n%s%n",
cloudName, namespace, pod.getMetadata().getName(), e.getMessage());
PodUtils.cancelQueueItemFor(pod, e.getMessage());
LOGGER.log(WARNING, "Kubernetes returned unhandled HTTP code {0} {1}", new Object[] {
e.getCode(), e.getStatus()
});
}
} else if (500 <= httpCode && httpCode < 600) { // 5xx
LOGGER.log(FINE, "Kubernetes returned HTTP code {0} {1}. Retrying...", new Object[] {
e.getCode(), e.getStatus()
});
} else {
LOGGER.log(WARNING, "Kubernetes returned unhandled HTTP code {0} {1}", new Object[] {
e.getCode(), e.getStatus()
});
throw e;
}
throw e;
LOGGER.log(INFO, () -> "Created Pod: " + cloudName + " " + namespace + "/" + podName);
listener.getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName);
Metrics.metricRegistry().counter(MetricNames.PODS_CREATED).inc();

node.getRunListener().getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName);
} else {
LOGGER.log(INFO, () -> "Pod already exists: " + cloudName + " " + namespace + "/" + podName);
listener.getLogger().printf("Pod already exists: %s %s/%s%n", cloudName, namespace, podName);
}
LOGGER.log(INFO, () -> "Created Pod: " + cloudName + " " + namespace + "/" + podName);
listener.getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName);
Metrics.metricRegistry().counter(MetricNames.PODS_CREATED).inc();

node.getRunListener().getLogger().printf("Created Pod: %s %s/%s%n", cloudName, namespace, podName);
kubernetesComputer.setLaunching(true);

ObjectMeta podMetadata = pod.getMetadata();
Expand Down Expand Up @@ -268,7 +287,7 @@ public synchronized void launch(SlaveComputer computer, TaskListener listener) {
}

computer.setAcceptingTasks(true);
launched.set(true);
launched = true;
try {
// We need to persist the "launched" setting...
node.save();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,6 @@ public abstract class AbstractKubernetesPipelineRJRTest {
}
}

protected RunId runId;

private SetupCloud setup;

public AbstractKubernetesPipelineRJRTest(SetupCloud setup) {
Expand All @@ -52,7 +50,10 @@ public static void isKubernetesConfigured() throws Exception {
public void setUp() throws Throwable {
rjr.startJenkins();
rjr.runRemotely(setup);
runId = rjr.runRemotely(new CreateWorkflowJobThenScheduleRun(
}

protected RunId createWorkflowJobThenScheduleRun() throws Throwable {
return rjr.runRemotely(new CreateWorkflowJobThenScheduleRun(
KubernetesTestUtil.loadPipelineScript(getClass(), name.getMethodName() + ".groovy")));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.net.UnknownHostException;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildStatusSuccess;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.RunId;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.SetupCloud;
import org.junit.Test;

Expand All @@ -29,6 +30,7 @@ public KubernetesDeclarativeAgentRJRTest() throws UnknownHostException {

@Test
public void declarative() throws Throwable {
RunId runId = createWorkflowJobThenScheduleRun();
rjr.runRemotely(new AssertBuildStatusSuccess(runId));
}
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,14 @@
package org.csanchez.jenkins.plugins.kubernetes.pipeline;

import io.fabric8.kubernetes.api.model.NodeBuilder;
import io.fabric8.kubernetes.client.KubernetesClient;
import io.fabric8.kubernetes.client.KubernetesClientBuilder;
import java.net.UnknownHostException;
import org.csanchez.jenkins.plugins.kubernetes.KubernetesTestUtil;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildLogMessage;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildStatusSuccess;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.CreateWorkflowJobThenScheduleTask;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.RunId;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.SetupCloud;
import org.junit.Test;

Expand All @@ -12,6 +19,44 @@ public KubernetesPipelineRJRTest() throws UnknownHostException {

@Test
public void basicPipeline() throws Throwable {
RunId runId = createWorkflowJobThenScheduleRun();
rjr.runRemotely(new AssertBuildStatusSuccess(runId));
}

@Test
public void restartDuringPodLaunch() throws Throwable {
// try to run something on a pod which is not schedulable (disktype=special)
RunId build = rjr.runRemotely(new CreateWorkflowJobThenScheduleTask(
KubernetesTestUtil.loadPipelineScript(getClass(), name.getMethodName() + ".groovy")));
// the pod is created, but not connected yet
rjr.runRemotely(new AssertBuildLogMessage("Created Pod", build));
// restart
rjr.stopJenkins();
rjr.startJenkins();
// update k8s to make a node suitable to schedule (add disktype=special to the node)
System.out.println("Adding label to node....");
try (KubernetesClient client = new KubernetesClientBuilder().build()) {
String nodeName =
client.nodes().list().getItems().get(0).getMetadata().getName();
client.nodes().withName(nodeName).edit(n -> new NodeBuilder(n)
.editMetadata()
.addToLabels("disktype", "special")
.endMetadata()
.build());

// pod connects back and the build finishes correctly
rjr.runRemotely(new AssertBuildStatusSuccess(build));
} finally {
// clean up
try (KubernetesClient client = new KubernetesClientBuilder().build()) {
String nodeName =
client.nodes().list().getItems().get(0).getMetadata().getName();
client.nodes().withName(nodeName).edit(n -> new NodeBuilder(n)
.editMetadata()
.removeFromLabels("disktype")
.endMetadata()
.build());
}
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import java.net.UnknownHostException;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.AssertBuildStatusSuccess;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.RunId;
import org.csanchez.jenkins.plugins.kubernetes.pipeline.steps.SetupCloud;
import org.junit.Test;

Expand All @@ -13,6 +14,7 @@ public KubernetesPipelineWebsocketRJRTest() throws UnknownHostException {

@Test
public void basicPipeline() throws Throwable {
RunId runId = createWorkflowJobThenScheduleRun();
rjr.runRemotely(new AssertBuildStatusSuccess(runId));
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
package org.csanchez.jenkins.plugins.kubernetes.pipeline.steps;

import org.jenkinsci.plugins.workflow.job.WorkflowJob;
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.RealJenkinsRule;

public class AssertBuildLogMessage implements RealJenkinsRule.Step {

private final String message;
private final RunId runId;

public AssertBuildLogMessage(String message, RunId runId) {
this.message = message;
this.runId = runId;
}

@Override
public void run(JenkinsRule r) throws Throwable {
WorkflowJob p = r.jenkins.getItemByFullName(runId.name, WorkflowJob.class);
WorkflowRun b = p.getBuildByNumber(runId.number);
r.waitForMessage(message, b);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package org.csanchez.jenkins.plugins.kubernetes.pipeline.steps;

import org.jenkinsci.plugins.workflow.cps.CpsFlowDefinition;
import org.jenkinsci.plugins.workflow.job.WorkflowJob;
import org.jenkinsci.plugins.workflow.job.WorkflowRun;
import org.jvnet.hudson.test.JenkinsRule;
import org.jvnet.hudson.test.RealJenkinsRule;

/**
* Creates a workflow job using the specified script, then schedules it and returns a reference to the run.
*/
public class CreateWorkflowJobThenScheduleTask implements RealJenkinsRule.Step2<RunId> {
private String script;

public CreateWorkflowJobThenScheduleTask(String script) {
this.script = script;
}

@Override
public RunId run(JenkinsRule r) throws Throwable {
WorkflowJob project = r.createProject(WorkflowJob.class);
project.setDefinition(new CpsFlowDefinition(script, true));
project.save();
System.out.println("Scheduling build...");
WorkflowRun b = project.scheduleBuild2(0).getStartCondition().get();
System.out.println("Build scheduled...");
return new RunId(project.getFullName(), b.number);
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
podTemplate(yaml: '''
apiVersion: v1
kind: Pod
spec:
nodeSelector:
disktype: special
''') {
node(POD_LABEL) {
sh 'true'
}
}
2 changes: 1 addition & 1 deletion test-in-k8s.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ rules:
verbs: ["watch"]
- apiGroups: [""]
resources: ["nodes"]
verbs: ["list"]
verbs: ["list","get","patch","update"] # KubernetesPipelineRJRTest.restartDuringPodLaunch
- apiGroups: [""]
resources: ["secrets"]
verbs: ["create","delete","get","list","patch","update","watch"]
Expand Down

0 comments on commit c06f753

Please sign in to comment.