From df3dc5f6df1d1ea82340b20e097dbc2fbe491a9e Mon Sep 17 00:00:00 2001
From: Thorarinn Sigurdsson <thorarinnsigurdsson@gmail.com>
Date: Thu, 7 Mar 2019 14:12:30 +0100
Subject: [PATCH] fix: increase init delay for liveness probe

The initial delay in the liveness probe config for k8s health checks was
too short. This was due to the mistaken assumption that k8s would only
start running the liveness probe after the readiness probe succeeded.

For services that take a long time to start up, the old config would
sometimes lead to the liveness probe failing and thus the container
being restarted - repeatedly, in some situations.

The initial delay and period of the readiness probe were also reduced
(so it starts sooner and runs more frequently), and its failure
threshold was increased to compensate.

Later, we should aim to make initialDelaySeconds configurable for the
liveness probe.
---
 .../plugins/kubernetes/container/deployment.ts    | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/garden-service/src/plugins/kubernetes/container/deployment.ts b/garden-service/src/plugins/kubernetes/container/deployment.ts
index 8a55e235a09..10a261778c0 100644
--- a/garden-service/src/plugins/kubernetes/container/deployment.ts
+++ b/garden-service/src/plugins/kubernetes/container/deployment.ts
@@ -269,16 +269,23 @@ function deploymentConfig(service: Service, configuredReplicas: number, namespac
 
 function configureHealthCheck(container, spec): void {
 
+  const readinessPeriodSeconds = 1
+  const readinessFailureThreshold = 90
+
   container.readinessProbe = {
-    initialDelaySeconds: 10,
-    periodSeconds: 5,
+    initialDelaySeconds: 2,
+    periodSeconds: readinessPeriodSeconds,
     timeoutSeconds: 3,
     successThreshold: 2,
-    failureThreshold: 5,
+    failureThreshold: readinessFailureThreshold,
   }
 
+  /*
+   * We wait for the effective failure duration (period * threshold) of the readiness probe before starting the
+   * liveness probe.
+   */
   container.livenessProbe = {
-    initialDelaySeconds: 15,
+    initialDelaySeconds: readinessPeriodSeconds * readinessFailureThreshold,
     periodSeconds: 5,
     timeoutSeconds: 3,
     successThreshold: 1,