From 2216d3ca8d65c5b0c85e297a3eb351a05db3a4d2 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Sat, 2 May 2020 14:06:18 -0700 Subject: [PATCH] Add health start interval This adds an additional interval to be used by healthchecks during the start period. Typically when a container is just starting you want to check if it is ready more quickly than a typical healthcheck might run. Without this users have to balance between running healthchecks too frequently vs taking a very long time to mark a container as healthy for the first time. Signed-off-by: Brian Goff Signed-off-by: Sebastiaan van Stijn --- .../router/container/container_routes.go | 8 +++ api/swagger.yaml | 6 ++ api/types/container/config.go | 7 +- client/container_create.go | 3 + daemon/commit.go | 3 + daemon/health.go | 25 +++++++- docs/api/version-history.md | 2 + integration/container/health_test.go | 64 +++++++++++++++++++ 8 files changed, 112 insertions(+), 6 deletions(-) diff --git a/api/server/router/container/container_routes.go b/api/server/router/container/container_routes.go index c6458cfa8eb89..2e551e2e6e91e 100644 --- a/api/server/router/container/container_routes.go +++ b/api/server/router/container/container_routes.go @@ -541,6 +541,14 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo bo.CreateMountpoint = false } } + + } + + if hostConfig != nil && versions.LessThan(version, "1.44") { + if config.Healthcheck != nil { + // StartInterval was added in API 1.44 + config.Healthcheck.StartInterval = 0 + } } if hostConfig != nil && versions.GreaterThanOrEqualTo(version, "1.42") { diff --git a/api/swagger.yaml b/api/swagger.yaml index 6894a29cfec47..0ad0b1c076cc5 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -804,6 +804,12 @@ definitions: 1000000 (1 ms). 0 means inherit. type: "integer" format: "int64" + StartInterval: + description: | + The time to wait between checks in nanoseconds during the start period. + It should be 0 or at least 1000000 (1 ms). 
0 means inherit. + type: "integer" + format: "int64" Health: description: | diff --git a/api/types/container/config.go b/api/types/container/config.go index 077583e66c1fc..8776dfbf36d44 100644 --- a/api/types/container/config.go +++ b/api/types/container/config.go @@ -44,9 +44,10 @@ type HealthConfig struct { Test []string `json:",omitempty"` // Zero means to inherit. Durations are expressed as integer nanoseconds. - Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks. - Timeout time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung. - StartPeriod time.Duration `json:",omitempty"` // The start period for the container to initialize before the retries starts to count down. + Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks. + Timeout time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung. + StartPeriod time.Duration `json:",omitempty"` // The start period for the container to initialize before the retries starts to count down. + StartInterval time.Duration `json:",omitempty"` // The interval to attempt healthchecks at during the start period // Retries is the number of consecutive failures needed to consider a container as unhealthy. // Zero means inherit. 
diff --git a/client/container_create.go b/client/container_create.go index 193a2bb56264c..14a2127d883c7 100644 --- a/client/container_create.go +++ b/client/container_create.go @@ -29,6 +29,9 @@ func (cli *Client) ContainerCreate(ctx context.Context, config *container.Config if err := cli.NewVersionError("1.41", "specify container image platform"); platform != nil && err != nil { return response, err } + if err := cli.NewVersionError("1.44", "specify health-check start interval"); config != nil && config.Healthcheck != nil && config.Healthcheck.StartInterval != 0 && err != nil { + return response, err + } if hostConfig != nil { if versions.LessThan(cli.ClientVersion(), "1.25") { diff --git a/daemon/commit.go b/daemon/commit.go index a442cd021e3ef..5911601280d97 100644 --- a/daemon/commit.go +++ b/daemon/commit.go @@ -92,6 +92,9 @@ func merge(userConf, imageConf *containertypes.Config) error { if userConf.Healthcheck.StartPeriod == 0 { userConf.Healthcheck.StartPeriod = imageConf.Healthcheck.StartPeriod } + if userConf.Healthcheck.StartInterval == 0 { + userConf.Healthcheck.StartInterval = imageConf.Healthcheck.StartInterval + } if userConf.Healthcheck.Retries == 0 { userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries } diff --git a/daemon/health.go b/daemon/health.go index 914118fefce57..6b8effb119866 100644 --- a/daemon/health.go +++ b/daemon/health.go @@ -248,13 +248,31 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch // There is never more than one monitor thread running per container at a time. 
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) + startInterval := timeoutWithDefault(c.Config.Healthcheck.StartInterval, defaultProbeInterval) + startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod) - intervalTimer := time.NewTimer(probeInterval) + c.Lock() + started := c.State.StartedAt + c.Unlock() + + getInterval := func() time.Duration { + if time.Since(started) >= startPeriod { + return probeInterval + } + c.Lock() + status := c.Health.Health.Status + c.Unlock() + + if status == types.Starting { + return startInterval + } + return probeInterval + } + + intervalTimer := time.NewTimer(getInterval()) defer intervalTimer.Stop() for { - intervalTimer.Reset(probeInterval) - select { case <-stop: log.G(context.TODO()).Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID) @@ -296,6 +314,7 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) cancelProbe() } } + intervalTimer.Reset(getInterval()) } } diff --git a/docs/api/version-history.md b/docs/api/version-history.md index 16bee52b8df16..704d6b524cdec 100644 --- a/docs/api/version-history.md +++ b/docs/api/version-history.md @@ -24,6 +24,8 @@ keywords: "API, Docker, rcli, REST, documentation" with runtimes which support the feature. `POST /containers/create`, `GET /containers/{id}/json`, and `GET /containers/json` now supports `BindOptions.ReadOnlyNonRecursive` and `BindOptions.ReadOnlyForceRecursive` to customize the behavior. +* `POST /containers/create` now accepts a `HealthConfig.StartInterval` to set the + interval for health checks during the start period. 
## v1.43 API changes diff --git a/integration/container/health_test.go b/integration/container/health_test.go index 9e28a58066123..76414df0cb37d 100644 --- a/integration/container/health_test.go +++ b/integration/container/health_test.go @@ -111,6 +111,70 @@ func TestHealthCheckProcessKilled(t *testing.T) { poll.WaitOn(t, pollForHealthCheckLog(ctx, apiClient, cID, "Health check exceeded timeout (50ms): logs logs logs\n")) } +func TestHealthStartInterval(t *testing.T) { + skip.If(t, testEnv.DaemonInfo.OSType == "windows", "The shell commands used in the test healthcheck do not work on Windows") + defer setupTest(t)() + ctx := context.Background() + client := testEnv.APIClient() + + // Note: Windows is much slower than linux so this uses longer intervals/timeouts + id := container.Run(ctx, t, client, func(c *container.TestContainerConfig) { + c.Config.Healthcheck = &containertypes.HealthConfig{ + Test: []string{"CMD-SHELL", `count="$(cat /tmp/health)"; if [ -z "${count}" ]; then let count=0; fi; let count=${count}+1; echo -n ${count} | tee /tmp/health; if [ ${count} -lt 3 ]; then exit 1; fi`}, + Interval: 30 * time.Second, + StartInterval: time.Second, + StartPeriod: 30 * time.Second, + } + }) + + ctxPoll, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + dl, _ := ctxPoll.Deadline() + + poll.WaitOn(t, func(log poll.LogT) poll.Result { + if ctxPoll.Err() != nil { + return poll.Error(ctxPoll.Err()) + } + inspect, err := client.ContainerInspect(ctxPoll, id) + if err != nil { + return poll.Error(err) + } + if inspect.State.Health.Status != "healthy" { + if len(inspect.State.Health.Log) > 0 { + t.Log(inspect.State.Health.Log[len(inspect.State.Health.Log)-1]) + } + return poll.Continue("waiting on container to be ready") + } + return poll.Success() + }, poll.WithDelay(100*time.Millisecond), poll.WithTimeout(time.Until(dl))) + cancel() + + ctxPoll, cancel = context.WithTimeout(ctx, 2*time.Minute) + defer cancel() + dl, _ = ctxPoll.Deadline() + + 
poll.WaitOn(t, func(log poll.LogT) poll.Result { + inspect, err := client.ContainerInspect(ctxPoll, id) + if err != nil { + return poll.Error(err) + } + + hLen := len(inspect.State.Health.Log) + if hLen < 2 { + return poll.Continue("waiting for more healthcheck results") + } + + h1 := inspect.State.Health.Log[hLen-1] + h2 := inspect.State.Health.Log[hLen-2] + if h1.Start.Sub(h2.Start) >= inspect.Config.Healthcheck.Interval { + return poll.Success() + } + t.Log(h1.Start.Sub(h2.Start)) + return poll.Continue("waiting for health check interval to switch from the start interval") + }, poll.WithDelay(time.Second), poll.WithTimeout(time.Until(dl))) +} + func pollForHealthCheckLog(ctx context.Context, client client.APIClient, containerID string, expected string) func(log poll.LogT) poll.Result { return func(log poll.LogT) poll.Result { inspect, err := client.ContainerInspect(ctx, containerID)