From 2216d3ca8d65c5b0c85e297a3eb351a05db3a4d2 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Sat, 2 May 2020 14:06:18 -0700 Subject: [PATCH] Add health start interval This adds an additional interval to be used by healthchecks during the start period. Typically when a container is just starting you want to check if it is ready more quickly than a typical healthcheck might run. Without this users have to balance between running healthchecks too frequently vs taking a very long time to mark a container as healthy for the first time. Signed-off-by: Brian Goff Signed-off-by: Sebastiaan van Stijn --- .../router/container/container_routes.go | 8 +++ api/swagger.yaml | 6 ++ api/types/container/config.go | 7 +- client/container_create.go | 3 + daemon/commit.go | 3 + daemon/health.go | 25 +++++++- docs/api/version-history.md | 2 + integration/container/health_test.go | 64 +++++++++++++++++++ 8 files changed, 112 insertions(+), 6 deletions(-) diff --git a/api/server/router/container/container_routes.go b/api/server/router/container/container_routes.go index c6458cfa8eb89..2e551e2e6e91e 100644 --- a/api/server/router/container/container_routes.go +++ b/api/server/router/container/container_routes.go @@ -541,6 +541,14 @@ func (s *containerRouter) postContainersCreate(ctx context.Context, w http.Respo bo.CreateMountpoint = false } } + + } + + if hostConfig != nil && versions.LessThan(version, "1.44") { + if config.Healthcheck != nil { + // StartInterval was added in API 1.44 + config.Healthcheck.StartInterval = 0 + } } if hostConfig != nil && versions.GreaterThanOrEqualTo(version, "1.42") { diff --git a/api/swagger.yaml b/api/swagger.yaml index 6894a29cfec47..0ad0b1c076cc5 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -804,6 +804,12 @@ definitions: 1000000 (1 ms). 0 means inherit. type: "integer" format: "int64" + StartInterval: + description: | + The time to wait between checks in nanoseconds during the start period. + It should be 0 or at least 1000000 (1 ms). 
0 means inherit. + type: "integer" + format: "int64" Health: description: | diff --git a/api/types/container/config.go b/api/types/container/config.go index 077583e66c1fc..8776dfbf36d44 100644 --- a/api/types/container/config.go +++ b/api/types/container/config.go @@ -44,9 +44,10 @@ type HealthConfig struct { Test []string `json:",omitempty"` // Zero means to inherit. Durations are expressed as integer nanoseconds. - Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks. - Timeout time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung. - StartPeriod time.Duration `json:",omitempty"` // The start period for the container to initialize before the retries starts to count down. + Interval time.Duration `json:",omitempty"` // Interval is the time to wait between checks. + Timeout time.Duration `json:",omitempty"` // Timeout is the time to wait before considering the check to have hung. + StartPeriod time.Duration `json:",omitempty"` // The start period for the container to initialize before the retries starts to count down. + StartInterval time.Duration `json:",omitempty"` // The interval to attempt healthchecks at during the start period // Retries is the number of consecutive failures needed to consider a container as unhealthy. // Zero means inherit. 
diff --git a/client/container_create.go b/client/container_create.go index 193a2bb56264c..14a2127d883c7 100644 --- a/client/container_create.go +++ b/client/container_create.go @@ -29,6 +29,9 @@ func (cli *Client) ContainerCreate(ctx context.Context, config *container.Config if err := cli.NewVersionError("1.41", "specify container image platform"); platform != nil && err != nil { return response, err } + if err := cli.NewVersionError("1.44", "specify health-check start interval"); config != nil && config.Healthcheck != nil && config.Healthcheck.StartInterval != 0 && err != nil { + return response, err + } if hostConfig != nil { if versions.LessThan(cli.ClientVersion(), "1.25") { diff --git a/daemon/commit.go b/daemon/commit.go index a442cd021e3ef..5911601280d97 100644 --- a/daemon/commit.go +++ b/daemon/commit.go @@ -92,6 +92,9 @@ func merge(userConf, imageConf *containertypes.Config) error { if userConf.Healthcheck.StartPeriod == 0 { userConf.Healthcheck.StartPeriod = imageConf.Healthcheck.StartPeriod } + if userConf.Healthcheck.StartInterval == 0 { + userConf.Healthcheck.StartInterval = imageConf.Healthcheck.StartInterval + } if userConf.Healthcheck.Retries == 0 { userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries } diff --git a/daemon/health.go b/daemon/health.go index 914118fefce57..6b8effb119866 100644 --- a/daemon/health.go +++ b/daemon/health.go @@ -248,13 +248,31 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch // There is never more than one monitor thread running per container at a time. 
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) + startInterval := timeoutWithDefault(c.Config.Healthcheck.StartInterval, defaultProbeInterval) + startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod) - intervalTimer := time.NewTimer(probeInterval) + c.Lock() + started := c.State.StartedAt + c.Unlock() + + getInterval := func() time.Duration { + if time.Since(started) >= startPeriod { + return probeInterval + } + c.Lock() + status := c.Health.Health.Status + c.Unlock() + + if status == types.Starting { + return startInterval + } + return probeInterval + } + + intervalTimer := time.NewTimer(getInterval()) defer intervalTimer.Stop() for { - intervalTimer.Reset(probeInterval) - select { case <-stop: log.G(context.TODO()).Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID) @@ -296,6 +314,7 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) cancelProbe() } } + intervalTimer.Reset(getInterval()) } } diff --git a/docs/api/version-history.md b/docs/api/version-history.md index 16bee52b8df16..704d6b524cdec 100644 --- a/docs/api/version-history.md +++ b/docs/api/version-history.md @@ -24,6 +24,8 @@ keywords: "API, Docker, rcli, REST, documentation" with runtimes which support the feature. `POST /containers/create`, `GET /containers/{id}/json`, and `GET /containers/json` now supports `BindOptions.ReadOnlyNonRecursive` and `BindOptions.ReadOnlyForceRecursive` to customize the behavior. +* `POST /containers/create` now accepts a `HealthConfig.StartInterval` to set the + interval for health checks during the start period. 
## v1.43 API changes diff --git a/integration/container/health_test.go b/integration/container/health_test.go index 9e28a58066123..76414df0cb37d 100644 --- a/integration/container/health_test.go +++ b/integration/container/health_test.go @@ -111,6 +111,70 @@ func TestHealthCheckProcessKilled(t *testing.T) { poll.WaitOn(t, pollForHealthCheckLog(ctx, apiClient, cID, "Health check exceeded timeout (50ms): logs logs logs\n")) } +func TestHealthStartInterval(t *testing.T) { + skip.If(t, testEnv.DaemonInfo.OSType == "windows", "The shell commands used in the test healthcheck do not work on Windows") + defer setupTest(t)() + ctx := context.Background() + client := testEnv.APIClient() + + // Note: Windows is much slower than linux so this uses longer intervals/timeouts + id := container.Run(ctx, t, client, func(c *container.TestContainerConfig) { + c.Config.Healthcheck = &containertypes.HealthConfig{ + Test: []string{"CMD-SHELL", `count="$(cat /tmp/health)"; if [ -z "${count}" ]; then let count=0; fi; let count=${count}+1; echo -n ${count} | tee /tmp/health; if [ ${count} -lt 3 ]; then exit 1; fi`}, + Interval: 30 * time.Second, + StartInterval: time.Second, + StartPeriod: 30 * time.Second, + } + }) + + ctxPoll, cancel := context.WithTimeout(ctx, 30*time.Second) + defer cancel() + + dl, _ := ctxPoll.Deadline() + + poll.WaitOn(t, func(log poll.LogT) poll.Result { + if ctxPoll.Err() != nil { + return poll.Error(ctxPoll.Err()) + } + inspect, err := client.ContainerInspect(ctxPoll, id) + if err != nil { + return poll.Error(err) + } + if inspect.State.Health.Status != "healthy" { + if len(inspect.State.Health.Log) > 0 { + t.Log(inspect.State.Health.Log[len(inspect.State.Health.Log)-1]) + } + return poll.Continue("waiting on container to be ready") + } + return poll.Success() + }, poll.WithDelay(100*time.Millisecond), poll.WithTimeout(time.Until(dl))) + cancel() + + ctxPoll, cancel = context.WithTimeout(ctx, 2*time.Minute) + defer cancel() + dl, _ = ctxPoll.Deadline() + + 
poll.WaitOn(t, func(log poll.LogT) poll.Result { + inspect, err := client.ContainerInspect(ctxPoll, id) + if err != nil { + return poll.Error(err) + } + + hLen := len(inspect.State.Health.Log) + if hLen < 2 { + return poll.Continue("waiting for more healthcheck results") + } + + h1 := inspect.State.Health.Log[hLen-1] + h2 := inspect.State.Health.Log[hLen-2] + if h1.Start.Sub(h2.Start) >= inspect.Config.Healthcheck.Interval { + return poll.Success() + } + t.Log(h1.Start.Sub(h2.Start)) + return poll.Continue("waiting for health check interval to switch from the start interval") + }, poll.WithDelay(time.Second), poll.WithTimeout(time.Until(dl))) +} + func pollForHealthCheckLog(ctx context.Context, client client.APIClient, containerID string, expected string) func(log poll.LogT) poll.Result { return func(log poll.LogT) poll.Result { inspect, err := client.ContainerInspect(ctx, containerID)