From 0c5da059cb818ddbc8420ae9f39114c285923c69 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 16 Oct 2024 14:30:45 -0400 Subject: [PATCH] docker: use streaming stats collection to correct CPU stats In #23966 we switched to the official Docker SDK for the `docker` driver. In the process we refactored code around stats collection to use the "one shot" version of stats. Unfortunately this "one shot" stats collection does not include the `PreCPU` stats, which are the stats from the previous read. This breaks the calculation we use to determine CPU ticks, because now we're subtracting 0 from the current value to get the delta. Switch back to using the streaming stats collection. Fixes: https://github.com/hashicorp/nomad/issues/24224 --- .changelog/24229.txt | 3 +++ drivers/docker/stats.go | 24 ++++++++++++++---------- 2 files changed, 17 insertions(+), 10 deletions(-) create mode 100644 .changelog/24229.txt diff --git a/.changelog/24229.txt b/.changelog/24229.txt new file mode 100644 index 00000000000..c4ff1256abc --- /dev/null +++ b/.changelog/24229.txt @@ -0,0 +1,3 @@ +```release-note:bug +docker: Fixed a bug where task CPU stats were reported incorrectly +``` diff --git a/drivers/docker/stats.go b/drivers/docker/stats.go index cd64e966ee9..7b9e671a345 100644 --- a/drivers/docker/stats.go +++ b/drivers/docker/stats.go @@ -97,22 +97,26 @@ func (h *taskHandle) collectStats(ctx context.Context, destCh *usageSender, inte timer, cancel := helper.NewSafeTimer(interval) defer cancel() + // we need to use the streaming stats API here because our calculation for + // CPU usage depends on having the values from the previous read, which are + // not available in one-shot + statsReader, err := h.dockerClient.ContainerStats(ctx, h.containerID, true) + if err != nil && err != io.EOF { + h.logger.Debug("error collecting stats from container", "error", err) + return + } + defer statsReader.Body.Close() + collectOnce := func() { defer timer.Reset(interval) - statsReader, err := h.dockerClient.ContainerStatsOneShot(ctx, h.containerID) + var stats *containerapi.Stats + err := json.NewDecoder(statsReader.Body).Decode(&stats) if err != nil && err != io.EOF { - h.logger.Debug("error collecting stats from container", "error", err) - return - } - defer statsReader.Body.Close() - - var stats containerapi.Stats - if err := json.NewDecoder(statsReader.Body).Decode(&stats); err != nil { - h.logger.Error("error decoding stats data for container", "error", err) + h.logger.Debug("error decoding stats data from container", "error", err) return } - resourceUsage := util.DockerStatsToTaskResourceUsage(&stats, compute) + resourceUsage := util.DockerStatsToTaskResourceUsage(stats, compute) destCh.send(resourceUsage) }