From ddece758a46b0cea2e5b59ff8e7181f2066dd8b7 Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Fri, 16 Aug 2024 12:57:21 +0200 Subject: [PATCH] libpod: remove UpdateContainerStatus() There are two major problems with UpdateContainerStatus() First, it can deadlock when the the state json is to big as it tries to read stderr until EOF but it will never hit EOF as long as the runtime process is alive. This means if the runtime json is to big to git into the pipe buffer we deadlock ourselves. Second, the function modifies the container state struct and even adds and exit code to the db however when it is called from the stop() code path we will be unlocked here. While the first problem is easy to fix the second one not so much. And when we cannot update the state there is no point in reading the from runtime in the first place as such remove the function as it does more harm then good. And add some warnings the the functions that might be called unlocked. Fixes #22246 Signed-off-by: Paul Holzinger --- libpod/oci.go | 2 - libpod/oci_conmon_common.go | 105 +++++------------------------------- libpod/oci_missing.go | 5 -- 3 files changed, 12 insertions(+), 100 deletions(-) diff --git a/libpod/oci.go b/libpod/oci.go index a10056779cec..e0d7406339d9 100644 --- a/libpod/oci.go +++ b/libpod/oci.go @@ -29,8 +29,6 @@ type OCIRuntime interface { //nolint:interfacebloat // the given container if it is a restore and if restoreOptions.PrintStats // is true. In all other cases the returned int64 is 0. CreateContainer(ctr *Container, restoreOptions *ContainerCheckpointOptions) (int64, error) - // UpdateContainerStatus updates the status of the given container. - UpdateContainerStatus(ctr *Container) error // StartContainer starts the given container. StartContainer(ctr *Container) error // KillContainer sends the given signal to the given container. diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 80b34e58addc..39a2a0d62633 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -192,91 +192,6 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta return r.createOCIContainer(ctr, restoreOptions) } -// UpdateContainerStatus retrieves the current status of the container from the -// runtime. It updates the container's state but does not save it. -// If useRuntime is false, we will not directly hit runc to see the container's -// status, but will instead only check for the existence of the conmon exit file -// and update state to stopped if it exists. -func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error { - runtimeDir, err := util.GetRootlessRuntimeDir() - if err != nil { - return err - } - - // Store old state so we know if we were already stopped - oldState := ctr.state.State - - state := new(spec.State) - - cmd := exec.Command(r.path, "state", ctr.ID()) - cmd.Env = append(cmd.Env, fmt.Sprintf("XDG_RUNTIME_DIR=%s", runtimeDir)) - - outPipe, err := cmd.StdoutPipe() - if err != nil { - return fmt.Errorf("getting stdout pipe: %w", err) - } - errPipe, err := cmd.StderrPipe() - if err != nil { - return fmt.Errorf("getting stderr pipe: %w", err) - } - - err = cmd.Start() - if err != nil { - return fmt.Errorf("error launching container runtime: %w", err) - } - defer func() { - _ = cmd.Wait() - }() - - stderr, err := io.ReadAll(errPipe) - if err != nil { - return fmt.Errorf("reading stderr: %s: %w", ctr.ID(), err) - } - if strings.Contains(string(stderr), "does not exist") || strings.Contains(string(stderr), "No such file") { - if err := ctr.removeConmonFiles(); err != nil { - logrus.Debugf("unable to remove conmon files for container %s", ctr.ID()) - } - ctr.state.ExitCode = -1 - ctr.state.FinishedTime = time.Now() - ctr.state.State = define.ContainerStateExited - return ctr.runtime.state.AddContainerExitCode(ctr.ID(), ctr.state.ExitCode) - } - if err := errPipe.Close(); err != nil { - return err - } - - out, err := io.ReadAll(outPipe) - if err != nil { - return fmt.Errorf("reading stdout: %s: %w", ctr.ID(), err) - } - if err := json.NewDecoder(bytes.NewReader(out)).Decode(state); err != nil { - return fmt.Errorf("decoding container status for container %s: %w", ctr.ID(), err) - } - ctr.state.PID = state.Pid - - switch state.Status { - case "created": - ctr.state.State = define.ContainerStateCreated - case "paused": - ctr.state.State = define.ContainerStatePaused - case "running": - ctr.state.State = define.ContainerStateRunning - case "stopped": - ctr.state.State = define.ContainerStateStopped - default: - return fmt.Errorf("unrecognized status returned by runtime for container %s: %s: %w", - ctr.ID(), state.Status, define.ErrInternal) - } - - // Handle ContainerStateStopping - keep it unless the container - // transitioned to no longer running. - if oldState == define.ContainerStateStopping && (ctr.state.State == define.ContainerStatePaused || ctr.state.State == define.ContainerStateRunning) { - ctr.state.State = define.ContainerStateStopping - } - - return nil -} - // StartContainer starts the given container. // Sets time the container was started, but does not save it. func (r *ConmonOCIRuntime) StartContainer(ctr *Container) error { @@ -358,6 +273,8 @@ func (r *ConmonOCIRuntime) KillContainer(ctr *Container, signal uint, all bool) // If captureStderr is requested, OCI runtime STDERR will be captured as a // *bytes.buffer and returned; otherwise, it is set to os.Stderr. +// IMPORTANT: Thus function is called from an unlocked container state in +// the stop() code path so do not modify the state here. func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captureStderr bool) (*bytes.Buffer, error) { logrus.Debugf("Sending signal %d to container %s", signal, ctr.ID()) runtimeDir, err := util.GetRootlessRuntimeDir() @@ -381,15 +298,15 @@ func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captu stderr = stderrBuffer } if err := utils.ExecCmdWithStdStreams(os.Stdin, os.Stdout, stderr, env, r.path, args...); err != nil { - // Update container state - there's a chance we failed because - // the container exited in the meantime. - if err2 := r.UpdateContainerStatus(ctr); err2 != nil { - logrus.Infof("Error updating status for container %s: %v", ctr.ID(), err2) - } - if ctr.ensureState(define.ContainerStateStopped, define.ContainerStateExited) { - return stderrBuffer, fmt.Errorf("%w: %s", define.ErrCtrStateInvalid, ctr.state.State) + rErr := err + // quick check if ctr pid is still alive + if err := unix.Kill(ctr.state.PID, 0); err == unix.ESRCH { + // pid already dead so signal sending fails logically, set error to ErrCtrStateInvalid + // This is needed for the ProxySignals() function which already ignores the error to not cause flakes + // when the ctr process just exited. + rErr = define.ErrCtrStateInvalid } - return stderrBuffer, fmt.Errorf("sending signal to container %s: %w", ctr.ID(), err) + return stderrBuffer, fmt.Errorf("sending signal to container %s: %w", ctr.ID(), rErr) } return stderrBuffer, nil @@ -401,6 +318,8 @@ func (r *ConmonOCIRuntime) killContainer(ctr *Container, signal uint, all, captu // immediately kill with SIGKILL. // Does not set finished time for container, assumes you will run updateStatus // after to pull the exit code. +// IMPORTANT: Thus function is called from an unlocked container state in +// the stop() code path so do not modify the state here. func (r *ConmonOCIRuntime) StopContainer(ctr *Container, timeout uint, all bool) error { logrus.Debugf("Stopping container %s (PID %d)", ctr.ID(), ctr.state.PID) diff --git a/libpod/oci_missing.go b/libpod/oci_missing.go index 0ac6417f58ea..98eb91ef8db4 100644 --- a/libpod/oci_missing.go +++ b/libpod/oci_missing.go @@ -76,11 +76,6 @@ func (r *MissingRuntime) CreateContainer(ctr *Container, restoreOptions *Contain return 0, r.printError() } -// UpdateContainerStatus is not available as the runtime is missing -func (r *MissingRuntime) UpdateContainerStatus(ctr *Container) error { - return r.printError() -} - // StartContainer is not available as the runtime is missing func (r *MissingRuntime) StartContainer(ctr *Container) error { return r.printError()