From 9583b3d1c297021109081872c52302316ede15b1 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Thu, 13 Apr 2023 11:13:13 -0700 Subject: [PATCH] libct: move killing logic to container.Signal By default, the container has its own PID namespace, and killing (with SIGKILL) its init process from the parent PID namespace also kills all the other processes. Obviously, it does not work that way when the container is sharing its PID namespace with the host or another container, since init is no longer special (it's not PID 1). In this case, killing container's init will result in a bunch of other processes left running (and thus the inability to remove the cgroup). The solution to the above problem is killing all the container processes, not just init. The problem with the current implementation is, the killing logic is implemented in libcontainer's initProcess.wait, and thus only available to libcontainer users, but not the runc kill command (which uses nonChildProcess.kill and does not use wait at all). So, some workarounds exist: - func destroy(c *Container) calls signalAllProcesses; - runc kill implements -a flag. This code became very tangled over time. Let's simplify things by moving the killing all processes from initProcess.wait to container.Signal, and documents the new behavior. In essence, this also makes `runc kill` to automatically kill all container processes when the container does not have its own PID namespace. Document that as well. Signed-off-by: Kir Kolyshkin --- libcontainer/container_linux.go | 52 ++++++++++++++++----------- libcontainer/integration/exec_test.go | 4 +-- libcontainer/process_linux.go | 5 --- libcontainer/state_linux.go | 7 ---- man/runc-kill.8.md | 4 +-- 5 files changed, 36 insertions(+), 36 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 59262d3ab22..1fa7e2b60c1 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -358,9 +358,9 @@ func (c *Container) start(process *Process) (retErr error) { } // Signal sends a specified signal to container's init, or, if all is true, -// true, to all container's processes (as determined by container's cgroup). +// to all container's processes (as determined by container's cgroup). // -// Setting all=true is useful when s is SIGKILL and the container does not have +// Note all=true is implied when s is SIGKILL and the container does not have // its own PID namespace. In this scenario, the libcontainer user may be required // to implement a proper child reaper. func (c *Container) Signal(s os.Signal, all bool) error { @@ -382,22 +382,36 @@ func (c *Container) Signal(s os.Signal, all bool) error { } return c.ignoreCgroupError(signalAllProcesses(c.cgroupManager, sig)) } - // to avoid a PID reuse attack - if status == Running || status == Created || status == Paused { - if err := c.initProcess.signal(s); err != nil { - return fmt.Errorf("unable to signal init: %w", err) - } - if status == Paused { - // For cgroup v1, killing a process in a frozen cgroup - // does nothing until it's thawed. Only thaw the cgroup - // for SIGKILL. - if s, ok := s.(unix.Signal); ok && s == unix.SIGKILL { - _ = c.cgroupManager.Freeze(configs.Thawed) - } - } - return nil + + // To avoid a PID reuse attack, don't kill non-running container. + switch status { + case Running, Created, Paused: + default: + return ErrNotRunning } - return ErrNotRunning + + // When a container has its own PID namespace, inside it the init PID + // is 1, and thus it is handled specially by the kernel. In particular, + // killing init with SIGKILL from an ancestor namespace will also kill + // all other processes in that PID namespace (see pid_namespaces(7)). + // + // OTOH, if PID namespace is shared, we should kill all pids to avoid + // leftover processes. + if s == unix.SIGKILL && !c.config.Namespaces.IsPrivate(configs.NEWPID) { + err = signalAllProcesses(c.cgroupManager, unix.SIGKILL) + } else { + err = c.initProcess.signal(s) + } + if err != nil { + return fmt.Errorf("unable to signal init: %w", err) + } + if status == Paused && s == unix.SIGKILL { + // For cgroup v1, killing a process in a frozen cgroup + // does nothing until it's thawed. Only thaw the cgroup + // for SIGKILL. + _ = c.cgroupManager.Freeze(configs.Thawed) + } + return nil } func (c *Container) createExecFifo() error { @@ -593,7 +607,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l container: c, process: p, bootstrapData: data, - sharePidns: !c.config.Namespaces.IsPrivate(configs.NEWPID), } c.initProcess = init return init, nil @@ -696,8 +709,7 @@ func (c *Container) newInitConfig(process *Process) *initConfig { return cfg } -// Destroy destroys the container, if its in a valid state, after killing any -// remaining running processes. +// Destroy destroys the container, if its in a valid state. // // Any event registrations are removed before the container is destroyed. // No error is returned if the container is already destroyed. diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 4ac0f466bd9..e1ee5f7b4ed 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -1399,8 +1399,8 @@ func testPidnsInitKill(t *testing.T, config *configs.Config) { err = container.Run(process2) ok(t, err) - // Kill the init process and Wait for it. - err = process1.Signal(syscall.SIGKILL) + // Kill the container. + err = container.Signal(syscall.SIGKILL, false) ok(t, err) _, err = process1.Wait() if err == nil { diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index ba2990a951b..48861406dba 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -304,7 +304,6 @@ type initProcess struct { fds []string process *Process bootstrapData io.Reader - sharePidns bool } func (p *initProcess) pid() int { @@ -616,10 +615,6 @@ func (p *initProcess) start() (retErr error) { func (p *initProcess) wait() (*os.ProcessState, error) { err := p.cmd.Wait() - // we should kill all processes in cgroup when init is died if we use host PID namespace - if p.sharePidns { - _ = signalAllProcesses(p.manager, unix.SIGKILL) - } return p.cmd.ProcessState, err } diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go index 4895612e257..442d0baaf5b 100644 --- a/libcontainer/state_linux.go +++ b/libcontainer/state_linux.go @@ -7,7 +7,6 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -36,12 +35,6 @@ type containerState interface { } func destroy(c *Container) error { - if !c.config.Namespaces.Contains(configs.NEWPID) || - c.config.Namespaces.PathOf(configs.NEWPID) != "" { - if err := signalAllProcesses(c.cgroupManager, unix.SIGKILL); err != nil { - logrus.Warn(err) - } - } err := c.cgroupManager.Destroy() if c.intelRdtManager != nil { if ierr := c.intelRdtManager.Destroy(); err == nil { diff --git a/man/runc-kill.8.md b/man/runc-kill.8.md index 5f0f51ed9b9..684666396bd 100644 --- a/man/runc-kill.8.md +++ b/man/runc-kill.8.md @@ -18,8 +18,8 @@ to list available signals. # OPTIONS **--all**|**-a** : Send the signal to all processes inside the container, rather than -the container's init only. This option is useful when the container does not -have its own PID namespace. +the container's init only. This option is implied when the _signal_ is **KILL** +and the container does not have its own PID namespace. : When this option is set, no error is returned if the container is stopped or does not exist.