Skip to content

Commit

Permalink
use fork to replace clone(CLONE_PARENT)
Browse files Browse the repository at this point in the history
As the description in opencontainers#4233, there is a bug in glibc, pthread_self()
will return wrong info after we do `clone(CLONE_PARENT)` in libct/nsenter,
it will cause runc can't work in `go 1.22.*`. So we use fork(2) to replace
clone(2) in libct/nsenter, but there is a double-fork in nsenter, so we
need to use `PR_SET_CHILD_SUBREAPER` to let runc can reap grand child
process in libct/nsenter.

Signed-off-by: lifubang <lifubang@acmcoder.com>
  • Loading branch information
lifubang committed May 17, 2024
1 parent 6916425 commit a2d4311
Show file tree
Hide file tree
Showing 10 changed files with 60 additions and 109 deletions.
2 changes: 0 additions & 2 deletions contrib/completions/bash/runc
Original file line number Diff line number Diff line change
Expand Up @@ -458,7 +458,6 @@ _runc_run() {
--help
--detatch
-d
--no-subreaper
--no-pivot
--no-new-keyring
"
Expand Down Expand Up @@ -623,7 +622,6 @@ _runc_restore() {
--file-locks
--detach
-d
--no-subreaper
--no-pivot
--auto-dedup
--lazy-pages
Expand Down
21 changes: 10 additions & 11 deletions exec.go
Original file line number Diff line number Diff line change
Expand Up @@ -181,17 +181,16 @@ func execProcess(context *cli.Context) (int, error) {
}

r := &runner{
enableSubreaper: false,
shouldDestroy: false,
container: container,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
action: CT_ACT_RUN,
init: false,
preserveFDs: context.Int("preserve-fds"),
subCgroupPaths: cgPaths,
shouldDestroy: false,
container: container,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
action: CT_ACT_RUN,
init: false,
preserveFDs: context.Int("preserve-fds"),
subCgroupPaths: cgPaths,
}
return r.run(p)
}
Expand Down
6 changes: 6 additions & 0 deletions libcontainer/container_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -341,6 +341,12 @@ func (c *Container) start(process *Process) (retErr error) {
if err := utils.CloseExecFrom(3); err != nil {
return fmt.Errorf("unable to mark non-stdio fds as cloexec: %w", err)
}

// Set us as the subreaper to let the grandchild process reparent to us.
if err := system.SetSubreaper(1); err != nil {
return fmt.Errorf("unable to set subreaper: %w", err)
}

if err := parent.start(); err != nil {
return fmt.Errorf("unable to start container process: %w", err)
}
Expand Down
66 changes: 14 additions & 52 deletions libcontainer/nsenter/nsexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,6 @@ enum sync_t {
/* Stores the current stage of nsexec. */
int current_stage = STAGE_SETUP;

/* Assume the stack grows down, so arguments should be above it. */
struct clone_t {
/*
* Reserve some space for clone() to locate arguments
* and retcode in this place
*/
char stack[4096] __attribute__((aligned(16)));
char stack_ptr[0];

/* There's two children. This is used to execute the different code. */
jmp_buf *env;
int jmpval;
};

struct nlconfig_t {
char *data;

Expand Down Expand Up @@ -303,23 +289,15 @@ static void update_oom_score_adj(char *data, size_t len)
bail("failed to update /proc/self/oom_score_adj");
}

/* A dummy function that just jumps to the given jumpval. */
static int child_func(void *arg) __attribute__((noinline));
static int child_func(void *arg)
static int fork_and_run(jmp_buf *env, int jmpval)
{
struct clone_t *ca = (struct clone_t *)arg;
longjmp(*ca->env, ca->jmpval);
}

static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
struct clone_t ca = {
.env = env,
.jmpval = jmpval,
};

return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
pid_t pid = fork();
if (pid < 0)
bail("failed to fork");
if (pid > 0)
return pid;
/* Jump back to the big switch in nsexec... */
longjmp(*env, jmpval);
}

/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
Expand Down Expand Up @@ -644,7 +622,7 @@ void nsexec(void)
*
* Unfortunately, it's not as simple as that. We have to fork to enter the
* PID namespace (the PID namespace only applies to children). Since we'll
* have to double-fork, this clone_parent() call won't be able to get the
* have to double-fork, this fork() call won't be able to get the
* PID of the _actual_ init process (without doing more synchronisation than
* I can deal with at the moment). So we'll just get the parent to send it
* for us, the only job of this process is to update
Expand All @@ -655,15 +633,6 @@ void nsexec(void)
* will be in that namespace (and it will not be able to give us a PID value
* that makes sense without resorting to sending things with cmsg).
*
* This also deals with an older issue caused by dumping cloneflags into
* clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
* we have to unshare(2) before clone(2) in order to do this. This was fixed
* in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
* introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
* aware, the last mainline kernel which had this bug was Linux 3.12.
* However, we cannot comment on which kernels the broken patch was
* backported to.
*
* -- Aleksa "what has my life come to?" Sarai
*/

Expand All @@ -687,7 +656,7 @@ void nsexec(void)

/* Start the process of getting a container. */
write_log(DEBUG, "spawn stage-1");
stage1_pid = clone_parent(&env, STAGE_CHILD);
stage1_pid = fork_and_run(&env, STAGE_CHILD);
if (stage1_pid < 0)
bail("unable to spawn stage-1");

Expand Down Expand Up @@ -755,9 +724,7 @@ void nsexec(void)
/*
* Send both the stage-1 and stage-2 pids back to runc.
* runc needs the stage-2 to continue process management,
* but because stage-1 was spawned with CLONE_PARENT we
* cannot reap it within stage-0 and thus we need to ask
* runc to reap the zombie for us.
* and ask runc to reap the zombie for us.
*/
write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
stage1_pid, stage2_pid);
Expand Down Expand Up @@ -892,9 +859,8 @@ void nsexec(void)
}

/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal stage-0 to do the mapping for
* us.
* We don't have the privileges to do any mapping here.
* So signal stage-0 to do the mapping for us.
*/
write_log(DEBUG, "request stage-0 to map user namespace");
s = SYNC_USERMAP_PLS;
Expand Down Expand Up @@ -925,10 +891,6 @@ void nsexec(void)
* ordering might break in the future (especially with rootless
* containers). But for now, it's not possible to split this into
* CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
*
* Note that we don't merge this with clone() because there were
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
try_unshare(config.cloneflags, "remaining namespaces");

Expand All @@ -955,7 +917,7 @@ void nsexec(void)
* to actually enter the new PID namespace.
*/
write_log(DEBUG, "spawn stage-2");
stage2_pid = clone_parent(&env, STAGE_INIT);
stage2_pid = fork_and_run(&env, STAGE_INIT);
if (stage2_pid < 0)
bail("unable to spawn stage-2");

Expand Down
3 changes: 0 additions & 3 deletions man/runc-restore.8.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,6 @@ image files directory.
**--pid-file** _path_
: Specify the file to write the initial container process' PID to.

**--no-subreaper**
: Disable the use of the subreaper used to reap reparented processes.

**--no-pivot**
: Do not use pivot root to jail process inside rootfs. This should not be used
except in exceptional circumstances, and may be unsafe from the security
Expand Down
3 changes: 0 additions & 3 deletions man/runc-run.8.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,6 @@ referencing the master end of the console's pseudoterminal. See
**--pid-file** _path_
: Specify the file to write the initial container process' PID to.

**--no-subreaper**
: Disable the use of the subreaper used to reap reparented processes.

**--no-pivot**
: Do not use pivot root to jail process inside rootfs. This should not be used
except in exceptional circumstances, and may be unsafe from the security
Expand Down
2 changes: 1 addition & 1 deletion restore.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ using the runc checkpoint command.`,
},
cli.BoolFlag{
Name: "no-subreaper",
Usage: "disable the use of the subreaper used to reap reparented processes",
Usage: "(ignored) disable the use of the subreaper used to reap reparented processes. This flag has been ignored by runc, and will be removed in 1.3.0",
},
cli.BoolFlag{
Name: "no-pivot",
Expand Down
2 changes: 1 addition & 1 deletion run.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ command(s) that get executed on start, edit the args parameter of the spec. See
},
cli.BoolFlag{
Name: "no-subreaper",
Usage: "disable the use of the subreaper used to reap reparented processes",
Usage: "(ignored) disable the use of the subreaper used to reap reparented processes. This flag has been ignored by runc, and will be removed in 1.3.0",
},
cli.BoolFlag{
Name: "no-pivot",
Expand Down
9 changes: 1 addition & 8 deletions signals.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import (
"os/signal"

"github.com/opencontainers/runc/libcontainer"
"github.com/opencontainers/runc/libcontainer/system"
"github.com/opencontainers/runc/libcontainer/utils"

"github.com/sirupsen/logrus"
Expand All @@ -18,13 +17,7 @@ const signalBufferSize = 2048
// while still forwarding all other signals to the process.
// If notifySocket is present, use it to read systemd notifications from the container and
// forward them to notifySocketHost.
func newSignalHandler(enableSubreaper bool, notifySocket *notifySocket) *signalHandler {
if enableSubreaper {
// set us as the subreaper before registering the signal handler for the container
if err := system.SetSubreaper(1); err != nil {
logrus.Warn(err)
}
}
func newSignalHandler(notifySocket *notifySocket) *signalHandler {
// ensure that we have a large buffer size so that we do not miss any signals
// in case we are not processing them fast enough.
s := make(chan os.Signal, signalBufferSize)
Expand Down
55 changes: 27 additions & 28 deletions utils_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -190,20 +190,19 @@ func createContainer(context *cli.Context, id string, spec *specs.Spec) (*libcon
}

type runner struct {
init bool
enableSubreaper bool
shouldDestroy bool
detach bool
listenFDs []*os.File
preserveFDs int
pidFile string
consoleSocket string
pidfdSocket string
container *libcontainer.Container
action CtAct
notifySocket *notifySocket
criuOpts *libcontainer.CriuOpts
subCgroupPaths map[string]string
init bool
shouldDestroy bool
detach bool
listenFDs []*os.File
preserveFDs int
pidFile string
consoleSocket string
pidfdSocket string
container *libcontainer.Container
action CtAct
notifySocket *notifySocket
criuOpts *libcontainer.CriuOpts
subCgroupPaths map[string]string
}

func (r *runner) run(config *specs.Process) (int, error) {
Expand Down Expand Up @@ -247,10 +246,11 @@ func (r *runner) run(config *specs.Process) (int, error) {
return -1, err
}
detach := r.detach || (r.action == CT_ACT_CREATE)

// Setting up IO is a two stage process. We need to modify process to deal
// with detaching containers, and then we get a tty after the container has
// started.
handler := newSignalHandler(r.enableSubreaper, r.notifySocket)
handler := newSignalHandler(r.notifySocket)
tty, err := setupIO(process, rootuid, rootgid, config.Terminal, detach, r.consoleSocket)
if err != nil {
return -1, err
Expand Down Expand Up @@ -396,19 +396,18 @@ func startContainer(context *cli.Context, action CtAct, criuOpts *libcontainer.C
}

r := &runner{
enableSubreaper: !context.Bool("no-subreaper"),
shouldDestroy: !context.Bool("keep"),
container: container,
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
action: action,
criuOpts: criuOpts,
init: true,
shouldDestroy: !context.Bool("keep"),
container: container,
listenFDs: listenFDs,
notifySocket: notifySocket,
consoleSocket: context.String("console-socket"),
pidfdSocket: context.String("pidfd-socket"),
detach: context.Bool("detach"),
pidFile: context.String("pid-file"),
preserveFDs: context.Int("preserve-fds"),
action: action,
criuOpts: criuOpts,
init: true,
}
return r.run(spec.Process)
}
Expand Down

0 comments on commit a2d4311

Please sign in to comment.