From ed053a740c91126f8e887ed1dac49791bd265fe8 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <asarai@suse.de>
Date: Tue, 19 Jul 2016 00:40:24 +1000
Subject: [PATCH 1/3] nsenter: specify namespace type in setns()

This prevents us from running into cases where libcontainer thinks that a
particular namespace file is of a different type, and makes such a mismatch
a fatal error rather than silently causing broken functionality.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
---
 libcontainer/configs/namespaces_unix.go |  8 +--
 libcontainer/container_linux.go         | 15 +++--
 libcontainer/nsenter/nsenter_test.go    | 44 ++++++++++++-
 libcontainer/nsenter/nsexec.c           | 83 ++++++++++++++++++++-----
 4 files changed, 123 insertions(+), 27 deletions(-)

diff --git a/libcontainer/configs/namespaces_unix.go b/libcontainer/configs/namespaces_unix.go
index b9c820d0627..8beba9d300c 100644
--- a/libcontainer/configs/namespaces_unix.go
+++ b/libcontainer/configs/namespaces_unix.go
@@ -22,8 +22,8 @@ var (
 	supportedNamespaces = make(map[NamespaceType]bool)
 )
 
-// nsToFile converts the namespace type to its filename
-func nsToFile(ns NamespaceType) string {
+// NsName converts the namespace type to its filename
+func NsName(ns NamespaceType) string {
 	switch ns {
 	case NEWNET:
 		return "net"
@@ -50,7 +50,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
 	if ok {
 		return supported
 	}
-	nsFile := nsToFile(ns)
+	nsFile := NsName(ns)
 	// if the namespace type is unknown, just return false
 	if nsFile == "" {
 		return false
@@ -84,7 +84,7 @@ func (n *Namespace) GetPath(pid int) string {
 	if n.Path != "" {
 		return n.Path
 	}
-	return fmt.Sprintf("/proc/%d/ns/%s", pid, nsToFile(n.Type))
+	return fmt.Sprintf("/proc/%d/ns/%s", pid, NsName(n.Type))
 }
 
 func (n *Namespaces) Remove(t NamespaceType) bool {
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 29c8b3437be..34cac634783 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -1223,16 +1223,21 @@ func (c *linuxContainer) currentState() (*State, error) {
 // can setns in order.
 func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
 	paths := []string{}
-	nsTypes := []configs.NamespaceType{
+	order := []configs.NamespaceType{
 		configs.NEWIPC,
 		configs.NEWUTS,
 		configs.NEWNET,
 		configs.NEWPID,
 		configs.NEWNS,
+		configs.NEWUSER,
 	}
-	// join userns if the init process explicitly requires NEWUSER
-	if c.config.Namespaces.Contains(configs.NEWUSER) {
-		nsTypes = append(nsTypes, configs.NEWUSER)
+
+	// Remove namespaces that we don't need to join.
+	var nsTypes []configs.NamespaceType
+	for _, ns := range order {
+		if c.config.Namespaces.Contains(ns) {
+			nsTypes = append(nsTypes, ns)
+		}
 	}
 	for _, nsType := range nsTypes {
 		if p, ok := namespaces[nsType]; ok && p != "" {
@@ -1249,7 +1254,7 @@ func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceTyp
 			if strings.ContainsRune(p, ',') {
 				return nil, newSystemError(fmt.Errorf("invalid path %s", p))
 			}
-			paths = append(paths, p)
+			paths = append(paths, fmt.Sprintf("%s:%s", configs.NsName(nsType), p))
 		}
 	}
 	return paths, nil
diff --git a/libcontainer/nsenter/nsenter_test.go b/libcontainer/nsenter/nsenter_test.go
index 598e80b5640..98b026f7bb4 100644
--- a/libcontainer/nsenter/nsenter_test.go
+++ b/libcontainer/nsenter/nsenter_test.go
@@ -29,7 +29,7 @@ func TestNsenterValidPaths(t *testing.T) {
 
 	namespaces := []string{
 		// join pid ns of the current process
-		fmt.Sprintf("/proc/%d/ns/pid", os.Getpid()),
+		fmt.Sprintf("pid:/proc/%d/ns/pid", os.Getpid()),
 	}
 	cmd := &exec.Cmd{
 		Path:       os.Args[0],
@@ -87,7 +87,47 @@ func TestNsenterInvalidPaths(t *testing.T) {
 
 	namespaces := []string{
 		// join pid ns of the current process
-		fmt.Sprintf("/proc/%d/ns/pid", -1),
+		fmt.Sprintf("pid:/proc/%d/ns/pid", -1),
+	}
+	cmd := &exec.Cmd{
+		Path:       os.Args[0],
+		Args:       args,
+		ExtraFiles: []*os.File{child},
+		Env:        []string{"_LIBCONTAINER_INITPIPE=3"},
+	}
+
+	if err := cmd.Start(); err != nil {
+		t.Fatal(err)
+	}
+	// write cloneFlags
+	r := nl.NewNetlinkRequest(int(libcontainer.InitMsg), 0)
+	r.AddData(&libcontainer.Int32msg{
+		Type:  libcontainer.CloneFlagsAttr,
+		Value: uint32(syscall.CLONE_NEWNET),
+	})
+	r.AddData(&libcontainer.Bytemsg{
+		Type:  libcontainer.NsPathsAttr,
+		Value: []byte(strings.Join(namespaces, ",")),
+	})
+	if _, err := io.Copy(parent, bytes.NewReader(r.Serialize())); err != nil {
+		t.Fatal(err)
+	}
+
+	if err := cmd.Wait(); err == nil {
+		t.Fatalf("nsenter exits with a zero exit status")
+	}
+}
+
+func TestNsenterIncorrectPathType(t *testing.T) {
+	args := []string{"nsenter-exec"}
+	parent, child, err := newPipe()
+	if err != nil {
+		t.Fatalf("failed to create pipe %v", err)
+	}
+
+	namespaces := []string{
+		// try to join the pid ns of the current process, mislabelled as a net ns
+		fmt.Sprintf("net:/proc/%d/ns/pid", os.Getpid()),
 	}
 	cmd := &exec.Cmd{
 		Path:       os.Args[0],
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index b93f827b55f..ce8fab380d1 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -11,6 +11,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
+#include <stdbool.h>
 #include <string.h>
 #include <unistd.h>
 
@@ -265,6 +266,44 @@ static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], struct nlcon
 	exit(0);
 }
 
+/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
+static int nsflag(char *name)
+{
+	if (false)
+		/* dummy */ ;
+#ifdef CLONE_NEWCGROUP
+	else if (!strcmp(name, "cgroup"))
+		return CLONE_NEWCGROUP;
+#endif
+#ifdef CLONE_NEWIPC
+	else if (!strcmp(name, "ipc"))
+		return CLONE_NEWIPC;
+#endif
+#ifdef CLONE_NEWNS
+	else if (!strcmp(name, "mnt"))
+		return CLONE_NEWNS;
+#endif
+#ifdef CLONE_NEWNET
+	else if (!strcmp(name, "net"))
+		return CLONE_NEWNET;
+#endif
+#ifdef CLONE_NEWPID
+	else if (!strcmp(name, "pid"))
+		return CLONE_NEWPID;
+#endif
+#ifdef CLONE_NEWUSER
+	else if (!strcmp(name, "user"))
+		return CLONE_NEWUSER;
+#endif
+#ifdef CLONE_NEWUTS
+	else if (!strcmp(name, "uts"))
+		return CLONE_NEWUTS;
+#endif
+
+	/* If we don't recognise a name, fallback to 0. */
+	return 0;
+}
+
 static void nl_parse(int fd, struct nlconfig_t *config)
 {
 	size_t len, size;
@@ -328,8 +367,13 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 				 */
 				char *saveptr = NULL;
 				char *ns = strtok_r(current, ",", &saveptr);
-				int *fds = NULL, num = 0, i;
-				char **paths = NULL;
+				int num = 0, i;
+
+				struct namespace_t {
+					int fd;
+					int ns;
+					char *path;
+				} *nses = NULL;
 
 				if (!ns || !strlen(current))
 					bail("ns paths are empty");
@@ -341,32 +385,39 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 				 */
 				do {
 					int fd;
+					char *path;
+
+					/* Resize the namespace array. */
+					nses = realloc(nses, ++num * sizeof(struct namespace_t));
 
-					/* Resize fds. */
-					num++;
-					fds = realloc(fds, num * sizeof(int));
-					paths = realloc(paths, num * sizeof(char *));
+					/* Split 'ns:path'. */
+					path = strstr(ns, ":");
+					if (!path)
+						bail("failed to parse %s", ns);
+					*path++ = '\0';
 
-					fd = open(ns, O_RDONLY);
+					fd = open(path, O_RDONLY);
 					if (fd < 0)
 						bail("failed to open %s", ns);
 
-					fds[num - 1] = fd;
-					paths[num - 1] = ns;
+					nses[num - 1] = (struct namespace_t) {
+						.fd = fd,
+						.ns = nsflag(ns),
+						.path = path,
+					};
 				} while ((ns = strtok_r(NULL, ",", &saveptr)) != NULL);
 
 				for (i = 0; i < num; i++) {
-					int fd = fds[i];
-					char *path = paths[i];
+					struct namespace_t ns = nses[i];
 
-					if (setns(fd, 0) < 0)
-						bail("failed to setns to %s", path);
+					/* Actually join the namespaces. */
+					if (setns(ns.fd, ns.ns) < 0)
+						bail("failed to setns to %s", ns.path);
 
-					close(fd);
+					close(ns.fd);
 				}
 
-				free(fds);
-				free(paths);
+				free(nses);
 				break;
 			}
 		case UIDMAP_ATTR:

From 2cd9c31b995cff313c828dafbfeea082c7318c3f Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <asarai@suse.de>
Date: Thu, 21 Jul 2016 10:02:08 +1000
Subject: [PATCH 2/3] nsenter: guarantee correct user namespace ordering

Depending on your SELinux setup, the order in which you join namespaces
can be important. In general, user namespaces should *always* be joined
and unshared first because then the other namespaces are correctly
pinned and you have the right privileges within them. This is also very
useful for rootless containers, as well as older kernels that had
essentially broken unshare(2) and clone(2) implementations.

This also includes huge refactorings in how we spawn processes for
complicated reasons that I don't want to get into because it will make
me spiral into a cloud of rage. The reasoning is in the giant comment in
clone_parent. Have fun.

In addition, because we now create multiple children with CLONE_PARENT,
we cannot wait for them to SIGCHLD us in the case of a death. Thus, we
have to resort to having a child kindly send us their exit code before
they die. Hopefully this all works okay, but at this point there's not
much more that we can do.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
---
 libcontainer/container_linux.go  |   3 +-
 libcontainer/nsenter/namespace.h |  32 ++
 libcontainer/nsenter/nsexec.c    | 707 ++++++++++++++++++++-----------
 3 files changed, 501 insertions(+), 241 deletions(-)
 create mode 100644 libcontainer/nsenter/namespace.h

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 34cac634783..4ba2735d010 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -1224,12 +1224,13 @@ func (c *linuxContainer) currentState() (*State, error) {
 func (c *linuxContainer) orderNamespacePaths(namespaces map[configs.NamespaceType]string) ([]string, error) {
 	paths := []string{}
 	order := []configs.NamespaceType{
+		// The user namespace *must* be done first.
+		configs.NEWUSER,
 		configs.NEWIPC,
 		configs.NEWUTS,
 		configs.NEWNET,
 		configs.NEWPID,
 		configs.NEWNS,
-		configs.NEWUSER,
 	}
 
 	// Remove namespaces that we don't need to join.
diff --git a/libcontainer/nsenter/namespace.h b/libcontainer/nsenter/namespace.h
new file mode 100644
index 00000000000..9e9bdca05e1
--- /dev/null
+++ b/libcontainer/nsenter/namespace.h
@@ -0,0 +1,32 @@
+#ifndef NSENTER_NAMESPACE_H
+#define NSENTER_NAMESPACE_H
+
+#ifndef _GNU_SOURCE
+#	define _GNU_SOURCE
+#endif
+#include <sched.h>
+
+/* All of these are taken from include/uapi/linux/sched.h */
+#ifndef CLONE_NEWNS
+#	define CLONE_NEWNS 0x00020000 /* New mount namespace group */
+#endif
+#ifndef CLONE_NEWCGROUP
+#	define CLONE_NEWCGROUP 0x02000000 /* New cgroup namespace */
+#endif
+#ifndef CLONE_NEWUTS
+#	define CLONE_NEWUTS 0x04000000 /* New utsname namespace */
+#endif
+#ifndef CLONE_NEWIPC
+#	define CLONE_NEWIPC 0x08000000 /* New ipc namespace */
+#endif
+#ifndef CLONE_NEWUSER
+#	define CLONE_NEWUSER 0x10000000 /* New user namespace */
+#endif
+#ifndef CLONE_NEWPID
+#	define CLONE_NEWPID 0x20000000 /* New pid namespace */
+#endif
+#ifndef CLONE_NEWNET
+#	define CLONE_NEWNET 0x40000000 /* New network namespace */
+#endif
+
+#endif /* NSENTER_NAMESPACE_H */
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index ce8fab380d1..d3a50b04ce0 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -24,27 +24,51 @@
 #include <linux/netlink.h>
 #include <linux/types.h>
 
-#define SYNC_VAL 0x42
-#define JUMP_VAL 0x43
+/* Get all of the CLONE_NEW* flags. */
+#include "namespace.h"
+
+/* Synchronisation values. */
+enum sync_t {
+	SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
+	SYNC_USERMAP_ACK = 0x41, /* Mapping finished by the parent. */
+	SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
+	SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
+
+	/* XXX: This doesn't help with segfaults and other such issues. */
+	SYNC_ERR = 0xFF, /* Fatal error, no turning back. The error code follows. */
+};
+
+/* longjmp() arguments. */
+#define JUMP_PARENT 0x00
+#define JUMP_CHILD  0xA0
+#define JUMP_INIT   0xA1
+
+/* JSON buffer. */
+#define JSON_MAX 4096
 
 /* Assume the stack grows down, so arguments should be above it. */
-struct clone_arg {
+struct clone_t {
 	/*
 	 * Reserve some space for clone() to locate arguments
 	 * and retcode in this place
 	 */
 	char stack[4096] __attribute__ ((aligned(16)));
 	char stack_ptr[0];
+
+	/* There are two children. This is used to select which code each one executes. */
 	jmp_buf *env;
+	int jmpval;
 };
 
 struct nlconfig_t {
 	char *data;
 	uint32_t cloneflags;
 	char *uidmap;
-	int uidmap_len;
+	size_t uidmap_len;
 	char *gidmap;
-	int gidmap_len;
+	size_t gidmap_len;
+	char *namespaces;
+	size_t namespaces_len;
 	uint8_t is_setgroup;
 	int consolefd;
 };
@@ -82,80 +106,24 @@ int setns(int fd, int nstype)
 }
 #endif
 
+/* XXX: This is ugly. */
+static int syncfd = -1;
+
 /* TODO(cyphar): Fix this so it correctly deals with syncT. */
-#define bail(fmt, ...)							\
-	do {								\
-		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__); \
-		exit(__COUNTER__ + 1);					\
+#define bail(fmt, ...)								\
+	do {									\
+		int ret = __COUNTER__ + 1;					\
+		fprintf(stderr, "nsenter: " fmt ": %m\n", ##__VA_ARGS__);	\
+		if (syncfd >= 0) {						\
+			enum sync_t s = SYNC_ERR;				\
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s))		\
+				fprintf(stderr, "nsenter: failed: write(s)");	\
+			if (write(syncfd, &ret, sizeof(ret)) != sizeof(ret))	\
+				fprintf(stderr, "nsenter: failed: write(ret)");	\
+		}								\
+		exit(ret);							\
 	} while(0)
 
-static int child_func(void *arg)
-{
-	struct clone_arg *ca = (struct clone_arg *)arg;
-	longjmp(*ca->env, JUMP_VAL);
-}
-
-static int clone_parent(jmp_buf *env, int flags) __attribute__ ((noinline));
-static int clone_parent(jmp_buf *env, int flags)
-{
-	int child;
-	struct clone_arg ca = {
-		.env = env,
-	};
-
-	child = clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD | flags, &ca);
-
-	/*
-	 * On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so we have
-	 * to unshare(2) before clone(2) in order to do this. This was fixed in
-	 * upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
-	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e.
-	 *
-	 * As far as we're aware, the last mainline kernel which had this bug was
-	 * Linux 3.12. However, we cannot comment on which kernels the broken patch
-	 * was backported to.
-	 */
-	if (errno == EINVAL) {
-		if (unshare(flags) < 0)
-			bail("unable to unshare namespaces");
-		child = clone(child_func, ca.stack_ptr, SIGCHLD | CLONE_PARENT, &ca);
-	}
-
-	return child;
-}
-
-/*
- * Gets the init pipe fd from the environment, which is used to read the
- * bootstrap data and tell the parent what the new pid is after we finish
- * setting up the environment.
- */
-static int initpipe(void)
-{
-	int pipenum;
-	char *initpipe, *endptr;
-
-	initpipe = getenv("_LIBCONTAINER_INITPIPE");
-	if (initpipe == NULL || *initpipe == '\0')
-		return -1;
-
-	errno = 0;
-	pipenum = strtol(initpipe, &endptr, 10);
-	if (errno != 0 || *endptr != '\0')
-		bail("unable to parse _LIBCONTAINER_INITPIPE");
-
-	return pipenum;
-}
-
-static uint32_t readint32(char *buf)
-{
-	return *(uint32_t *) buf;
-}
-
-static uint8_t readint8(char *buf)
-{
-	return *(uint8_t *) buf;
-}
-
 static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 {
 	int fd, len, ret = 0;
@@ -185,18 +153,28 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
 	return ret;
 }
 
-#define SETGROUPS_ALLOW "allow"
-#define SETGROUPS_DENY  "deny"
+enum policy_t {
+	SETGROUPS_DEFAULT = 0,
+	SETGROUPS_ALLOW,
+	SETGROUPS_DENY,
+};
 
 /* This *must* be called before we touch gid_map. */
-static void update_setgroups(int pid, bool setgroup)
+static void update_setgroups(int pid, enum policy_t setgroup)
 {
 	char *policy;
 
-	if (setgroup)
-		policy = SETGROUPS_ALLOW;
-	else
-		policy = SETGROUPS_DENY;
+	switch (setgroup) {
+		case SETGROUPS_ALLOW:
+			policy = "allow";
+			break;
+		case SETGROUPS_DENY:
+			policy = "deny";
+			break;
+		case SETGROUPS_DEFAULT:
+			/* Nothing to do. */
+			return;
+	}
 
 	if (write_file(policy, strlen(policy), "/proc/%d/setgroups", pid) < 0) {
 		/*
@@ -226,84 +204,78 @@ static void update_gidmap(int pid, char *map, int map_len)
 		bail("failed to update /proc/%d/gid_map", pid);
 }
 
-#define JSON_MAX 4096
+/* A dummy function that just jumps to the given jumpval. */
+static int child_func(void *arg) __attribute__ ((noinline));
+static int child_func(void *arg)
+{
+	struct clone_t *ca = (struct clone_t *)arg;
+	longjmp(*ca->env, ca->jmpval);
+}
 
-static void start_child(int pipenum, jmp_buf *env, int syncpipe[2], struct nlconfig_t *config)
+static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
+static int clone_parent(jmp_buf *env, int jmpval)
 {
-	int len, childpid;
-	char buf[JSON_MAX];
-	uint8_t syncval;
+	struct clone_t ca = {
+		.env    = env,
+		.jmpval = jmpval,
+	};
 
-	/*
-	 * We must fork to actually enter the PID namespace, and use
-	 * CLONE_PARENT so that the child init can have the right parent
-	 * (the bootstrap process). Also so we don't need to forward the
-	 * child's exit code or resend its death signal.
-	 */
-	childpid = clone_parent(env, config->cloneflags);
-	if (childpid < 0)
-		bail("unable to fork");
-
-	/* Update setgroups, uid_map and gid_map for the process if provided. */
-	if (config->is_setgroup)
-		update_setgroups(childpid, true);
-	update_uidmap(childpid, config->uidmap, config->uidmap_len);
-	update_gidmap(childpid, config->gidmap, config->gidmap_len);
-
-	/* Send the sync signal to the child. */
-	close(syncpipe[0]);
-	syncval = SYNC_VAL;
-	if (write(syncpipe[1], &syncval, sizeof(syncval)) != sizeof(syncval))
-		bail("failed to write sync byte to child");
-
-	/* Send the child pid back to our parent */
-	len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", childpid);
-	if (len < 0 || write(pipenum, buf, len) != len) {
-		kill(childpid, SIGKILL);
-		bail("unable to send a child pid");
-	}
+	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
+}
+
+/*
+ * Gets the init pipe fd from the environment, which is used to read the
+ * bootstrap data and tell the parent what the new pid is after we finish
+ * setting up the environment.
+ */
+static int initpipe(void)
+{
+	int pipenum;
+	char *initpipe, *endptr;
+
+	initpipe = getenv("_LIBCONTAINER_INITPIPE");
+	if (initpipe == NULL || *initpipe == '\0')
+		return -1;
+
+	pipenum = strtol(initpipe, &endptr, 10);
+	if (*endptr != '\0')
+		bail("unable to parse _LIBCONTAINER_INITPIPE");
 
-	exit(0);
+	return pipenum;
 }
 
 /* Returns the clone(2) flag for a namespace, given the name of a namespace. */
 static int nsflag(char *name)
 {
-	if (false)
-		/* dummy */ ;
-#ifdef CLONE_NEWCGROUP
-	else if (!strcmp(name, "cgroup"))
+	if (!strcmp(name, "cgroup"))
 		return CLONE_NEWCGROUP;
-#endif
-#ifdef CLONE_NEWIPC
 	else if (!strcmp(name, "ipc"))
 		return CLONE_NEWIPC;
-#endif
-#ifdef CLONE_NEWNS
 	else if (!strcmp(name, "mnt"))
 		return CLONE_NEWNS;
-#endif
-#ifdef CLONE_NEWNET
 	else if (!strcmp(name, "net"))
 		return CLONE_NEWNET;
-#endif
-#ifdef CLONE_NEWPID
 	else if (!strcmp(name, "pid"))
 		return CLONE_NEWPID;
-#endif
-#ifdef CLONE_NEWUSER
 	else if (!strcmp(name, "user"))
 		return CLONE_NEWUSER;
-#endif
-#ifdef CLONE_NEWUTS
 	else if (!strcmp(name, "uts"))
 		return CLONE_NEWUTS;
-#endif
 
 	/* If we don't recognise a name, fallback to 0. */
 	return 0;
 }
 
+static uint32_t readint32(char *buf)
+{
+	return *(uint32_t *) buf;
+}
+
+static uint8_t readint8(char *buf)
+{
+	return *(uint8_t *) buf;
+}
+
 static void nl_parse(int fd, struct nlconfig_t *config)
 {
 	size_t len, size;
@@ -348,78 +320,17 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 			break;
 		case CONSOLE_PATH_ATTR:
 			/*
-			 * The context in which this is done (before or after we
-			 * join the other namespaces) will affect how the path
-			 * resolution of the console works. This order is not
-			 * decided here, but rather in container_linux.go. We just
-			 * follow the order given by the netlink message.
+			 * We open the console here because we currently evaluate console
+			 * paths from the *host* namespaces.
 			 */
 			config->consolefd = open(current, O_RDWR);
 			if (config->consolefd < 0)
 				bail("failed to open console %s", current);
 			break;
-		case NS_PATHS_ATTR:{
-				/*
-				 * Open each namespace path and setns it in the
-				 * order provided to us. We currently don't have
-				 * any context for what kind of namespace we're
-				 * joining, so just blindly do it.
-				 */
-				char *saveptr = NULL;
-				char *ns = strtok_r(current, ",", &saveptr);
-				int num = 0, i;
-
-				struct namespace_t {
-					int fd;
-					int ns;
-					char *path;
-				} *nses = NULL;
-
-				if (!ns || !strlen(current))
-					bail("ns paths are empty");
-
-				/*
-				 * We have to open the file descriptors first, since after
-				 * we join the mnt namespace we might no longer be able to
-				 * access the paths.
-				 */
-				do {
-					int fd;
-					char *path;
-
-					/* Resize the namespace array. */
-					nses = realloc(nses, ++num * sizeof(struct namespace_t));
-
-					/* Split 'ns:path'. */
-					path = strstr(ns, ":");
-					if (!path)
-						bail("failed to parse %s", ns);
-					*path++ = '\0';
-
-					fd = open(path, O_RDONLY);
-					if (fd < 0)
-						bail("failed to open %s", ns);
-
-					nses[num - 1] = (struct namespace_t) {
-						.fd = fd,
-						.ns = nsflag(ns),
-						.path = path,
-					};
-				} while ((ns = strtok_r(NULL, ",", &saveptr)) != NULL);
-
-				for (i = 0; i < num; i++) {
-					struct namespace_t ns = nses[i];
-
-					/* Actually join the namespaces. */
-					if (setns(ns.fd, ns.ns) < 0)
-						bail("failed to setns to %s", ns.path);
-
-					close(ns.fd);
-				}
-
-				free(nses);
-				break;
-			}
+		case NS_PATHS_ATTR:
+			config->namespaces = current;
+			config->namespaces_len = payload_len;
+			break;
 		case UIDMAP_ATTR:
 			config->uidmap = current;
 			config->uidmap_len = payload_len;
@@ -444,6 +355,71 @@ void nl_free(struct nlconfig_t *config)
 	free(config->data);
 }
 
+void join_namespaces(char *nslist)
+{
+	int num = 0, i;
+	char *saveptr = NULL;
+	char *namespace = strtok_r(nslist, ",", &saveptr);
+	struct namespace_t {
+		int fd;
+		int ns;
+		char type[PATH_MAX];
+		char path[PATH_MAX];
+	} *namespaces = NULL;
+
+	if (!namespace || !strlen(namespace) || !strlen(nslist))
+		bail("ns paths are empty");
+
+	/*
+	 * We have to open the file descriptors first, since after
+	 * we join the mnt namespace we might no longer be able to
+	 * access the paths.
+	 */
+	do {
+		int fd;
+		char *path;
+		struct namespace_t *ns;
+
+		/* Resize the namespace array. */
+		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
+		if (!namespaces)
+			bail("failed to reallocate namespace array");
+		ns = &namespaces[num - 1];
+
+		/* Split 'ns:path'. */
+		path = strstr(namespace, ":");
+		if (!path)
+			bail("failed to parse %s", namespace);
+		*path++ = '\0';
+
+		fd = open(path, O_RDONLY);
+		if (fd < 0)
+			bail("failed to open %s", namespace);
+
+		ns->fd = fd;
+		ns->ns = nsflag(namespace);
+		strncpy(ns->path, path, PATH_MAX);
+	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
+
+	/*
+	 * The ordering in which we join namespaces is important. We should
+	 * always join the user namespace *first*. This is all guaranteed
+	 * from the container_linux.go side of this, so we're just going to
+	 * follow the order given to us.
+	 */
+
+	for (i = 0; i < num; i++) {
+		struct namespace_t ns = namespaces[i];
+
+		if (setns(ns.fd, ns.ns) < 0)
+			bail("failed to setns to %s", ns.path);
+
+		close(ns.fd);
+	}
+
+	free(namespaces);
+}
+
 void nsexec(void)
 {
 	int pipenum;
@@ -464,60 +440,311 @@ void nsexec(void)
 
 	/* clone(2) flags are mandatory. */
 	if (config.cloneflags == -1)
-		bail("missing clone_flags");
+		bail("missing cloneflags");
 
 	/* Pipe so we can tell the child when we've finished setting up. */
-	if (pipe(syncpipe) < 0)
+	if (socketpair(AF_LOCAL, SOCK_STREAM, 0, syncpipe) < 0)
 		bail("failed to setup sync pipe between parent and child");
 
-	/* Set up the jump point. */
-	if (setjmp(env) == JUMP_VAL) {
-		/*
-		 * We're inside the child now, having jumped from the
-		 * start_child() code after forking in the parent.
-		 */
-		uint8_t s = 0;
-		int consolefd = config.consolefd;
+	/* TODO: Currently we aren't dealing with child deaths properly. */
+
+	/*
+	 * Okay, so this is quite annoying.
+	 *
+	 * In order to make sure that we deal with older kernels (when CLONE_NEWUSER
+	 * wasn't guaranteed to be done first if you specify multiple namespaces in
+	 * a clone(2) invocation) as well as with certain usecases like rootless
+	 * containers, we cannot just dump all of the cloneflags into clone(2).
+	 * However, if we unshare(2) the user namespace *before* we clone(2), then
+	 * all hell breaks loose.
+	 *
+	 * The parent no longer has permissions to do many things (unshare(2) drops
+	 * all capabilities in your old namespace), and the container cannot be set
+	 * up to have more than one {uid,gid} mapping. This is obviously less than
+	 * ideal. In order to fix this, we have to first clone(2) and then unshare.
+	 *
+	 * Unfortunately, it's not as simple as that. We have to fork to enter the
+	 * PID namespace (the PID namespace only applies to children). Since we'll
+	 * have to double-fork, this clone_parent() call won't be able to get the
+	 * PID of the _actual_ init process (without doing more synchronisation than
+	 * I can deal with at the moment). So we'll just get the parent to send it
+	 * for us, the only job of this process is to update
+	 * /proc/pid/{setgroups,uid_map,gid_map}.
+	 *
+	 * And as a result of the above, we also need to setns(2) in the first child
+	 * because if we join a PID namespace in the topmost parent then our child
+	 * will be in that namespace (and it will not be able to give us a PID value
+	 * that makes sense without resorting to sending things with cmsg).
+	 *
+	 * This also deals with an older issue caused by dumping cloneflags into
+	 * clone(2): On old kernels, CLONE_PARENT didn't work with CLONE_NEWPID, so
+	 * we have to unshare(2) before clone(2) in order to do this. This was fixed
+	 * in upstream commit 1f7f4dde5c945f41a7abc2285be43d918029ecc5, and was
+	 * introduced by 40a0d32d1eaffe6aac7324ca92604b6b3977eb0e. As far as we're
+	 * aware, the last mainline kernel which had this bug was Linux 3.12.
+	 * However, we cannot comment on which kernels the broken patch was
+	 * backported to.
+	 *
+	 * -- Aleksa "what has my life come to?" Sarai
+	 */
+
+	switch (setjmp(env)) {
+	/*
+	 * Stage 0: We're in the parent. Our job is just to create a new child
+	 *          (stage 1: JUMP_CHILD) process and write its uid_map and
+	 *          gid_map. That process will go on to create a new process, then
+	 *          it will send us its PID which we will send to the bootstrap
+	 *          process.
+	 */
+	case JUMP_PARENT: {
+			int len;
+			pid_t child;
+			char buf[JSON_MAX];
+
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[0:PARENT]", 0, 0, 0);
+
+			/* Start the process of getting a container. */
+			child = clone_parent(&env, JUMP_CHILD);
+			if (child < 0)
+				bail("unable to fork: child_func");
+
+			/* State machine for synchronisation with the children. */
+			while (true) {
+				enum sync_t s;
+
+				/* This doesn't need to be global, we're in the parent. */
+				int syncfd = syncpipe[1];
+
+				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with child: next state");
+
+				switch (s) {
+				case SYNC_ERR: {
+						/* We have to mirror the error code of the child. */
+						int ret;
+
+						if (read(syncfd, &ret, sizeof(ret)) != sizeof(ret))
+							bail("failed to sync with child: read(error code)");
+
+						exit(ret);
+					}
+					break;
+				case SYNC_USERMAP_PLS:
+					/* Enable setgroups(2) if we've been asked to. */
+					if (config.is_setgroup)
+						update_setgroups(child, SETGROUPS_ALLOW);
+
+					/* Set up mappings. */
+					update_uidmap(child, config.uidmap, config.uidmap_len);
+					update_gidmap(child, config.gidmap, config.gidmap_len);
+
+					s = SYNC_USERMAP_ACK;
+					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+						kill(child, SIGKILL);
+						bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
+					}
+					break;
+				case SYNC_USERMAP_ACK:
+					/* We should _never_ receive acks. */
+					kill(child, SIGKILL);
+					bail("failed to sync with child: unexpected SYNC_USERMAP_ACK");
+					break;
+				case SYNC_RECVPID_PLS: {
+						pid_t old = child;
+
+						/* Get the init_func pid. */
+						if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
+							kill(old, SIGKILL);
+							bail("failed to sync with child: read(childpid)");
+						}
+
+						/* Send ACK. */
+						s = SYNC_RECVPID_ACK;
+						if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+							kill(old, SIGKILL);
+							kill(child, SIGKILL);
+							bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
+						}
+					}
+
+					/* Leave the loop. */
+					goto out;
+				case SYNC_RECVPID_ACK:
+					/* We should _never_ receive acks. */
+					kill(child, SIGKILL);
+					bail("failed to sync with child: unexpected SYNC_RECVPID_ACK");
+					break;
+				}
+			}
+
+		out:
+			/* Send the init_func pid back to our parent. */
+			len = snprintf(buf, JSON_MAX, "{\"pid\": %d}\n", child);
+			if (len < 0) {
+				kill(child, SIGKILL);
+				bail("unable to generate JSON for child pid");
+			}
+			if (write(pipenum, buf, len) != len) {
+				kill(child, SIGKILL);
+				bail("unable to send child pid to bootstrapper");
+			}
+
+			exit(0);
+		}
+
+	/*
+	 * Stage 1: We're in the first child process. Our job is to join any
+	 *          provided user namespaces in the netlink payload. If we've been
+	 *          asked to CLONE_NEWUSER, we will unshare the user namespace and
+	 *          ask our parent (stage 0) to set up our user mappings for us.
+	 *          Then, we unshare the rest of the requested namespaces and
+	 *          create a new child (stage 2: JUMP_INIT).  We then send the
+	 *          child's PID to our parent (stage 0).
+	 */
+	case JUMP_CHILD: {
+			pid_t child;
+			enum sync_t s;
 
-		/* Close the writing side of pipe. */
-		close(syncpipe[1]);
+			/* We're in a child and thus need to tell the parent if we die. */
+			syncfd = syncpipe[0];
 
-		/* Sync with parent. */
-		if (read(syncpipe[0], &s, sizeof(s)) != sizeof(s) || s != SYNC_VAL)
-			bail("failed to read sync byte from parent");
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[1:CHILD]", 0, 0, 0);
 
-		if (setsid() < 0)
-			bail("setsid failed");
+			/*
+			 * We need to setns first. We cannot do this earlier (in stage 0)
+			 * because of the fact that we forked to get here (the PID of
+			 * [stage 2: JUMP_INIT] would be meaningless). We could send it
+			 * using cmsg(3) but that's just annoying.
+			 */
+			if (config.namespaces)
+				join_namespaces(config.namespaces);
+
+			/*
+			 * Deal with user namespaces first. They are quite special, as they
+			 * affect our ability to unshare other namespaces and are used as
+			 * context for privilege checks.
+			 */
+			if (config.cloneflags & CLONE_NEWUSER) {
+				/* Create a new user namespace. */
+				if (unshare(CLONE_NEWUSER) < 0)
+					bail("failed to unshare user namespace");
+
+				/*
+				 * We don't have the privileges to do any mapping here (see the
+				 * clone_parent rant). So signal our parent to hook us up.
+				 */
+
+				s = SYNC_USERMAP_PLS;
+				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
+
+				/* ... wait for mapping ... */
+
+				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
+					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
+				if (s != SYNC_USERMAP_ACK)
+					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
+
+				config.cloneflags &= ~CLONE_NEWUSER;
+			}
 
-		if (setuid(0) < 0)
-			bail("setuid failed");
+			/*
+			 * Now we can unshare the rest of the namespaces. We can't be sure if the
+			 * current kernel supports clone(CLONE_PARENT | CLONE_NEWPID), so we'll
+			 * just do it the long way anyway.
+			 */
+			if (unshare(config.cloneflags) < 0)
+				bail("failed to unshare namespaces");
+
+			/* TODO: What about non-namespace clone flags that we're dropping here? */
+			child = clone_parent(&env, JUMP_INIT);
+			if (child < 0)
+				bail("unable to fork: init_func");
+
+			/* Send the child to our parent, which knows what it's doing. */
+			s = SYNC_RECVPID_PLS;
+			if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
+			}
+			if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: write(childpid)");
+			}
 
-		if (setgid(0) < 0)
-			bail("setgid failed");
+			/* ... wait for parent to get the pid ... */
 
-		if (setgroups(0, NULL) < 0)
-			bail("setgroups failed");
+			if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
+			}
+			if (s != SYNC_RECVPID_ACK) {
+				kill(child, SIGKILL);
+				bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
+			}
 
-		if (consolefd != -1) {
-			if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
-				bail("ioctl TIOCSCTTY failed");
-			if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
-				bail("failed to dup stdin");
-			if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
-				bail("failed to dup stdout");
-			if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
-				bail("failed to dup stderr");
+			/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
+			exit(0);
 		}
 
-		/* Free netlink data. */
-		nl_free(&config);
+	/*
+	 * Stage 2: We're the final child process, and the only process that will
+	 *          actually return to the Go runtime. Our job is to just do the
+	 *          final cleanup steps and then return to the Go runtime to allow
+	 *          init_linux.go to run.
+	 */
+	case JUMP_INIT: {
+			/*
+			 * We're inside the final child now, having jumped here from the
+			 * clone_parent(&env, JUMP_INIT) call made by stage 1 (JUMP_CHILD).
+			 */
+			int consolefd = config.consolefd;
 
-		/* Finish executing, let the Go runtime take over. */
-		return;
-	}
+			/* We're in a child and thus need to tell the parent if we die. */
+			syncfd = syncpipe[0];
 
-	/* Run the parent code. */
-	start_child(pipenum, &env, syncpipe, &config);
+			/* For debugging. */
+			prctl(PR_SET_NAME, (unsigned long) "runc:[1:INIT]", 0, 0, 0);
+
+			if (setsid() < 0)
+				bail("setsid failed");
+
+			if (setuid(0) < 0)
+				bail("setuid failed");
+
+			if (setgid(0) < 0)
+				bail("setgid failed");
+
+			if (setgroups(0, NULL) < 0)
+				bail("setgroups failed");
+
+			if (consolefd != -1) {
+				if (ioctl(consolefd, TIOCSCTTY, 0) < 0)
+					bail("ioctl TIOCSCTTY failed");
+				if (dup3(consolefd, STDIN_FILENO, 0) != STDIN_FILENO)
+					bail("failed to dup stdin");
+				if (dup3(consolefd, STDOUT_FILENO, 0) != STDOUT_FILENO)
+					bail("failed to dup stdout");
+				if (dup3(consolefd, STDERR_FILENO, 0) != STDERR_FILENO)
+					bail("failed to dup stderr");
+			}
+
+			/* Close sync pipes. */
+			close(syncpipe[0]);
+			close(syncpipe[1]);
+
+			/* Free netlink data. */
+			nl_free(&config);
+
+			/* Finish executing, let the Go runtime take over. */
+			return;
+		}
+	default:
+		bail("unexpected jump value");
+		break;
+	}
 
 	/* Should never be reached. */
 	bail("should never be reached");

From e3cd191acc5eb7c76b9e8884b8217d03b22e168b Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <asarai@suse.de>
Date: Tue, 18 Oct 2016 18:26:27 +1100
Subject: [PATCH 3/3] nsenter: un-split clone(cloneflags) for RHEL

Without this patch applied, RHEL's SELinux policies cause container
creation to not really work. Unfortunately this might be an issue for
rootless containers (opencontainers/runc#774) but we'll cross that
bridge when we come to it.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
---
 libcontainer/nsenter/nsexec.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index d3a50b04ce0..93265c266d5 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -621,16 +621,25 @@ void nsexec(void)
 			if (config.namespaces)
 				join_namespaces(config.namespaces);
 
+			/*
+			 * Unshare all of the namespaces. Now, it should be noted that this
+			 * ordering might break in the future (especially with rootless
+			 * containers). But for now, it's not possible to split this into
+			 * CLONE_NEWUSER + [the rest] because of some RHEL SELinux issues.
+			 *
+			 * We also can't be sure if the current kernel supports
+			 * clone(CLONE_PARENT | CLONE_NEWPID), so we'll just do it the long
+			 * way anyway.
+			 */
+			if (unshare(config.cloneflags) < 0)
+				bail("failed to unshare namespaces");
+
 			/*
 			 * Deal with user namespaces first. They are quite special, as they
 			 * affect our ability to unshare other namespaces and are used as
 			 * context for privilege checks.
 			 */
 			if (config.cloneflags & CLONE_NEWUSER) {
-				/* Create a new user namespace. */
-				if (unshare(CLONE_NEWUSER) < 0)
-					bail("failed to unshare user namespace");
-
 				/*
 				 * We don't have the privileges to do any mapping here (see the
 				 * clone_parent rant). So signal our parent to hook us up.
@@ -646,18 +655,8 @@ void nsexec(void)
 					bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
 				if (s != SYNC_USERMAP_ACK)
 					bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
-
-				config.cloneflags &= ~CLONE_NEWUSER;
 			}
 
-			/*
-			 * Now we can unshare the rest of the namespaces. We can't be sure if the
-			 * current kernel supports clone(CLONE_PARENT | CLONE_NEWPID), so we'll
-			 * just do it the long way anyway.
-			 */
-			if (unshare(config.cloneflags) < 0)
-				bail("failed to unshare namespaces");
-
 			/* TODO: What about non-namespace clone flags that we're dropping here? */
 			child = clone_parent(&env, JUMP_INIT);
 			if (child < 0)