From 08c266d89a08c7a5f28e127b2d5b41ecc10ec78c Mon Sep 17 00:00:00 2001
From: Jiang Liu <gerry@linux.alibaba.com>
Date: Thu, 12 May 2022 20:45:47 +0800
Subject: [PATCH 1/5] extend bootstrap message to pass mount fds

Extend bootstrap message to pass mount fds for open_tree()/move_mount().

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
 libcontainer/container_linux.go | 32 ++++++++++++++++++++++++++------
 libcontainer/message_linux.go   |  3 +++
 libcontainer/nsenter/nsexec.c   | 17 +++++++++++++++++
 3 files changed, 46 insertions(+), 6 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 34f38f34e0c..1530cd69a76 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -537,10 +537,7 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
 		}
 	}
 	_, sharePidns := nsMaps[configs.NEWPID]
-	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
-	if err != nil {
-		return nil, err
-	}
+	userNsMountFds := [3]int{-1, -1, -1}
 
 	if c.shouldSendMountSources() {
 		// Elements on this slice will be paired with mounts (see StartInitialization() and
@@ -571,6 +568,11 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
 		)
 	}
 
+	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard, &userNsMountFds)
+	if err != nil {
+		return nil, err
+	}
+
 	init := &initProcess{
 		cmd:             cmd,
 		messageSockPair: messageSockPair,
@@ -595,7 +597,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair,
 	}
 	// for setns process, we don't have to set cloneflags as the process namespaces
 	// will only be set via setns syscall
-	data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
+	data, err := c.bootstrapData(0, state.NamespacePaths, initSetns, nil)
 	if err != nil {
 		return nil, err
 	}
@@ -2119,7 +2121,8 @@ type netlinkError struct{ error }
 // such as one that uses nsenter package to bootstrap the container's
 // init process correctly, i.e. with correct namespaces, uid/gid
 // mapping etc.
-func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
+func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string,
+	it initType, userNsMountFd *[3]int) (_ io.Reader, Err error) {
 	// create the netlink message
 	r := nl.NewNetlinkRequest(int(InitMsg), 0)
 
@@ -2238,6 +2241,23 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
 			Type:  MountSourcesAttr,
 			Value: mounts,
 		})
+
+		// File descriptors to support cross user namespace mounting
+		if userNsMountFd == nil {
+			return nil, fmt.Errorf("user mount fd array should not be null")
+		}
+		r.AddData(&Int32msg{
+			Type:  MountFdProc,
+			Value: uint32(userNsMountFd[0]),
+		})
+		r.AddData(&Int32msg{
+			Type:  MountFdSys,
+			Value: uint32(userNsMountFd[1]),
+		})
+		r.AddData(&Int32msg{
+			Type:  MountFdMqueue,
+			Value: uint32(userNsMountFd[2]),
+		})
 	}
 
 	return bytes.NewReader(r.Serialize()), nil
diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go
index 6d1107e875d..bbd55102e74 100644
--- a/libcontainer/message_linux.go
+++ b/libcontainer/message_linux.go
@@ -22,6 +22,9 @@ const (
 	UidmapPathAttr   uint16 = 27288
 	GidmapPathAttr   uint16 = 27289
 	MountSourcesAttr uint16 = 27290
+	MountFdProc      uint16 = 27291
+	MountFdSys       uint16 = 27292
+	MountFdMqueue    uint16 = 27293
 )
 
 type Int32msg struct {
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 9ecf791e93f..5bd17223af7 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -94,6 +94,11 @@ struct nlconfig_t {
 	/* Mount sources opened outside the container userns. */
 	char *mountsources;
 	size_t mountsources_len;
+
+	/* File descriptors to support cross user namespace mounting */
+	int mount_fd_proc;
+	int mount_fd_sys;
+	int mount_fd_mqueue;
 };
 
 /*
@@ -127,6 +132,9 @@ static int loglevel = DEBUG;
 #define UIDMAPPATH_ATTR		27288
 #define GIDMAPPATH_ATTR		27289
 #define MOUNT_SOURCES_ATTR	27290
+#define MOUNT_FD_PROC		27291
+#define MOUNT_FD_SYS		27292
+#define MOUNT_FD_MQUEUE		27293
 
 /*
  * Use the raw syscall for versions of glibc which don't include a function for
@@ -552,6 +560,15 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 			config->mountsources = current;
 			config->mountsources_len = payload_len;
 			break;
+		case MOUNT_FD_PROC:
+			config->mount_fd_proc = readint32(current);
+			break;
+		case MOUNT_FD_SYS:
+			config->mount_fd_sys = readint32(current);
+			break;
+		case MOUNT_FD_MQUEUE:
+			config->mount_fd_mqueue = readint32(current);
+			break;
 		default:
 			bail("unknown netlink message type %d", nlattr->nla_type);
 		}

From 56d878e3875fdb37a48e4e5aadf9d264f9020647 Mon Sep 17 00:00:00 2001
From: Jiang Liu <gerry@linux.alibaba.com>
Date: Thu, 12 May 2022 21:11:29 +0800
Subject: [PATCH 2/5] rootfs_linux: enable support of MoveMount()

Enhance mountToRootfs() to support MoveMount(), so it could be used
to support cross user namespace mounting.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
 libcontainer/configs/mount.go |  4 ++++
 libcontainer/rootfs_linux.go  | 15 ++++++++++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/libcontainer/configs/mount.go b/libcontainer/configs/mount.go
index b4c616d5538..19159d1b90a 100644
--- a/libcontainer/configs/mount.go
+++ b/libcontainer/configs/mount.go
@@ -40,3 +40,7 @@ type Mount struct {
 func (m *Mount) IsBind() bool {
 	return m.Flags&unix.MS_BIND != 0
 }
+
+func (m *Mount) IsMove() bool {
+	return m.Flags&unix.MS_MOVE != 0
+}
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 2a98372b561..61b7d657a3a 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -400,12 +400,25 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error {
 			return err
 		}
 		// Selinux kernels do not support labeling of /proc or /sys
+		if m.IsMove() && *c.fd >= 0 {
+			// fallback to normal mount if MoveMount() fails
+			if err := unix.MoveMount(*c.fd, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err == nil {
+				return nil
+			}
+		}
 		return mountPropagate(m, rootfs, "", nil)
 	case "mqueue":
 		if err := os.MkdirAll(dest, 0o755); err != nil {
 			return err
 		}
-		if err := mountPropagate(m, rootfs, "", nil); err != nil {
+		if m.IsMove() && *c.fd >= 0 {
+			// fallback to normal mount if MoveMount() fails
+			if err := unix.MoveMount(*c.fd, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
+				if err := mountPropagate(m, rootfs, "", nil); err != nil {
+					return err
+				}
+			}
+		} else if err := mountPropagate(m, rootfs, "", nil); err != nil {
 			return err
 		}
 		return label.SetFileLabel(dest, mountLabel)

From 844b8042f9191092506223db87d1a2123cdd979e Mon Sep 17 00:00:00 2001
From: Jiang Liu <gerry@linux.alibaba.com>
Date: Thu, 12 May 2022 22:34:03 +0800
Subject: [PATCH 3/5] nsexec: split join_namespaces() into stages

Introduce struct namespace_info_t to split join_namespaces() in stages,
so it could be reused later.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
 libcontainer/nsenter/nsexec.c | 167 ++++++++++++++++++++++------------
 1 file changed, 109 insertions(+), 58 deletions(-)

diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 5bd17223af7..1d9767b5f1f 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -582,71 +582,118 @@ void nl_free(struct nlconfig_t *config)
 	free(config->data);
 }
 
-void join_namespaces(char *nslist)
+#define MAX_NAMSPACE_TYPE_LEN		64
+#define MAX_NAMESPACES				10
+
+struct namespace_t {
+	int target_ns_fd;
+	int flag;
+	char type[MAX_NAMSPACE_TYPE_LEN];
+	char path[PATH_MAX];
+};
+
+struct namespace_info_t {
+	int namespace_cnt;
+	struct namespace_t *namespaces[MAX_NAMESPACES];
+};
+
+static struct namespace_info_t* parse_namespace_info(char *nslist)
 {
-	int num = 0, i;
-	char *saveptr = NULL;
-	char *namespace = strtok_r(nslist, ",", &saveptr);
-	struct namespace_t {
-		int fd;
-		char type[PATH_MAX];
-		char path[PATH_MAX];
-	} *namespaces = NULL;
-
-	if (!namespace || !strlen(namespace) || !strlen(nslist))
-		bail("ns paths are empty");
+	int i;
+	struct namespace_info_t *ns_info;
 
-	/*
-	 * We have to open the file descriptors first, since after
-	 * we join the mnt namespace we might no longer be able to
-	 * access the paths.
-	 */
-	do {
-		int fd;
-		char *path;
-		struct namespace_t *ns;
-
-		/* Resize the namespace array. */
-		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
-		if (!namespaces)
-			bail("failed to reallocate namespace array");
-		ns = &namespaces[num - 1];
-
-		/* Split 'ns:path'. */
-		path = strstr(namespace, ":");
-		if (!path)
-			bail("failed to parse %s", namespace);
-		*path++ = '\0';
-
-		fd = open(path, O_RDONLY);
-		if (fd < 0)
-			bail("failed to open %s", path);
+	ns_info = malloc(sizeof(*ns_info));
+	if (ns_info == NULL) {
+		bail("Can't allocate memory for namespace_info.");
+	}
+	for (i = 0; i < MAX_NAMESPACES; i++) {
+		ns_info->namespaces[i]->target_ns_fd = -1;
+	}
 
-		ns->fd = fd;
-		strncpy(ns->type, namespace, PATH_MAX - 1);
-		strncpy(ns->path, path, PATH_MAX - 1);
-		ns->path[PATH_MAX - 1] = '\0';
-	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
+	if (nslist != NULL) {
+		char *saveptr = NULL;
+		char *namespace = strtok_r(nslist, ",", &saveptr);
 
-	/*
-	 * The ordering in which we join namespaces is important. We should
-	 * always join the user namespace *first*. This is all guaranteed
-	 * from the container_linux.go side of this, so we're just going to
-	 * follow the order given to us.
-	 */
+		if (!namespace || !strlen(namespace) || !strlen(nslist))
+			bail("ns paths are empty");
 
-	for (i = 0; i < num; i++) {
-		struct namespace_t *ns = &namespaces[i];
-		int flag = nsflag(ns->type);
+		/*
+		 * We have to open the file descriptors first, since after
+		 * we join the mnt namespace we might no longer be able to
+		 * access the paths.
+		 */
+		do {
+			int fd;
+			char *path;
+			struct namespace_t *ns;
+
+			/* Allocate a new namespace entry. */
+			ns = malloc(sizeof(struct namespace_t));
+			if (!ns)
+				bail("failed to reallocate namespace array");
+
+			/* Split 'ns:path'. */
+			path = strstr(namespace, ":");
+			if (!path)
+				bail("failed to parse %s", namespace);
+			*path++ = '\0';
+
+			fd = open(path, O_RDONLY);
+			if (fd < 0)
+				bail("failed to open %s", path);
+
+			ns->target_ns_fd = fd;
+			strncpy(ns->type, namespace, MAX_NAMSPACE_TYPE_LEN - 1);
+			ns->type[MAX_NAMSPACE_TYPE_LEN - 1] = '\0';
+			strncpy(ns->path, path, PATH_MAX - 1);
+			ns->path[PATH_MAX - 1] = '\0';
+			ns->flag = nsflag(ns->type);
+
+			if (ns_info->namespace_cnt >= MAX_NAMESPACES)
+				bail("too many namespace configured");
+			ns_info->namespaces[ns_info->namespace_cnt] = ns;
+			ns_info->namespace_cnt++;
+		} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
+	}
 
-		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
-		if (setns(ns->fd, flag) < 0)
-			bail("failed to setns into %s namespace", ns->type);
+	return ns_info;
+}
+
+/*
+ * The ordering in which we join namespaces is important. We should
+ * always join the user namespace *first*. This is all guaranteed
+ * from the container_linux.go side of this, so we're just going to
+ * follow the order given to us.
+ */
+static void join_namespaces(struct namespace_info_t *ns_info)
+{
+	int i;
+
+	for (i = 0; i < ns_info->namespace_cnt; i++) {
+		struct namespace_t *ns = ns_info->namespaces[i];
+
+		if (ns->target_ns_fd >= 0) {
+			write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", ns->flag, ns->type, ns->path);
+			if (setns(ns->target_ns_fd, ns->flag) < 0)
+				bail("failed to setns into %s namespace", ns->type);
+		}
+	}
+}
+
+static void free_namespace_info(struct namespace_info_t *ns_info)
+{
+	int i;
 
-		close(ns->fd);
+	for (i = 0; i < ns_info->namespace_cnt; i++) {
+		struct namespace_t *ns = ns_info->namespaces[i];
+
+		if (ns->target_ns_fd >= 0) {
+			close(ns->target_ns_fd);
+		}
+		free(ns);
 	}
 
-	free(namespaces);
+	free(ns_info);
 }
 
 /* Defined in cloned_binary.c. */
@@ -1146,6 +1193,7 @@ void nsexec(void)
 	case STAGE_CHILD:{
 			pid_t stage2_pid = -1;
 			enum sync_t s;
+			struct namespace_info_t *ns_info;
 
 			/* We're in a child and thus need to tell the parent if we die. */
 			syncfd = sync_child_pipe[0];
@@ -1162,8 +1210,11 @@ void nsexec(void)
 			 * [stage 2: STAGE_INIT]) would be meaningless). We could send it
 			 * using cmsg(3) but that's just annoying.
 			 */
-			if (config.namespaces)
-				join_namespaces(config.namespaces);
+			ns_info = parse_namespace_info(config.namespaces);
+			if (ns_info->namespace_cnt > 0)
+				join_namespaces(ns_info);
+			free_namespace_info(ns_info);
+			ns_info = NULL;
 
 			/*
 			 * Deal with user namespaces first. They are quite special, as they

From 2d5bdfb1332a3ff5f88ef1d022696c47eea95fbd Mon Sep 17 00:00:00 2001
From: Jiang Liu <gerry@linux.alibaba.com>
Date: Fri, 13 May 2022 00:34:17 +0800
Subject: [PATCH 4/5] nsexec: prepare mount fds for cross user namespace
 mounting

Prepare source mount fds for move_mount() to support cross user
namespace mounting.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
---
 libcontainer/nsenter/nsexec.c | 140 ++++++++++++++++++++++++++++++++++
 1 file changed, 140 insertions(+)

diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 1d9767b5f1f..f6933396b95 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -18,6 +18,7 @@
 #include <unistd.h>
 
 #include <sys/ioctl.h>
+#include <sys/mount.h>
 #include <sys/prctl.h>
 #include <sys/socket.h>
 #include <sys/types.h>
@@ -587,6 +588,7 @@ void nl_free(struct nlconfig_t *config)
 
 struct namespace_t {
 	int target_ns_fd;
+	int original_ns_fd;
 	int flag;
 	char type[MAX_NAMSPACE_TYPE_LEN];
 	char path[PATH_MAX];
@@ -608,6 +610,7 @@ static struct namespace_info_t* parse_namespace_info(char *nslist)
 	}
 	for (i = 0; i < MAX_NAMESPACES; i++) {
 		ns_info->namespaces[i]->target_ns_fd = -1;
+		ns_info->namespaces[i]->original_ns_fd = -1;
 	}
 
 	if (nslist != NULL) {
@@ -690,12 +693,147 @@ static void free_namespace_info(struct namespace_info_t *ns_info)
 		if (ns->target_ns_fd >= 0) {
 			close(ns->target_ns_fd);
 		}
+		if (ns->original_ns_fd >= 0) {
+			close(ns->original_ns_fd);
+		}
 		free(ns);
 	}
 
 	free(ns_info);
 }
 
+#ifndef OPEN_TREE_CLONE
+#define OPEN_TREE_CLONE					1
+#endif
+
+#ifndef OPEN_TREE_CLOEXEC
+#define OPEN_TREE_CLOEXEC				O_CLOEXEC
+#endif
+
+#ifndef __NR_open_tree
+#define __NR_open_tree					428
+#endif
+
+// container_linux.go ensures that syscall open_tree is available.
+static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
+{
+	return syscall(__NR_open_tree, dfd, filename, flags);
+}
+
+static void cleanup_mount_fds_tempdir(char *dirname)
+{
+	umount2(dirname, MNT_FORCE | MNT_DETACH);
+	rmdir(dirname);
+}
+
+static void prepare_mount_fd(char *dirname, char *type, int target_fd, int flags)
+{
+	int fd, ret;
+	char path[PATH_MAX];
+
+	if (snprintf(path, PATH_MAX, "%s/%s", dirname, type) < 0) {
+		cleanup_mount_fds_tempdir(dirname);
+		bail("failed to prepare temp directory path for %s", type);
+	}
+
+	if (mount(type, path, type, flags, NULL) < 0) {
+		rmdir(path);
+		cleanup_mount_fds_tempdir(dirname);
+		bail("failed to mount %s onto temp directory", type);
+	}
+
+	fd = sys_open_tree(-1, path, OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE | AT_RECURSIVE);
+	umount2(path,  MNT_FORCE | MNT_DETACH);
+	rmdir(path);
+	if (fd < 0) {
+		cleanup_mount_fds_tempdir(dirname);
+		bail("failed to open temporary mountpoint of %s", type);
+	}
+
+	ret = dup3(fd, target_fd, O_CLOEXEC);
+	close(fd);
+	if (ret < 0) {
+		cleanup_mount_fds_tempdir(dirname);
+		bail("failed to duplicate the mount fd for %s", type);
+	}
+}
+
+/* Prepare temporary mounts for move_mount() */
+static void prepare_mount_fds(struct namespace_info_t *ns_info, struct nlconfig_t *config)
+{
+	int i, fd, ret, flags;
+	char name[PATH_MAX];
+	char dirname_buf[PATH_MAX];
+	char *dirname;
+
+	// Get handles of current namespaces except user namespace
+	for (i = 0; i < ns_info->namespace_cnt; i++) {
+		struct namespace_t *ns = ns_info->namespaces[i];
+
+		if (ns->target_ns_fd >= 0 && ns->flag != 0 && ns->flag != CLONE_NEWUSER) {
+			if (snprintf(name, PATH_MAX, "/proc/self/ns/%s", ns->type) < 0)
+				bail("failed to prepare file path for current %s namespace", ns->type);
+			fd = open(name, O_RDONLY | O_CLOEXEC);
+			if (fd == -1)
+				bail("failed to open current %s namespace", ns->type);
+			ns->original_ns_fd = fd;
+		}
+	}
+
+	// Join target namespaces except user namespace
+	for (i = 0; i < ns_info->namespace_cnt; i++) {
+		struct namespace_t *ns = ns_info->namespaces[i];
+
+		if (ns->target_ns_fd >= 0 && ns->flag != 0 && ns->flag != CLONE_NEWUSER) {
+			if (setns(ns->target_ns_fd, ns->flag) < 0)
+				bail("failed to setns into %s namespace", ns->type);
+		}
+	}
+
+	// Create a temp working directory, similar to prepareTmp()
+	strcpy(dirname_buf, "/tmp/runc-mountfds-XXXXXX");
+	dirname = mkdtemp(dirname_buf);
+	if (dirname == NULL)
+		bail("failed to create temporary directory for mount fds");
+	ret = mount(dirname, dirname, "bind", MS_BIND, NULL);
+	if (ret < 0) {
+		cleanup_mount_fds_tempdir(dirname);
+		bail("failed to bind mount temporary directory for mount fds");
+	}
+	ret = mount(dirname, dirname, NULL, MS_PRIVATE, NULL);
+	if (ret < 0) {
+		cleanup_mount_fds_tempdir(dirname);
+		bail("failed to bind mount temporary directory as private for mount fds");
+	}
+
+	if (config->mount_fd_proc >= 0) {
+		flags = MS_NOSUID | MS_NOEXEC | MS_NODEV;
+		prepare_mount_fd(dirname, "proc", config->mount_fd_proc, flags);
+	}
+	if (config->mount_fd_sys >= 0) {
+		flags = MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RDONLY;
+		prepare_mount_fd(dirname, "sysfs", config->mount_fd_sys, flags);
+	}
+	if (config->mount_fd_mqueue >= 0) {
+		flags = MS_NOSUID | MS_NOEXEC | MS_NODEV;
+		prepare_mount_fd(dirname, "mqueue", config->mount_fd_mqueue, flags);
+	}
+
+	cleanup_mount_fds_tempdir(dirname);
+
+	// Rejoin original namespaces except user namespace
+	for (i = 0; i < ns_info->namespace_cnt; i++) {
+		struct namespace_t *ns = ns_info->namespaces[i];
+
+		if (ns->original_ns_fd >= 0) {
+			if (setns(ns->original_ns_fd, ns->flag) < 0)
+				bail("failed to setns into %s namespace", ns->type);
+			close(ns->original_ns_fd);
+			ns->original_ns_fd = -1;
+		}
+	}
+}
+
 /* Defined in cloned_binary.c. */
 extern int ensure_cloned_binary(void);
 
@@ -1211,6 +1349,8 @@ void nsexec(void)
 			 * using cmsg(3) but that's just annoying.
 			 */
 			ns_info = parse_namespace_info(config.namespaces);
+			if (config.mount_fd_proc >= 0 || config.mount_fd_sys >= 0 || config.mount_fd_mqueue >= 0)
+				prepare_mount_fds(ns_info, &config);
 			if (ns_info->namespace_cnt > 0)
 				join_namespaces(ns_info);
 			free_namespace_info(ns_info);

From 6dfc4ffe1c009d90d9eb57d09aaca5b9320448c1 Mon Sep 17 00:00:00 2001
From: Tingting Yang <shidao@linux.alibaba.com>
Date: Tue, 27 Sep 2022 13:03:26 +0800
Subject: [PATCH 5/5] Enable cross user namespace mounting

When a user namespace is enabled for a pod/container, it may fail to
mount /proc, /sys and /dev/mqueue under certain conditions. This may
be solved by enabling cross user namespace mounting.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
Signed-off-by: shidao.ytt <shidao.ytt.kernel@linux.alibaba.com>
---
 libcontainer/container_linux.go | 46 ++++++++++++++++++++-
 libcontainer/nsenter/nsexec.c   | 71 +++++++++++++++++++++++++++------
 2 files changed, 102 insertions(+), 15 deletions(-)

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index 1530cd69a76..96091888b7b 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -520,7 +520,7 @@ func (c *Container) shouldSendMountSources() bool {
 
 	// We need to send sources if there are bind-mounts.
 	for _, m := range c.config.Mounts {
-		if m.IsBind() {
+		if m.IsBind() || m.IsMove() {
 			return true
 		}
 	}
@@ -536,15 +536,46 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
 			nsMaps[ns.Type] = ns.Path
 		}
 	}
+
+	nsList := make(map[configs.NamespaceType]string)
+	for _, ns := range c.config.Namespaces {
+		nsList[ns.Type] = ns.Path
+	}
+
 	_, sharePidns := nsMaps[configs.NEWPID]
 	userNsMountFds := [3]int{-1, -1, -1}
 
+	// Enable open_tree()/move_mount() for special filesystems to support cross user namespace mounting.
+	if _, ok := nsList[configs.NEWUSER]; ok {
+		for idx, m := range c.config.Mounts {
+			if m.Device == "proc" && m.Source == "proc" && m.Destination == "/proc" &&
+				m.Flags == unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID {
+				// procfs depends on Pid namespace
+				if _, exist := nsList[configs.NEWPID]; exist {
+				}
+			} else if m.Device == "sysfs" && m.Source == "sysfs" && m.Destination == "/sys" &&
+				m.Flags == unix.MS_NODEV|unix.MS_NOSUID|unix.MS_NOEXEC|unix.MS_RDONLY {
+				// sysfs depends on the net namespace, so skip the case
+				// where runc itself unshares a new netns (empty path)
+				if path, exist := nsList[configs.NEWNET]; !exist || path != "" {
+					c.config.Mounts[idx].Flags |= unix.MS_MOVE
+				}
+			} else if m.Device == "mqueue" && m.Source == "mqueue" && m.Destination == "/dev/mqueue" &&
+				m.Flags == unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID {
+				// mqueue is tied to the IPC namespace: only pre-mount it when joining an existing one (non-empty path)
+				if path, exist := nsList[configs.NEWIPC]; exist && path != "" {
+					c.config.Mounts[idx].Flags |= unix.MS_MOVE
+				}
+			}
+		}
+	}
+
 	if c.shouldSendMountSources() {
 		// Elements on this slice will be paired with mounts (see StartInitialization() and
 		// prepareRootfs()). This slice MUST have the same size as c.config.Mounts.
 		mountFds := make([]int, len(c.config.Mounts))
 		for i, m := range c.config.Mounts {
-			if !m.IsBind() {
+			if !m.IsBind() && !m.IsMove() {
 				// Non bind-mounts do not use an fd.
 				mountFds[i] = -1
 				continue
@@ -556,6 +587,17 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l
 			// lifecycle of that fd is already taken care of.
 			cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child)
 			mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
+
+			// MS_MOVE flag is set for cross user namespace mounting
+			if m.IsMove() {
+				if m.Device == "proc" {
+					userNsMountFds[0] = mountFds[i]
+				} else if m.Device == "sysfs" {
+					userNsMountFds[1] = mountFds[i]
+				} else if m.Device == "mqueue" {
+					userNsMountFds[2] = mountFds[i]
+				}
+			}
 		}
 
 		mountFdsJson, err := json.Marshal(mountFds)
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index f6933396b95..5ce0a2f398c 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -27,12 +27,15 @@
 #include <linux/limits.h>
 #include <linux/netlink.h>
 #include <linux/types.h>
+#include <sys/stat.h>
 
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
 
 extern char *escape_json_string(char *str);
 
+#define AT_RECURSIVE 0x8000
+
 /* Synchronisation values. */
 enum sync_t {
 	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
@@ -486,6 +489,12 @@ static uint8_t readint8(char *buf)
 	return *(uint8_t *) buf;
 }
 
+static void set_default_config(struct nlconfig_t *config) {
+	config->mount_fd_proc = -1;
+	config->mount_fd_sys = -1;
+	config->mount_fd_mqueue = -1;
+}
+
 static void nl_parse(int fd, struct nlconfig_t *config)
 {
 	size_t len, size;
@@ -590,6 +599,7 @@ struct namespace_t {
 	int target_ns_fd;
 	int original_ns_fd;
 	int flag;
+	int done;
 	char type[MAX_NAMSPACE_TYPE_LEN];
 	char path[PATH_MAX];
 };
@@ -601,17 +611,13 @@ struct namespace_info_t {
 
 static struct namespace_info_t* parse_namespace_info(char *nslist)
 {
-	int i;
 	struct namespace_info_t *ns_info;
 
 	ns_info = malloc(sizeof(*ns_info));
 	if (ns_info == NULL) {
 		bail("Can't allocate memory for namespace_info.");
 	}
-	for (i = 0; i < MAX_NAMESPACES; i++) {
-		ns_info->namespaces[i]->target_ns_fd = -1;
-		ns_info->namespaces[i]->original_ns_fd = -1;
-	}
+	memset(ns_info, 0, sizeof(*ns_info));
 
 	if (nslist != NULL) {
 		char *saveptr = NULL;
@@ -634,6 +640,9 @@ static struct namespace_info_t* parse_namespace_info(char *nslist)
 			ns = malloc(sizeof(struct namespace_t));
 			if (!ns)
 				bail("failed to reallocate namespace array");
+			memset(ns, 0, sizeof(struct namespace_t));
+			ns->target_ns_fd = -1;
+			ns->original_ns_fd = -1;
 
 			/* Split 'ns:path'. */
 			path = strstr(namespace, ":");
@@ -671,14 +680,35 @@ static struct namespace_info_t* parse_namespace_info(char *nslist)
 static void join_namespaces(struct namespace_info_t *ns_info)
 {
 	int i;
+	int userns_idx = -1;
 
 	for (i = 0; i < ns_info->namespace_cnt; i++) {
 		struct namespace_t *ns = ns_info->namespaces[i];
+		if (!strcmp(ns->type, "user")) {
+			userns_idx = i;
+			break;
+		}
+	}
 
-		if (ns->target_ns_fd >= 0) {
+	if (userns_idx >= 0) {
+		for (i = 0; i < ns_info->namespace_cnt; i++) {
+			struct namespace_t *ns = ns_info->namespaces[i];
+
+			if (i != userns_idx && ns->target_ns_fd >= 0) {
+				write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", ns->flag, ns->type, ns->path);
+				if (setns(ns->target_ns_fd, ns->flag) >= 0)
+					ns->done = 1;
+			}
+		}
+	}
+
+	for (i = 0; i < ns_info->namespace_cnt; i++) {
+		struct namespace_t *ns = ns_info->namespaces[i];
+
+		if (ns->target_ns_fd >= 0 && !ns->done) {
 			write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", ns->flag, ns->type, ns->path);
 			if (setns(ns->target_ns_fd, ns->flag) < 0)
-				bail("failed to setns into %s namespace", ns->type);
+				bail("failed to setns into %s namespace, error %d", ns->type, errno);
 		}
 	}
 }
@@ -735,11 +765,18 @@ static void prepare_mount_fd(char *dirname, char *type, int target_fd, int flags
 		cleanup_mount_fds_tempdir(dirname);
 		bail("failed to prepare temp directory path for %s", type);
 	}
+	ret = mkdir(path, 0700);
+	if (ret < 0) {
+		/* Nothing was created at path; only the parent temp dir needs cleanup. */
+		cleanup_mount_fds_tempdir(dirname);
+		bail("failed to create temp mount point for %s, ret %d", type, ret);
+	}
 
-	if (mount(type, path, type, flags, NULL) < 0) {
+    ret = mount(type, path, type, flags, NULL);
+	if ( ret < 0) {
 		rmdir(path);
 		cleanup_mount_fds_tempdir(dirname);
-		bail("failed to mount %s onto temp directory", type);
+		bail("failed to mount %s onto temp directory, ret %d", type, ret);
 	}
 
 	fd = sys_open_tree(-1, path, OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE | AT_RECURSIVE);
@@ -783,7 +820,8 @@ static void prepare_mount_fds(struct namespace_info_t *ns_info, struct nlconfig_
 	// Join target namespaces except user namespace
 	for (i = 0; i < ns_info->namespace_cnt; i++) {
 		struct namespace_t *ns = ns_info->namespaces[i];
-
+        if (!strcmp(ns->type, "user"))
+            continue;
 		if (ns->target_ns_fd >= 0 && ns->flag != 0 && ns->flag != CLONE_NEWUSER) {
 			if (setns(ns->target_ns_fd, ns->flag) < 0)
 				bail("failed to setns into %s namespace", ns->type);
@@ -824,7 +862,8 @@ static void prepare_mount_fds(struct namespace_info_t *ns_info, struct nlconfig_
 	// Rejoin original namespaces except user namespace
 	for (i = 0; i < ns_info->namespace_cnt; i++) {
 		struct namespace_t *ns = ns_info->namespaces[i];
-
+        if (!strcmp(ns->type, "user"))
+            continue;
 		if (ns->original_ns_fd >= 0) {
 			if (setns(ns->original_ns_fd, ns->flag) < 0)
 				bail("failed to setns into %s namespace", ns->type);
@@ -940,7 +979,7 @@ void send_fd(int sockfd, int fd)
 		bail("failed to send fd %d via unix socket %d", fd, sockfd);
 }
 
-void receive_mountsources(int sockfd)
+void receive_mountsources(int sockfd, struct nlconfig_t *config)
 {
 	char *mount_fds, *endp;
 	long new_fd;
@@ -970,6 +1009,11 @@ void receive_mountsources(int sockfd)
 			bail("malformed _LIBCONTAINER_MOUNT_FDS env var: fds out of range");
 		}
 
+		// Skip file descriptors for cross user namespace mounting
+		if (new_fd == config->mount_fd_proc || new_fd == config->mount_fd_sys || new_fd == config->mount_fd_mqueue) {
+			continue;
+		}
+
 		receive_fd(sockfd, new_fd);
 	}
 }
@@ -1074,6 +1118,7 @@ void nsexec(void)
 	write_log(DEBUG, "=> nsexec container setup");
 
 	/* Parse all of the netlink configuration. */
+	set_default_config(&config);
 	nl_parse(pipenum, &config);
 
 	/* Set oom_score_adj. This has to be done before !dumpable because
@@ -1443,7 +1488,7 @@ void nsexec(void)
 				}
 
 				/* Receive and install all mount sources fds. */
-				receive_mountsources(syncfd);
+				receive_mountsources(syncfd, &config);
 
 				/* Parent finished to send the mount sources fds. */
 				if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {