From 08c266d89a08c7a5f28e127b2d5b41ecc10ec78c Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 12 May 2022 20:45:47 +0800 Subject: [PATCH 1/5] extend bootstrap message to pass mount fds Extend bootstrap message to pass mount fds for open_tree()/move_mount(). Signed-off-by: Jiang Liu --- libcontainer/container_linux.go | 32 ++++++++++++++++++++++++++------ libcontainer/message_linux.go | 3 +++ libcontainer/nsenter/nsexec.c | 17 +++++++++++++++++ 3 files changed, 46 insertions(+), 6 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 34f38f34e0c..1530cd69a76 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -537,10 +537,7 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l } } _, sharePidns := nsMaps[configs.NEWPID] - data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) - if err != nil { - return nil, err - } + userNsMountFds := [3]int{-1, -1, -1} if c.shouldSendMountSources() { // Elements on this slice will be paired with mounts (see StartInitialization() and @@ -571,6 +568,11 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l ) } + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard, &userNsMountFds) + if err != nil { + return nil, err + } + init := &initProcess{ cmd: cmd, messageSockPair: messageSockPair, @@ -595,7 +597,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, messageSockPair, } // for setns process, we don't have to set cloneflags as the process namespaces // will only be set via setns syscall - data, err := c.bootstrapData(0, state.NamespacePaths, initSetns) + data, err := c.bootstrapData(0, state.NamespacePaths, initSetns, nil) if err != nil { return nil, err } @@ -2119,7 +2121,8 @@ type netlinkError struct{ error } // such as one that uses nsenter package to bootstrap the container's // init process correctly, i.e. 
with correct namespaces, uid/gid // mapping etc. -func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) { +func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, + it initType, userNsMountFd *[3]int) (_ io.Reader, Err error) { // create the netlink message r := nl.NewNetlinkRequest(int(InitMsg), 0) @@ -2238,6 +2241,23 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa Type: MountSourcesAttr, Value: mounts, }) + + // File descriptors to support cross user namespace mounting + if userNsMountFd == nil { + return nil, fmt.Errorf("user mount fd array should not be null") + } + r.AddData(&Int32msg{ + Type: MountFdProc, + Value: uint32(userNsMountFd[0]), + }) + r.AddData(&Int32msg{ + Type: MountFdSys, + Value: uint32(userNsMountFd[1]), + }) + r.AddData(&Int32msg{ + Type: MountFdMqueue, + Value: uint32(userNsMountFd[2]), + }) } return bytes.NewReader(r.Serialize()), nil diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 6d1107e875d..bbd55102e74 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -22,6 +22,9 @@ const ( UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 MountSourcesAttr uint16 = 27290 + MountFdProc uint16 = 27291 + MountFdSys uint16 = 27292 + MountFdMqueue uint16 = 27293 ) type Int32msg struct { diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 9ecf791e93f..5bd17223af7 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -94,6 +94,11 @@ struct nlconfig_t { /* Mount sources opened outside the container userns. 
*/ char *mountsources; size_t mountsources_len; + + /* File descriptors to support cross user namespace mounting */ + int mount_fd_proc; + int mount_fd_sys; + int mount_fd_mqueue; }; /* @@ -127,6 +132,9 @@ static int loglevel = DEBUG; #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 #define MOUNT_SOURCES_ATTR 27290 +#define MOUNT_FD_PROC 27291 +#define MOUNT_FD_SYS 27292 +#define MOUNT_FD_MQUEUE 27293 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -552,6 +560,15 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->mountsources = current; config->mountsources_len = payload_len; break; + case MOUNT_FD_PROC: + config->mount_fd_proc = readint32(current); + break; + case MOUNT_FD_SYS: + config->mount_fd_sys = readint32(current); + break; + case MOUNT_FD_MQUEUE: + config->mount_fd_mqueue = readint32(current); + break; default: bail("unknown netlink message type %d", nlattr->nla_type); } From 56d878e3875fdb37a48e4e5aadf9d264f9020647 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 12 May 2022 21:11:29 +0800 Subject: [PATCH 2/5] rootfs_linux: enable support of MoveMount() Enhance mountToRootfs() to support MoveMount(), so it could be used to support cross user namespace mounting. 
Signed-off-by: Jiang Liu --- libcontainer/configs/mount.go | 4 ++++ libcontainer/rootfs_linux.go | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/libcontainer/configs/mount.go b/libcontainer/configs/mount.go index b4c616d5538..19159d1b90a 100644 --- a/libcontainer/configs/mount.go +++ b/libcontainer/configs/mount.go @@ -40,3 +40,7 @@ type Mount struct { func (m *Mount) IsBind() bool { return m.Flags&unix.MS_BIND != 0 } + +func (m *Mount) IsMove() bool { + return m.Flags&unix.MS_MOVE != 0 +} diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 2a98372b561..61b7d657a3a 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -400,12 +400,25 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { return err } // Selinux kernels do not support labeling of /proc or /sys + if m.IsMove() && *c.fd >= 0 { + // fallback to normal mount if MoveMount() fails + if err := unix.MoveMount(*c.fd, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err == nil { + return nil + } + } return mountPropagate(m, rootfs, "", nil) case "mqueue": if err := os.MkdirAll(dest, 0o755); err != nil { return err } - if err := mountPropagate(m, rootfs, "", nil); err != nil { + if m.IsMove() && *c.fd >= 0 { + // fallback to normal mount if MoveMount() fails + if err := unix.MoveMount(*c.fd, "", -1, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil { + if err := mountPropagate(m, rootfs, "", nil); err != nil { + return err + } + } + } else if err := mountPropagate(m, rootfs, "", nil); err != nil { return err } return label.SetFileLabel(dest, mountLabel) From 844b8042f9191092506223db87d1a2123cdd979e Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Thu, 12 May 2022 22:34:03 +0800 Subject: [PATCH 3/5] nsexec: split join_namespaces() into stages Introduce struct namespace_info_t to split join_namespaces() in stages, so it could be reused later. 
Signed-off-by: Jiang Liu --- libcontainer/nsenter/nsexec.c | 167 ++++++++++++++++++++++------------ 1 file changed, 109 insertions(+), 58 deletions(-) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 5bd17223af7..1d9767b5f1f 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -582,71 +582,118 @@ void nl_free(struct nlconfig_t *config) free(config->data); } -void join_namespaces(char *nslist) +#define MAX_NAMSPACE_TYPE_LEN 64 +#define MAX_NAMESPACES 10 + +struct namespace_t { + int target_ns_fd; + int flag; + char type[MAX_NAMSPACE_TYPE_LEN]; + char path[PATH_MAX]; +}; + +struct namespace_info_t { + int namespace_cnt; + struct namespace_t *namespaces[MAX_NAMESPACES]; +}; + +static struct namespace_info_t* parse_namespace_info(char *nslist) { - int num = 0, i; - char *saveptr = NULL; - char *namespace = strtok_r(nslist, ",", &saveptr); - struct namespace_t { - int fd; - char type[PATH_MAX]; - char path[PATH_MAX]; - } *namespaces = NULL; - - if (!namespace || !strlen(namespace) || !strlen(nslist)) - bail("ns paths are empty"); + int i; + struct namespace_info_t *ns_info; - /* - * We have to open the file descriptors first, since after - * we join the mnt namespace we might no longer be able to - * access the paths. - */ - do { - int fd; - char *path; - struct namespace_t *ns; - - /* Resize the namespace array. */ - namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t)); - if (!namespaces) - bail("failed to reallocate namespace array"); - ns = &namespaces[num - 1]; - - /* Split 'ns:path'. 
*/ - path = strstr(namespace, ":"); - if (!path) - bail("failed to parse %s", namespace); - *path++ = '\0'; - - fd = open(path, O_RDONLY); - if (fd < 0) - bail("failed to open %s", path); + ns_info = malloc(sizeof(*ns_info)); + if (ns_info == NULL) { + bail("Can't allocate memory for namespace_info."); + } + for (i = 0; i < MAX_NAMESPACES; i++) { + ns_info->namespaces[i]->target_ns_fd = -1; + } - ns->fd = fd; - strncpy(ns->type, namespace, PATH_MAX - 1); - strncpy(ns->path, path, PATH_MAX - 1); - ns->path[PATH_MAX - 1] = '\0'; - } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + if (nslist != NULL) { + char *saveptr = NULL; + char *namespace = strtok_r(nslist, ",", &saveptr); - /* - * The ordering in which we join namespaces is important. We should - * always join the user namespace *first*. This is all guaranteed - * from the container_linux.go side of this, so we're just going to - * follow the order given to us. - */ + if (!namespace || !strlen(namespace) || !strlen(nslist)) + bail("ns paths are empty"); - for (i = 0; i < num; i++) { - struct namespace_t *ns = &namespaces[i]; - int flag = nsflag(ns->type); + /* + * We have to open the file descriptors first, since after + * we join the mnt namespace we might no longer be able to + * access the paths. + */ + do { + int fd; + char *path; + struct namespace_t *ns; + + /* Resize the namespace array. */ + ns = malloc(sizeof(struct namespace_t)); + if (!ns) + bail("failed to reallocate namespace array"); + + /* Split 'ns:path'. 
*/ + path = strstr(namespace, ":"); + if (!path) + bail("failed to parse %s", namespace); + *path++ = '\0'; + + fd = open(path, O_RDONLY); + if (fd < 0) + bail("failed to open %s", path); + + ns->target_ns_fd = fd; + strncpy(ns->type, namespace, MAX_NAMSPACE_TYPE_LEN - 1); + ns->type[MAX_NAMSPACE_TYPE_LEN - 1] = '\0'; + strncpy(ns->path, path, PATH_MAX - 1); + ns->path[PATH_MAX - 1] = '\0'; + ns->flag = nsflag(ns->type); + + if (ns_info->namespace_cnt >= MAX_NAMESPACES) + bail("too many namespace configured"); + ns_info->namespaces[ns_info->namespace_cnt] = ns; + ns_info->namespace_cnt++; + } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); + } - write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); - if (setns(ns->fd, flag) < 0) - bail("failed to setns into %s namespace", ns->type); + return ns_info; +} + +/* + * The ordering in which we join namespaces is important. We should + * always join the user namespace *first*. This is all guaranteed + * from the container_linux.go side of this, so we're just going to + * follow the order given to us. + */ +static void join_namespaces(struct namespace_info_t *ns_info) +{ + int i; + + for (i = 0; i < ns_info->namespace_cnt; i++) { + struct namespace_t *ns = ns_info->namespaces[i]; + + if (ns->target_ns_fd >= 0) { + write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", ns->flag, ns->type, ns->path); + if (setns(ns->target_ns_fd, ns->flag) < 0) + bail("failed to setns into %s namespace", ns->type); + } + } +} + +static void free_namespace_info(struct namespace_info_t *ns_info) +{ + int i; - close(ns->fd); + for (i = 0; i < ns_info->namespace_cnt; i++) { + struct namespace_t *ns = ns_info->namespaces[i]; + + if (ns->target_ns_fd >= 0) { + close(ns->target_ns_fd); + } + free(ns); } - free(namespaces); + free(ns_info); } /* Defined in cloned_binary.c. 
*/ @@ -1146,6 +1193,7 @@ void nsexec(void) case STAGE_CHILD:{ pid_t stage2_pid = -1; enum sync_t s; + struct namespace_info_t *ns_info; /* We're in a child and thus need to tell the parent if we die. */ syncfd = sync_child_pipe[0]; @@ -1162,8 +1210,11 @@ void nsexec(void) * [stage 2: STAGE_INIT]) would be meaningless). We could send it * using cmsg(3) but that's just annoying. */ - if (config.namespaces) - join_namespaces(config.namespaces); + ns_info = parse_namespace_info(config.namespaces); + if (ns_info->namespace_cnt > 0) + join_namespaces(ns_info); + free_namespace_info(ns_info); + ns_info = NULL; /* * Deal with user namespaces first. They are quite special, as they From 2d5bdfb1332a3ff5f88ef1d022696c47eea95fbd Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 13 May 2022 00:34:17 +0800 Subject: [PATCH 4/5] nsexec: prepare mount fds for cross user namespace mounting Prepare source mount fds for move_mount() to support cross user namespace mounting. Signed-off-by: Jiang Liu --- libcontainer/nsenter/nsexec.c | 140 ++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 1d9767b5f1f..f6933396b95 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -587,6 +588,7 @@ void nl_free(struct nlconfig_t *config) struct namespace_t { int target_ns_fd; + int original_ns_fd; int flag; char type[MAX_NAMSPACE_TYPE_LEN]; char path[PATH_MAX]; @@ -608,6 +610,7 @@ static struct namespace_info_t* parse_namespace_info(char *nslist) } for (i = 0; i < MAX_NAMESPACES; i++) { ns_info->namespaces[i]->target_ns_fd = -1; + ns_info->namespaces[i]->original_ns_fd = -1; } if (nslist != NULL) { @@ -690,12 +693,147 @@ static void free_namespace_info(struct namespace_info_t *ns_info) if (ns->target_ns_fd >= 0) { close(ns->target_ns_fd); } + if (ns->original_ns_fd >= 0) { + close(ns->original_ns_fd); + 
} free(ns); } free(ns_info); } +#ifndef OPEN_TREE_CLONE +#define OPEN_TREE_CLONE 1 +#endif + +#ifndef OPEN_TREE_CLOEXEC +#define OPEN_TREE_CLOEXEC O_CLOEXEC +#endif + +#ifndef __NR_open_tree +#define __NR_open_tree 428 +#endif + +// container_linux.go ensures that syscall open_tree is available. +static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags) +{ + return syscall(__NR_open_tree, dfd, filename, flags); +} + +static void cleanup_mount_fds_tempdir(char *dirname) +{ + umount2(dirname, MNT_FORCE | MNT_DETACH); + rmdir(dirname); +} + +static void prepare_mount_fd(char *dirname, char *type, int target_fd, int flags) +{ + int fd, ret; + char path[PATH_MAX]; + + if (snprintf(path, PATH_MAX, "%s/%s", dirname, type) < 0) { + cleanup_mount_fds_tempdir(dirname); + bail("failed to prepare temp directory path for %s", type); + } + + if (mount(type, path, type, flags, NULL) < 0) { + rmdir(path); + cleanup_mount_fds_tempdir(dirname); + bail("failed to mount %s onto temp directory", type); + } + + fd = sys_open_tree(-1, path, OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE | AT_RECURSIVE); + umount2(path, MNT_FORCE | MNT_DETACH); + rmdir(path); + if (fd < 0) { + cleanup_mount_fds_tempdir(dirname); + bail("failed to open temporary mountpoint of %s", type); + } + + ret = dup3(fd, target_fd, O_CLOEXEC); + close(fd); + if (ret < 0) { + cleanup_mount_fds_tempdir(dirname); + bail("failed to duplicate the mount fd for %s", type); + } +} + +/* Prepare temporary mounts for move_mount() */ +static void prepare_mount_fds(struct namespace_info_t *ns_info, struct nlconfig_t *config) +{ + int i, fd, ret, flags; + char name[PATH_MAX]; + char dirname_buf[PATH_MAX]; + char *dirname; + + // Get handles of current namespaces except user namespace + for (i = 0; i < ns_info->namespace_cnt; i++) { + struct namespace_t *ns = ns_info->namespaces[i]; + + if (ns->target_ns_fd >= 0 && ns->flag != 0 && ns->flag != CLONE_NEWUSER) { + if (snprintf(name, PATH_MAX, "/proc/self/ns/%s", 
ns->type) < 0) + bail("failed to prepare file path for current %s namespace", ns->type); + fd = open(name, O_RDONLY | O_CLOEXEC); + if (fd == -1) + bail("failed to open current %s namespace", ns->type); + ns->original_ns_fd = fd; + } + } + + // Join target namespaces except user namespace + for (i = 0; i < ns_info->namespace_cnt; i++) { + struct namespace_t *ns = ns_info->namespaces[i]; + + if (ns->target_ns_fd >= 0 && ns->flag != 0 && ns->flag != CLONE_NEWUSER) { + if (setns(ns->target_ns_fd, ns->flag) < 0) + bail("failed to setns into %s namespace", ns->type); + } + } + + // Create a temp working directory, similar to prepareTmp() + strcpy(dirname_buf, "/tmp/runc-mountfds-XXXXXX"); + dirname = mkdtemp(dirname_buf); + if (dirname == NULL) + bail("failed to create temporary directory for mount fds"); + ret = mount(dirname, dirname, "bind", MS_BIND, NULL); + if (ret < 0) { + cleanup_mount_fds_tempdir(dirname); + bail("failed to bind mount temporary directory for mount fds"); + } + ret = mount(dirname, dirname, NULL, MS_PRIVATE, NULL); + if (ret < 0) { + cleanup_mount_fds_tempdir(dirname); + bail("failed to bind mount temporary directory as private for mount fds"); + } + + if (config->mount_fd_proc >= 0) { + flags = MS_NOSUID | MS_NOEXEC | MS_NODEV; + prepare_mount_fd(dirname, "proc", config->mount_fd_proc, flags); + } + if (config->mount_fd_sys >= 0) { + flags = MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_RDONLY; + prepare_mount_fd(dirname, "sysfs", config->mount_fd_sys, flags); + } + if (config->mount_fd_mqueue >= 0) { + flags = MS_NOSUID | MS_NOEXEC | MS_NODEV; + prepare_mount_fd(dirname, "mqueue", config->mount_fd_mqueue, flags); + } + + cleanup_mount_fds_tempdir(dirname); + + // Rejoin original namespaces except user namespace + for (i = 0; i < ns_info->namespace_cnt; i++) { + struct namespace_t *ns = ns_info->namespaces[i]; + + if (ns->original_ns_fd >= 0) { + if (setns(ns->original_ns_fd, ns->flag) < 0) + bail("failed to setns into %s namespace", ns->type); + 
close(ns->original_ns_fd); + ns->original_ns_fd = -1; + } + } +} + /* Defined in cloned_binary.c. */ extern int ensure_cloned_binary(void); @@ -1211,6 +1349,8 @@ void nsexec(void) * using cmsg(3) but that's just annoying. */ ns_info = parse_namespace_info(config.namespaces); + if (config.mount_fd_proc >= 0 || config.mount_fd_proc >= 0 || config.mount_fd_mqueue >= 0) + prepare_mount_fds(ns_info, &config); if (ns_info->namespace_cnt > 0) join_namespaces(ns_info); free_namespace_info(ns_info); From 6dfc4ffe1c009d90d9eb57d09aaca5b9320448c1 Mon Sep 17 00:00:00 2001 From: Tingting Yang Date: Tue, 27 Sep 2022 13:03:26 +0800 Subject: [PATCH 5/5] Enable cross user namespace mounting When a user namespace is enabled for a pod/container, it may fail to mount /proc, /sys and /dev/mqueue under certain conditions. This may be solved by enabling cross user namespace mounting. Signed-off-by: Jiang Liu Signed-off-by: shidao.ytt --- libcontainer/container_linux.go | 46 ++++++++++++++++++++- libcontainer/nsenter/nsexec.c | 71 +++++++++++++++++++++++++++------ 2 files changed, 102 insertions(+), 15 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 1530cd69a76..96091888b7b 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -520,7 +520,7 @@ func (c *Container) shouldSendMountSources() bool { // We need to send sources if there are bind-mounts. for _, m := range c.config.Mounts { - if m.IsBind() { + if m.IsBind() || m.IsMove() { return true } } @@ -536,15 +536,46 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l nsMaps[ns.Type] = ns.Path } } + + nsList := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + nsList[ns.Type] = ns.Path + } + _, sharePidns := nsMaps[configs.NEWPID] userNsMountFds := [3]int{-1, -1, -1} + // Enable open_tree()/move_mount() for special filesystems to support cross user namespace mounting. 
+ if _, ok := nsList[configs.NEWUSER]; ok { + for idx, m := range c.config.Mounts { + if m.Device == "proc" && m.Source == "proc" && m.Destination == "/proc" && + m.Flags == unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID { + // procfs depends on Pid namespace + if _, exist := nsList[configs.NEWPID]; exist { + } + } else if m.Device == "sysfs" && m.Source == "sysfs" && m.Destination == "/sys" && + m.Flags == unix.MS_NODEV|unix.MS_NOSUID|unix.MS_NOEXEC|unix.MS_RDONLY { + // sysfs depends on Net namespace + // so exclude the case where runc unshares a new netns + if path, exist := nsList[configs.NEWNET]; !exist || path != "" { + c.config.Mounts[idx].Flags |= unix.MS_MOVE + } + } else if m.Device == "mqueue" && m.Source == "mqueue" && m.Destination == "/dev/mqueue" && + m.Flags == unix.MS_NODEV|unix.MS_NOEXEC|unix.MS_NOSUID { + // mqueue depends on IPC namespace + if _, exist := nsList[configs.NEWIPC]; exist { + c.config.Mounts[idx].Flags |= unix.MS_MOVE + } + } + } + } + if c.shouldSendMountSources() { // Elements on this slice will be paired with mounts (see StartInitialization() and // prepareRootfs()). This slice MUST have the same size as c.config.Mounts. mountFds := make([]int, len(c.config.Mounts)) for i, m := range c.config.Mounts { - if !m.IsBind() { + if !m.IsBind() && !m.IsMove() { // Non bind-mounts do not use an fd. mountFds[i] = -1 continue @@ -556,6 +587,17 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l // lifecycle of that fd is already taken care of. 
cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child) mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 + + // MS_MOVE flag is set for cross user namespace mounting + if m.IsMove() { + if m.Device == "proc" { + userNsMountFds[0] = mountFds[i] + } else if m.Device == "sysfs" { + userNsMountFds[1] = mountFds[i] + } else if m.Device == "mqueue" { + userNsMountFds[2] = mountFds[i] + } + } } mountFdsJson, err := json.Marshal(mountFds) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index f6933396b95..5ce0a2f398c 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -27,12 +27,15 @@ #include #include #include +#include /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" extern char *escape_json_string(char *str); +#define AT_RECURSIVE 0x8000 + /* Synchronisation values. */ enum sync_t { SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ @@ -486,6 +489,12 @@ static uint8_t readint8(char *buf) return *(uint8_t *) buf; } +static void set_default_config(struct nlconfig_t *config) { + config->mount_fd_proc = -1; + config->mount_fd_sys = -1; + config->mount_fd_mqueue = -1; +} + static void nl_parse(int fd, struct nlconfig_t *config) { size_t len, size; @@ -590,6 +599,7 @@ struct namespace_t { int target_ns_fd; int original_ns_fd; int flag; + int done; char type[MAX_NAMSPACE_TYPE_LEN]; char path[PATH_MAX]; }; @@ -601,17 +611,13 @@ struct namespace_info_t { static struct namespace_info_t* parse_namespace_info(char *nslist) { - int i; struct namespace_info_t *ns_info; ns_info = malloc(sizeof(*ns_info)); if (ns_info == NULL) { bail("Can't allocate memory for namespace_info."); } - for (i = 0; i < MAX_NAMESPACES; i++) { - ns_info->namespaces[i]->target_ns_fd = -1; - ns_info->namespaces[i]->original_ns_fd = -1; - } + memset(ns_info, 0, sizeof(*ns_info)); if (nslist != NULL) { char *saveptr = NULL; @@ -634,6 +640,9 @@ static struct namespace_info_t* parse_namespace_info(char *nslist) ns = 
malloc(sizeof(struct namespace_t)); if (!ns) bail("failed to reallocate namespace array"); + memset(ns, 0, sizeof(struct namespace_t)); + ns->target_ns_fd = -1; + ns->original_ns_fd = -1; /* Split 'ns:path'. */ path = strstr(namespace, ":"); @@ -671,14 +680,35 @@ static struct namespace_info_t* parse_namespace_info(char *nslist) static void join_namespaces(struct namespace_info_t *ns_info) { int i; + int userns_idx = -1; for (i = 0; i < ns_info->namespace_cnt; i++) { struct namespace_t *ns = ns_info->namespaces[i]; + if (!strcmp(ns->type, "user")) { + userns_idx = i; + break; + } + } - if (ns->target_ns_fd >= 0) { + if (userns_idx >= 0) { + for (i = 0; i < ns_info->namespace_cnt; i++) { + struct namespace_t *ns = ns_info->namespaces[i]; + + if (i != userns_idx && ns->target_ns_fd >= 0) { + write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", ns->flag, ns->type, ns->path); + if (setns(ns->target_ns_fd, ns->flag) >= 0) + ns->done = 1; + } + } + } + + for (i = 0; i < ns_info->namespace_cnt; i++) { + struct namespace_t *ns = ns_info->namespaces[i]; + + if (ns->target_ns_fd >= 0 && !ns->done) { write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", ns->flag, ns->type, ns->path); if (setns(ns->target_ns_fd, ns->flag) < 0) - bail("failed to setns into %s namespace", ns->type); + bail("failed to setns into %s namespace, error %d", ns->type, errno); } } } @@ -735,11 +765,18 @@ static void prepare_mount_fd(char *dirname, char *type, int target_fd, int flags cleanup_mount_fds_tempdir(dirname); bail("failed to prepare temp directory path for %s", type); } + ret = mkdir(path, 0644); + if (ret < 0) { + rmdir(path); + cleanup_mount_fds_tempdir(dirname); + bail("failed to mount %s onto temp directory, ret %d", type, ret); + } - if (mount(type, path, type, flags, NULL) < 0) { + ret = mount(type, path, type, flags, NULL); + if ( ret < 0) { rmdir(path); cleanup_mount_fds_tempdir(dirname); - bail("failed to mount %s onto temp directory", type); + bail("failed to 
mount %s onto temp directory, ret %d", type, ret); } fd = sys_open_tree(-1, path, OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE | AT_RECURSIVE); @@ -783,7 +820,8 @@ static void prepare_mount_fds(struct namespace_info_t *ns_info, struct nlconfig_ // Join target namespaces except user namespace for (i = 0; i < ns_info->namespace_cnt; i++) { struct namespace_t *ns = ns_info->namespaces[i]; - + if (!strcmp(ns->type, "user")) + continue; if (ns->target_ns_fd >= 0 && ns->flag != 0 && ns->flag != CLONE_NEWUSER) { if (setns(ns->target_ns_fd, ns->flag) < 0) bail("failed to setns into %s namespace", ns->type); @@ -824,7 +862,8 @@ static void prepare_mount_fds(struct namespace_info_t *ns_info, struct nlconfig_ // Rejoin original namespaces except user namespace for (i = 0; i < ns_info->namespace_cnt; i++) { struct namespace_t *ns = ns_info->namespaces[i]; - + if (!strcmp(ns->type, "user")) + continue; if (ns->original_ns_fd >= 0) { if (setns(ns->original_ns_fd, ns->flag) < 0) bail("failed to setns into %s namespace", ns->type); @@ -940,7 +979,7 @@ void send_fd(int sockfd, int fd) bail("failed to send fd %d via unix socket %d", fd, sockfd); } -void receive_mountsources(int sockfd) +void receive_mountsources(int sockfd, struct nlconfig_t *config) { char *mount_fds, *endp; long new_fd; @@ -970,6 +1009,11 @@ void receive_mountsources(int sockfd) bail("malformed _LIBCONTAINER_MOUNT_FDS env var: fds out of range"); } + // Skip file descriptors for cross user namespace mounting + if (new_fd == config->mount_fd_proc || new_fd == config->mount_fd_sys || new_fd == config->mount_fd_mqueue) { + continue; + } + receive_fd(sockfd, new_fd); } } @@ -1074,6 +1118,7 @@ void nsexec(void) write_log(DEBUG, "=> nsexec container setup"); /* Parse all of the netlink configuration. */ + set_default_config(&config); nl_parse(pipenum, &config); /* Set oom_score_adj. This has to be done before !dumpable because @@ -1443,7 +1488,7 @@ void nsexec(void) } /* Receive and install all mount sources fds. 
*/ - receive_mountsources(syncfd); + receive_mountsources(syncfd, &config); /* Parent finished to send the mount sources fds. */ if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {