From ba0b5e26989f39d0bdadeeff38182902df781df6 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 1 Aug 2023 20:07:49 +1000
Subject: [PATCH 1/8] libcontainer: remove all mount logic from nsexec

With open_tree(OPEN_TREE_CLONE), it is possible to implement both the
id-mapped mounts and bind-mount source file descriptor logic entirely in
Go without requiring any complicated handling from nsexec.

However, implementing it the naive way (do the OPEN_TREE_CLONE in the
host namespace before the rootfs is set up -- which is what the existing
implementation did) exposes issues in how mount ordering (in particular
when handling mount sources from inside the container rootfs, but also
in relation to mount propagation) was handled for idmapped mounts and
bind-mount sources. In order to solve this problem completely, it is
necessary to spawn a thread which joins the container mount namespace
and provides mountfds when requested by the rootfs setup code (ensuring
that the mount order and mount propagation of the source of the
bind-mount are handled correctly). While the need to join the mount
namespace leads to other complicated (such as with the usage of
/proc/self -- fixed in a later patch) the resulting code is still
reasonable and is the only real way to solve the issue.

This allows us to reduce the amount of C code we have in nsexec, as well
as simplifying a whole host of places that were made more complicated
with the addition of id-mapped mounts and the bind sourcefd logic.
Because we join the container namespace, we can continue to use regular
O_PATH file descriptors for non-id-mapped bind-mount sources (which
means we don't have to raise the kernel requirement for that case).

In addition, we can easily add support for id-mappings that don't match
the container's user namespace. The approach taken here is to use Go's
officially supported mechanism for spawning a process in a user
namespace, but (ab)use PTRACE_TRACEME to avoid actually having to exec a
different process. The most efficient way to implement this would be to
do clone() in cgo directly to run a function that just does
kill(getpid(), SIGSTOP) -- we can always switch to that if it turns out
this approach is too slow. It should be noted that the included
micro-benchmark seems to indicate this is Fast Enough(TM):

  goos: linux
  goarch: amd64
  pkg: github.com/opencontainers/runc/libcontainer/userns
  cpu: Intel(R) Core(TM) i5-10210U CPU @ 1.60GHz
  BenchmarkSpawnProc
  BenchmarkSpawnProc-8        1670            770065 ns/op

Fixes: fda12ab1014e ("Support idmap mounts on volumes")
Fixes: 9c444070ec7b ("Open bind mount sources from the host userns")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/container_linux.go            | 161 +-----------
 libcontainer/criu_linux.go                 |   4 +-
 libcontainer/factory_linux.go              |  15 --
 libcontainer/init_linux.go                 |  34 +--
 libcontainer/message_linux.go              |   4 +-
 libcontainer/mount_linux.go                | 198 ++++++++++++---
 libcontainer/nsenter/idmap.h               |  76 ------
 libcontainer/nsenter/nsexec.c              | 277 +--------------------
 libcontainer/process_linux.go              | 162 +++++++++++-
 libcontainer/rootfs_linux.go               | 148 ++++++-----
 libcontainer/standard_init_linux.go        |  13 +-
 libcontainer/sync.go                       |   7 +
 libcontainer/userns/usernsfd_linux.go      | 153 ++++++++++++
 libcontainer/userns/usernsfd_linux_test.go |  52 ++++
 14 files changed, 617 insertions(+), 687 deletions(-)
 delete mode 100644 libcontainer/nsenter/idmap.h
 create mode 100644 libcontainer/userns/usernsfd_linux.go
 create mode 100644 libcontainer/userns/usernsfd_linux_test.go

diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index f36abaf1032..c9aacf57f97 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -2,7 +2,6 @@ package libcontainer
 
 import (
 	"bytes"
-	"encoding/json"
 	"errors"
 	"fmt"
 	"io"
@@ -629,112 +628,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	return c.newSetnsProcess(p, cmd, comm)
 }
 
-// shouldSendMountSources says whether the child process must setup bind mounts with
-// the source pre-opened (O_PATH) in the host user namespace.
-// See https://github.com/opencontainers/runc/issues/2484
-func (c *Container) shouldSendMountSources() bool {
-	// Passing the mount sources via SCM_RIGHTS is only necessary when
-	// both userns and mntns are active.
-	if !c.config.Namespaces.Contains(configs.NEWUSER) ||
-		!c.config.Namespaces.Contains(configs.NEWNS) {
-		return false
-	}
-
-	// nsexec.c send_mountsources() requires setns(mntns) capabilities
-	// CAP_SYS_CHROOT and CAP_SYS_ADMIN.
-	if c.config.RootlessEUID {
-		return false
-	}
-
-	// We need to send sources if there are non-idmap bind-mounts.
-	for _, m := range c.config.Mounts {
-		if m.IsBind() && !m.IsIDMapped() {
-			return true
-		}
-	}
-
-	return false
-}
-
-// shouldSendIdmapSources says whether the child process must setup idmap mounts with
-// the mount_setattr already done in the host user namespace.
-func (c *Container) shouldSendIdmapSources() bool {
-	// nsexec.c mount_setattr() requires CAP_SYS_ADMIN in:
-	// * the user namespace the filesystem was mounted in;
-	// * the user namespace we're trying to idmap the mount to;
-	// * the owning user namespace of the mount namespace you're currently located in.
-	//
-	// See the comment from Christian Brauner:
-	//	https://github.com/opencontainers/runc/pull/3717#discussion_r1103607972
-	//
-	// Let's just rule out rootless, we don't have those permission in the
-	// rootless case.
-	if c.config.RootlessEUID {
-		return false
-	}
-
-	// For the time being we require userns to be in use.
-	if !c.config.Namespaces.Contains(configs.NEWUSER) {
-		return false
-	}
-
-	// We need to send sources if there are idmap bind-mounts.
-	for _, m := range c.config.Mounts {
-		if m.IsBind() && m.IsIDMapped() {
-			return true
-		}
-	}
-
-	return false
-}
-
-func (c *Container) sendMountSources(cmd *exec.Cmd, comm *processComm) error {
-	if !c.shouldSendMountSources() {
-		return nil
-	}
-
-	return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_MOUNT_FDS", func(m *configs.Mount) bool {
-		return m.IsBind() && !m.IsIDMapped()
-	})
-}
-
-func (c *Container) sendIdmapSources(cmd *exec.Cmd, comm *processComm) error {
-	if !c.shouldSendIdmapSources() {
-		return nil
-	}
-
-	return c.sendFdsSources(cmd, comm, "_LIBCONTAINER_IDMAP_FDS", func(m *configs.Mount) bool {
-		return m.IsBind() && m.IsIDMapped()
-	})
-}
-
-func (c *Container) sendFdsSources(cmd *exec.Cmd, comm *processComm, envVar string, condition func(*configs.Mount) bool) error {
-	// Elements on these slices will be paired with mounts (see StartInitialization() and
-	// prepareRootfs()). These slices MUST have the same size as c.config.Mounts.
-	fds := make([]int, len(c.config.Mounts))
-	for i, m := range c.config.Mounts {
-		if !condition(m) {
-			// The -1 fd is ignored later.
-			fds[i] = -1
-			continue
-		}
-
-		// The fd passed here will not be used: nsexec.c will overwrite it with
-		// dup3(). We just need to allocate a fd so that we know the number to
-		// pass in the environment variable. The fd must not be closed before
-		// cmd.Start(), so we reuse initSockChild because the lifecycle of that
-		// fd is already taken care of.
-		cmd.ExtraFiles = append(cmd.ExtraFiles, comm.initSockChild)
-		fds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1
-	}
-	fdsJSON, err := json.Marshal(fds)
-	if err != nil {
-		return fmt.Errorf("Error creating %v: %w", envVar, err)
-	}
-	cmd.Env = append(cmd.Env, envVar+"="+string(fdsJSON))
-	return nil
-}
-
 func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm) (*initProcess, error) {
 	cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard))
 	nsMaps := make(map[configs.NamespaceType]string)
@@ -743,16 +636,10 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, comm *processComm)
 			nsMaps[ns.Type] = ns.Path
 		}
 	}
-	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard)
+	data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps)
 	if err != nil {
 		return nil, err
 	}
-	if err := c.sendMountSources(cmd, comm); err != nil {
-		return nil, err
-	}
-	if err := c.sendIdmapSources(cmd, comm); err != nil {
-		return nil, err
-	}
 
 	init := &initProcess{
 		cmd:             cmd,
@@ -776,7 +663,7 @@ func (c *Container) newSetnsProcess(p *Process, cmd *exec.Cmd, comm *processComm
 	}
 	// for setns process, we don't have to set cloneflags as the process namespaces
 	// will only be set via setns syscall
-	data, err := c.bootstrapData(0, state.NamespacePaths, initSetns)
+	data, err := c.bootstrapData(0, state.NamespacePaths)
 	if err != nil {
 		return nil, err
 	}
@@ -1165,7 +1052,7 @@ type netlinkError struct{ error }
 // such as one that uses nsenter package to bootstrap the container's
 // init process correctly, i.e. with correct namespaces, uid/gid
 // mapping etc.
-func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string, it initType) (_ io.Reader, Err error) {
+func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.NamespaceType]string) (_ io.Reader, Err error) {
 	// create the netlink message
 	r := nl.NewNetlinkRequest(int(InitMsg), 0)
 
@@ -1267,48 +1154,6 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa
 		Value: c.config.RootlessEUID,
 	})
 
-	// Bind mount source to open.
-	if it == initStandard && c.shouldSendMountSources() {
-		var mounts []byte
-		for _, m := range c.config.Mounts {
-			if m.IsBind() && !m.IsIDMapped() {
-				if strings.IndexByte(m.Source, 0) >= 0 {
-					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
-				}
-				mounts = append(mounts, []byte(m.Source)...)
-			}
-			mounts = append(mounts, byte(0))
-		}
-
-		r.AddData(&Bytemsg{
-			Type:  MountSourcesAttr,
-			Value: mounts,
-		})
-	}
-
-	// Idmap mount sources to open.
-	if it == initStandard && c.shouldSendIdmapSources() {
-		var mounts []byte
-		for _, m := range c.config.Mounts {
-			if m.IsBind() && m.IsIDMapped() {
-				// While other parts of the code check this too (like
-				// libcontainer/specconv/spec_linux.go) we do it here also because some libcontainer
-				// users don't use those functions.
-				if strings.IndexByte(m.Source, 0) >= 0 {
-					return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source)
-				}
-
-				mounts = append(mounts, []byte(m.Source)...)
-			}
-			mounts = append(mounts, byte(0))
-		}
-
-		r.AddData(&Bytemsg{
-			Type:  IdmapSourcesAttr,
-			Value: mounts,
-		})
-	}
-
 	// write boottime and monotonic time ns offsets.
 	if c.config.TimeOffsets != nil {
 		var offsetSpec bytes.Buffer
diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go
index 360639e7fcd..5cb8c674f41 100644
--- a/libcontainer/criu_linux.go
+++ b/libcontainer/criu_linux.go
@@ -618,8 +618,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error {
 			// because during initial container creation mounts are
 			// set up in the order they are configured.
 			if m.Device == "bind" {
-				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error {
-					return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "")
+				if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error {
+					return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "")
 				}); err != nil {
 					return err
 				}
diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go
index b0bf6142109..bc682f23ed8 100644
--- a/libcontainer/factory_linux.go
+++ b/libcontainer/factory_linux.go
@@ -214,18 +214,3 @@ func validateID(id string) error {
 
 	return nil
 }
-
-func parseFdsFromEnv(envVar string) ([]int, error) {
-	fdsJSON := os.Getenv(envVar)
-	if fdsJSON == "" {
-		// Always return the nil slice if no fd is present.
-		return nil, nil
-	}
-
-	var fds []int
-	if err := json.Unmarshal([]byte(fdsJSON), &fds); err != nil {
-		return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err)
-	}
-
-	return fds, nil
-}
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index c02538d6847..019868c1403 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -47,18 +47,6 @@ type network struct {
 	TempVethPeerName string `json:"temp_veth_peer_name"`
 }
 
-type mountFds struct {
-	// sourceFds are the fds to use as source when mounting.
-	// The slice size should be the same as container mounts, as it will be
-	// paired with them.
-	// The value -1 is used when no fd is needed for the mount.
-	// Can't have a valid fd in the same position that other slices in this struct.
-	// We need to use only one of these fds on any single mount.
-	sourceFds []int
-	// Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount().
-	idmapFds []int
-}
-
 // initConfig is used for transferring parameters from Exec() to Init()
 type initConfig struct {
 	Args             []string              `json:"args"`
@@ -189,18 +177,6 @@ func startInitialization() (retErr error) {
 		defer pidfdSocket.Close()
 	}
 
-	// Get mount files (O_PATH).
-	mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS")
-	if err != nil {
-		return err
-	}
-
-	// Get idmap fds.
-	idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS")
-	if err != nil {
-		return err
-	}
-
 	// Get runc-dmz fds.
 	var dmzExe *os.File
 	if dmzFdStr := os.Getenv("_LIBCONTAINER_DMZEXEFD"); dmzFdStr != "" {
@@ -232,21 +208,16 @@ func startInitialization() (retErr error) {
 	}
 
 	// If init succeeds, it will not return, hence none of the defers will be called.
-	return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds})
+	return containerInit(it, &config, syncPipe, consoleSocket, pidfdSocket, fifofd, logFD, dmzExe)
 }
 
-func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File, mountFds mountFds) error {
+func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSocket, pidfdSocket *os.File, fifoFd, logFd int, dmzExe *os.File) error {
 	if err := populateProcessEnvironment(config.Env); err != nil {
 		return err
 	}
 
 	switch t {
 	case initSetns:
-		// mount and idmap fds must be nil in this case. We don't mount while doing runc exec.
-		if mountFds.sourceFds != nil || mountFds.idmapFds != nil {
-			return errors.New("mount and idmap fds must be nil; can't mount from exec")
-		}
-
 		i := &linuxSetnsInit{
 			pipe:          pipe,
 			consoleSocket: consoleSocket,
@@ -266,7 +237,6 @@ func containerInit(t initType, config *initConfig, pipe *syncSocket, consoleSock
 			fifoFd:        fifoFd,
 			logFd:         logFd,
 			dmzExe:        dmzExe,
-			mountFds:      mountFds,
 		}
 		return i.Init()
 	}
diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go
index 4e48826cdb5..2790f018d06 100644
--- a/libcontainer/message_linux.go
+++ b/libcontainer/message_linux.go
@@ -21,9 +21,7 @@ const (
 	RootlessEUIDAttr uint16 = 27287
 	UidmapPathAttr   uint16 = 27288
 	GidmapPathAttr   uint16 = 27289
-	MountSourcesAttr uint16 = 27290
-	IdmapSourcesAttr uint16 = 27291
-	TimeOffsetsAttr  uint16 = 27292
+	TimeOffsetsAttr  uint16 = 27290
 )
 
 type Int32msg struct {
diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go
index da11da68ea0..59861a4f435 100644
--- a/libcontainer/mount_linux.go
+++ b/libcontainer/mount_linux.go
@@ -1,22 +1,47 @@
 package libcontainer
 
 import (
+	"errors"
+	"fmt"
 	"io/fs"
+	"os"
 	"strconv"
 
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runc/libcontainer/userns"
 )
 
+// mountSourceType indicates what type of file descriptor is being returned. It
+// is used to tell rootfs_linux.go whether or not to use move_mount(2) to
+// install the mount.
+type mountSourceType string
+
+const (
+	// An open_tree(2)-style file descriptor that needs to be installed using
+	// move_mount(2) to install.
+	mountSourceOpenTree mountSourceType = "open_tree"
+	// A plain file descriptor that can be mounted through /proc/self/fd.
+	mountSourcePlain mountSourceType = "plain-open"
+)
+
+type mountSource struct {
+	Type mountSourceType `json:"type"`
+	file *os.File        `json:"-"`
+}
+
 // mountError holds an error from a failed mount or unmount operation.
 type mountError struct {
-	op     string
-	source string
-	srcFD  *int
-	target string
-	dstFD  string
-	flags  uintptr
-	data   string
-	err    error
+	op      string
+	source  string
+	srcFile *mountSource
+	target  string
+	dstFd   string
+	flags   uintptr
+	data    string
+	err     error
 }
 
 // Error provides a string error representation.
@@ -25,13 +50,14 @@ func (e *mountError) Error() string {
 
 	if e.source != "" {
 		out += "src=" + e.source + ", "
-		if e.srcFD != nil {
-			out += "srcFD=" + strconv.Itoa(*e.srcFD) + ", "
+		if e.srcFile != nil {
+			out += "srcType=" + string(e.srcFile.Type) + ", "
+			out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", "
 		}
 	}
 	out += "dst=" + e.target
-	if e.dstFD != "" {
-		out += ", dstFD=" + e.dstFD
+	if e.dstFd != "" {
+		out += ", dstFd=" + e.dstFd
 	}
 
 	if e.flags != uintptr(0) {
@@ -54,38 +80,58 @@ func (e *mountError) Unwrap() error {
 // mount is a simple unix.Mount wrapper, returning an error with more context
 // in case it failed.
 func mount(source, target, fstype string, flags uintptr, data string) error {
-	return mountViaFDs(source, nil, target, "", fstype, flags, data)
+	return mountViaFds(source, nil, target, "", fstype, flags, data)
 }
 
-// mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source,
-// and dstFD instead of target, unless those are empty.
-// If srcFD is different than nil, its path (i.e. "/proc/self/fd/NN") will be
-// constructed by this function.
-// dstFD argument, if non-empty, is expected to be in the form of a path to an
-// opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
+// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source,
+// and dstFd instead of target, unless those are empty.
 //
-// If case an FD is used instead of a source or a target path, the
-// corresponding path is only used to add context to an error in case
-// the mount operation has failed.
-func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error {
-	src := source
-	if srcFD != nil {
-		src = "/proc/self/fd/" + strconv.Itoa(*srcFD)
+// If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds
+// will mount it according to the mountSourceType of the file descriptor.
+//
+// The dstFd argument, if non-empty, is expected to be in the form of a path to
+// an opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
+//
+// If a file descriptor is used instead of a source or a target path, the
+// corresponding path is only used to add context to an error in case the mount
+// operation has failed.
+func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error {
+	// MS_REMOUNT and srcFile don't make sense together.
+	if srcFile != nil && flags&unix.MS_REMOUNT != 0 {
+		logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile")
+		srcFile = nil
 	}
+
 	dst := target
-	if dstFD != "" {
-		dst = dstFD
+	if dstFd != "" {
+		dst = dstFd
+	}
+	src := source
+	if srcFile != nil {
+		src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd()))
+	}
+
+	var op string
+	var err error
+	if srcFile != nil && srcFile.Type == mountSourceOpenTree {
+		op = "move_mount"
+		err = unix.MoveMount(int(srcFile.file.Fd()), "",
+			unix.AT_FDCWD, dstFd,
+			unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS)
+	} else {
+		op = "mount"
+		err = unix.Mount(src, dst, fstype, flags, data)
 	}
-	if err := unix.Mount(src, dst, fstype, flags, data); err != nil {
+	if err != nil {
 		return &mountError{
-			op:     "mount",
-			source: source,
-			srcFD:  srcFD,
-			target: target,
-			dstFD:  dstFD,
-			flags:  flags,
-			data:   data,
-			err:    err,
+			op:      op,
+			source:  source,
+			srcFile: srcFile,
+			target:  target,
+			dstFd:   dstFd,
+			flags:   flags,
+			data:    data,
+			err:     err,
 		}
 	}
 	return nil
@@ -121,3 +167,81 @@ func syscallMode(i fs.FileMode) (o uint32) {
 	// No mapping for Go's ModeTemporary (plan9 only).
 	return
 }
+
+// mountFd creates a "mount source fd" (either through open_tree(2) or just
+// open(O_PATH)) based on the provided configuration. This function must be
+// called from within the container's mount namespace.
+//
+// In the case of idmapped mount configurations, the returned mount source will
+// be an open_tree(2) file with MOUNT_ATTR_IDMAP applied. For other
+// bind-mounts, it will be an O_PATH. If the type of mount cannot be handled,
+// the returned mountSource will be nil, indicating that the container init
+// process will need to do an old-fashioned mount(2) themselves.
+//
+// This helper is only intended to be used by goCreateMountSources.
+func mountFd(nsHandles *userns.Handles, m *configs.Mount) (*mountSource, error) {
+	if !m.IsBind() {
+		return nil, errors.New("new mount api: only bind-mounts are supported")
+	}
+	if nsHandles == nil {
+		nsHandles = new(userns.Handles)
+		defer nsHandles.Release()
+	}
+
+	var mountFile *os.File
+	var sourceType mountSourceType
+
+	// Ideally, we would use OPEN_TREE_CLONE for everything, because we can
+	// be sure that the file descriptor cannot be used to escape outside of
+	// the mount root. Unfortunately, OPEN_TREE_CLONE is far more expensive
+	// than open(2) because it requires doing mounts inside a new anonymous
+	// mount namespace. So we use open(2) for standard bind-mounts, and
+	// OPEN_TREE_CLONE when we need to set mount attributes here.
+	//
+	// While passing open(2)'d paths from the host rootfs isn't exactly the
+	// safest thing in the world, the files will not survive across
+	// execve(2) and "runc init" is non-dumpable so it should not be
+	// possible for a malicious container process to gain access to the
+	// file descriptors. We also don't do any of this for "runc exec",
+	// lessening the risk even further.
+	if m.IsIDMapped() {
+		flags := uint(unix.OPEN_TREE_CLONE | unix.OPEN_TREE_CLOEXEC)
+		if m.Flags&unix.MS_REC == unix.MS_REC {
+			flags |= unix.AT_RECURSIVE
+		}
+		fd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags)
+		if err != nil {
+			return nil, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err}
+		}
+		mountFile = os.NewFile(uintptr(fd), m.Source)
+		sourceType = mountSourceOpenTree
+
+		// Configure the id mapping.
+		usernsFile, err := nsHandles.Get(userns.Mapping{
+			UIDMappings: m.UIDMappings,
+			GIDMappings: m.GIDMappings,
+		})
+		if err != nil {
+			return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
+		}
+		defer usernsFile.Close()
+
+		if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{
+			Attr_set:  unix.MOUNT_ATTR_IDMAP,
+			Userns_fd: uint64(usernsFile.Fd()),
+		}); err != nil {
+			return nil, fmt.Errorf("failed to set MOUNT_ATTR_IDMAP on %s: %w", m.Source, err)
+		}
+	} else {
+		var err error
+		mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0)
+		if err != nil {
+			return nil, err
+		}
+		sourceType = mountSourcePlain
+	}
+	return &mountSource{
+		Type: sourceType,
+		file: mountFile,
+	}, nil
+}
diff --git a/libcontainer/nsenter/idmap.h b/libcontainer/nsenter/idmap.h
deleted file mode 100644
index fa25a3bfb2f..00000000000
--- a/libcontainer/nsenter/idmap.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef IDMAP_H
-#define IDMAP_H
-
-#include <sys/mount.h>
-// Centos-7 doesn't have this file nor the __has_include() directive, so let's
-// just leave this commented out and we can uncomment when it hits EOL (2024-06-30).
-//#include <linux/mount.h>
-#include <sys/syscall.h>
-#include <unistd.h>
-
-/* mount_setattr() */
-#ifndef MOUNT_ATTR_IDMAP
-#define MOUNT_ATTR_IDMAP 0x00100000
-#endif
-
-#ifndef __NR_mount_setattr
-	#if defined _MIPS_SIM
-		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
-			#define __NR_mount_setattr (442 + 4000)
-		#endif
-		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
-			#define __NR_mount_setattr (442 + 6000)
-		#endif
-		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
-			#define __NR_mount_setattr (442 + 5000)
-		#endif
-	#else
-		#define __NR_mount_setattr 442
-	#endif
-#endif
-#ifndef MOUNT_ATTR_SIZE_VER0
-struct mount_attr {
-	__u64 attr_set;
-	__u64 attr_clr;
-	__u64 propagation;
-	__u64 userns_fd;
-};
-#endif
-
-/* open_tree() */
-#ifndef OPEN_TREE_CLONE
-#define OPEN_TREE_CLONE 1
-#endif
-
-#ifndef OPEN_TREE_CLOEXEC
-#define OPEN_TREE_CLOEXEC O_CLOEXEC
-#endif
-
-#ifndef __NR_open_tree
-	#if defined _MIPS_SIM
-		#if _MIPS_SIM == _MIPS_SIM_ABI32	/* o32 */
-			#define __NR_open_tree 4428
-		#endif
-		#if _MIPS_SIM == _MIPS_SIM_NABI32	/* n32 */
-			#define __NR_open_tree 6428
-		#endif
-		#if _MIPS_SIM == _MIPS_SIM_ABI64	/* n64 */
-			#define __NR_open_tree 5428
-		#endif
-	#else
-		#define __NR_open_tree 428
-	#endif
-#endif
-
-static inline int sys_mount_setattr(int dfd, const char *path, unsigned int flags, struct mount_attr *attr, size_t size)
-{
-	return syscall(__NR_mount_setattr, dfd, path, flags, attr, size);
-}
-
-static inline int sys_open_tree(int dfd, const char *filename, unsigned int flags)
-{
-	return syscall(__NR_open_tree, dfd, filename, flags);
-}
-
-
-#endif /* IDMAP_H */
diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index f64cc41eb4a..bfc6a79a8e1 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -33,9 +33,6 @@
 /* Get all of the CLONE_NEW* flags. */
 #include "namespace.h"
 
-/* Get definitions for idmap sources */
-#include "idmap.h"
-
 /* Synchronisation values. */
 enum sync_t {
 	SYNC_USERMAP_PLS = 0x40,	/* Request parent to map our users. */
@@ -44,12 +41,8 @@ enum sync_t {
 	SYNC_RECVPID_ACK = 0x43,	/* PID was correctly received by parent. */
 	SYNC_GRANDCHILD = 0x44,	/* The grandchild is ready to run. */
 	SYNC_CHILD_FINISH = 0x45,	/* The child or grandchild has finished. */
-	SYNC_MOUNTSOURCES_PLS = 0x46,	/* Tell parent to send mount sources by SCM_RIGHTS. */
-	SYNC_MOUNTSOURCES_ACK = 0x47,	/* All mount sources have been sent. */
-	SYNC_MOUNT_IDMAP_PLS = 0x48,	/* Tell parent to mount idmap sources. */
-	SYNC_MOUNT_IDMAP_ACK = 0x49,	/* All idmap mounts have been done. */
-	SYNC_TIMEOFFSETS_PLS = 0x50,	/* Request parent to write timens offsets. */
-	SYNC_TIMEOFFSETS_ACK = 0x51,	/* Timens offsets were written. */
+	SYNC_TIMEOFFSETS_PLS = 0x46,	/* Request parent to write timens offsets. */
+	SYNC_TIMEOFFSETS_ACK = 0x47,	/* Timens offsets were written. */
 };
 
 #define STAGE_SETUP  -1
@@ -99,14 +92,6 @@ struct nlconfig_t {
 	char *gidmappath;
 	size_t gidmappath_len;
 
-	/* Mount sources opened outside the container userns. */
-	char *mountsources;
-	size_t mountsources_len;
-
-	/* Idmap sources opened outside the container userns which will be id mapped. */
-	char *idmapsources;
-	size_t idmapsources_len;
-
 	/* Time NS offsets. */
 	char *timensoffset;
 	size_t timensoffset_len;
@@ -126,9 +111,7 @@ struct nlconfig_t {
 #define ROOTLESS_EUID_ATTR	27287
 #define UIDMAPPATH_ATTR		27288
 #define GIDMAPPATH_ATTR		27289
-#define MOUNT_SOURCES_ATTR	27290
-#define IDMAP_SOURCES_ATTR	27291
-#define TIMENSOFFSET_ATTR	27292
+#define TIMENSOFFSET_ATTR	27290
 
 /*
  * Use the raw syscall for versions of glibc which don't include a function for
@@ -446,14 +429,6 @@ static void nl_parse(int fd, struct nlconfig_t *config)
 		case SETGROUP_ATTR:
 			config->is_setgroup = readint8(current);
 			break;
-		case MOUNT_SOURCES_ATTR:
-			config->mountsources = current;
-			config->mountsources_len = payload_len;
-			break;
-		case IDMAP_SOURCES_ATTR:
-			config->idmapsources = current;
-			config->idmapsources_len = payload_len;
-			break;
 		case TIMENSOFFSET_ATTR:
 			config->timensoffset = current;
 			config->timensoffset_len = payload_len;
@@ -546,115 +521,6 @@ static inline int sane_kill(pid_t pid, int signum)
 		return 0;
 }
 
-/* receive_fd_sources parses env_var as an array of fd numbers and, for each element that is
- * not -1, it receives an fd via SCM_RIGHTS and dup3 it to the fd requested in
- * the element of the env var.
- */
-void receive_fd_sources(int sockfd, const char *env_var)
-{
-	char *fds, *endp;
-	long new_fd;
-
-	// This env var must be a json array of ints.
-	fds = getenv(env_var);
-
-	if (fds[0] != '[') {
-		bail("malformed %s env var: missing '['", env_var);
-	}
-	fds++;
-
-	for (endp = fds; *endp != ']'; fds = endp + 1) {
-		new_fd = strtol(fds, &endp, 10);
-		if (endp == fds) {
-			bail("malformed %s env var: not a number", env_var);
-		}
-		if (*endp == '\0') {
-			bail("malformed %s env var: missing ]", env_var);
-		}
-		// The list contains -1 when no fd is needed. Ignore them.
-		if (new_fd == -1) {
-			continue;
-		}
-
-		if (new_fd == LONG_MAX || new_fd < 0 || new_fd > INT_MAX) {
-			bail("malformed %s env var: fds out of range", env_var);
-		}
-
-		int recv_fd = receive_fd(sockfd);
-		if (dup3(recv_fd, new_fd, O_CLOEXEC) < 0) {
-			bail("cannot dup3 fd %d to %ld", recv_fd, new_fd);
-		}
-		if (close(recv_fd) < 0) {
-			bail("cannot close fd %d", recv_fd);
-		}
-	}
-}
-
-void receive_mountsources(int sockfd)
-{
-	receive_fd_sources(sockfd, "_LIBCONTAINER_MOUNT_FDS");
-}
-
-void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mountsources_len)
-{
-	char proc_path[PATH_MAX];
-	int host_mntns_fd;
-	int container_mntns_fd;
-	int fd;
-	int ret;
-
-	// container_linux.go shouldSendMountSources() decides if mount sources
-	// should be pre-opened (O_PATH) and passed via SCM_RIGHTS
-	if (mountsources == NULL)
-		return;
-
-	host_mntns_fd = open("/proc/self/ns/mnt", O_RDONLY | O_CLOEXEC);
-	if (host_mntns_fd == -1)
-		bail("failed to get current mount namespace");
-
-	if (snprintf(proc_path, PATH_MAX, "/proc/%d/ns/mnt", child) < 0)
-		bail("failed to get mount namespace path");
-
-	container_mntns_fd = open(proc_path, O_RDONLY | O_CLOEXEC);
-	if (container_mntns_fd == -1)
-		bail("failed to get container mount namespace");
-
-	if (setns(container_mntns_fd, CLONE_NEWNS) < 0)
-		bail("failed to setns to container mntns");
-
-	char *mountsources_end = mountsources + mountsources_len;
-	while (mountsources < mountsources_end) {
-		if (mountsources[0] == '\0') {
-			mountsources++;
-			continue;
-		}
-
-		fd = open(mountsources, O_PATH | O_CLOEXEC);
-		if (fd < 0)
-			bail("failed to open mount source %s", mountsources);
-
-		write_log(DEBUG, "~> sending fd for: %s", mountsources);
-		if (send_fd(sockfd, fd) < 0)
-			bail("failed to send fd %d via unix socket %d", fd, sockfd);
-
-		ret = close(fd);
-		if (ret != 0)
-			bail("failed to close mount source fd %d", fd);
-
-		mountsources += strlen(mountsources) + 1;
-	}
-
-	if (setns(host_mntns_fd, CLONE_NEWNS) < 0)
-		bail("failed to setns to host mntns");
-
-	ret = close(host_mntns_fd);
-	if (ret != 0)
-		bail("failed to close host mount namespace fd %d", host_mntns_fd);
-	ret = close(container_mntns_fd);
-	if (ret != 0)
-		bail("failed to close container mount namespace fd %d", container_mntns_fd);
-}
-
 void try_unshare(int flags, const char *msg)
 {
 	write_log(DEBUG, "unshare %s", msg);
@@ -674,89 +540,6 @@ void try_unshare(int flags, const char *msg)
 	bail("failed to unshare %s", msg);
 }
 
-void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len)
-{
-	char proc_user_path[PATH_MAX];
-
-	/* Open the userns fd only once.
-	 * Currently we only support idmap mounts that use the same mapping than
-	 * the userns. This is validated in libcontainer/configs/validate/validator.go,
-	 * so if we reached here, we know the mapping for the idmap is the same
-	 * as the userns. This is why we just open the userns_fd once from the
-	 * PID of the child process that has the userns already applied.
-	 */
-	int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid);
-	if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) {
-		sane_kill(pid, SIGKILL);
-		bail("failed to create userns path string");
-	}
-
-	int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY);
-	if (userns_fd < 0) {
-		sane_kill(pid, SIGKILL);
-		bail("failed to get user namespace fd");
-	}
-
-	char *idmap_end = idmap_src + idmap_src_len;
-	while (idmap_src < idmap_end) {
-		if (idmap_src[0] == '\0') {
-			idmap_src++;
-			continue;
-		}
-
-		int fd_tree = sys_open_tree(-EBADF, idmap_src,
-					    OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
-					    AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT);
-		if (fd_tree < 0) {
-			sane_kill(pid, SIGKILL);
-			if (errno == ENOSYS) {
-				bail("open_tree(2) failed, the kernel doesn't support ID-mapped mounts");
-			} else if (errno == EINVAL) {
-				bail("open_tree(2) failed with path: %s, the kernel doesn't support ID-mapped mounts",
-				     idmap_src);
-			} else {
-				bail("open_tree(2) failed with path: %s", idmap_src);
-			}
-		}
-
-		struct mount_attr attr = {
-			.attr_set = MOUNT_ATTR_IDMAP,
-			.userns_fd = userns_fd,
-		};
-
-		ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr));
-		if (ret < 0) {
-			sane_kill(pid, SIGKILL);
-			if (errno == ENOSYS)
-				bail("mount_setattr(2) failed, the kernel doesn't support ID-mapped mounts");
-			else if (errno == EINVAL)
-				bail("mount_setattr(2) failed with path: %s, maybe the filesystem doesn't support ID-mapped mounts", idmap_src);
-			else
-				bail("mount_setattr(2) failed with path: %s", idmap_src);
-		}
-
-		write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path);
-		send_fd(sockfd, fd_tree);
-
-		if (close(fd_tree) < 0) {
-			sane_kill(pid, SIGKILL);
-			bail("error closing fd_tree");
-		}
-
-		idmap_src += strlen(idmap_src) + 1;
-	}
-
-	if (close(userns_fd) < 0) {
-		sane_kill(pid, SIGKILL);
-		bail("error closing userns fd");
-	}
-}
-
-void receive_idmapsources(int sockfd)
-{
-	receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS");
-}
-
 static void update_timens_offsets(pid_t pid, char *map, size_t map_len)
 {
 	if (map == NULL || map_len == 0)
@@ -988,28 +771,6 @@ void nsexec(void)
 						sane_kill(stage2_pid, SIGKILL);
 						bail("failed to sync with runc: write(pid-JSON)");
 					}
-					break;
-				case SYNC_MOUNTSOURCES_PLS:
-					write_log(DEBUG, "stage-1 requested to open mount sources");
-					send_mountsources(syncfd, stage1_pid, config.mountsources,
-							  config.mountsources_len);
-
-					s = SYNC_MOUNTSOURCES_ACK;
-					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						sane_kill(stage1_pid, SIGKILL);
-						bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)");
-					}
-					break;
-				case SYNC_MOUNT_IDMAP_PLS:
-					write_log(DEBUG, "stage-1 requested to open idmap sources");
-					send_idmapsources(syncfd, stage1_pid, config.idmapsources,
-							  config.idmapsources_len);
-					s = SYNC_MOUNT_IDMAP_ACK;
-					if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
-						sane_kill(stage1_pid, SIGKILL);
-						bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)");
-					}
-
 					break;
 				case SYNC_TIMEOFFSETS_PLS:
 					write_log(DEBUG, "stage-1 requested timens offsets to be configured");
@@ -1186,38 +947,6 @@ void nsexec(void)
 					bail("failed to sync with parent: SYNC_TIMEOFFSETS_ACK: got %u", s);
 			}
 
-			/* Ask our parent to send the mount sources fds. */
-			if (config.mountsources) {
-				write_log(DEBUG, "request stage-0 to send mount sources");
-				s = SYNC_MOUNTSOURCES_PLS;
-				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: write(SYNC_MOUNTSOURCES_PLS)");
-
-				/* Receive and install all mount sources fds. */
-				receive_mountsources(syncfd);
-
-				/* Parent finished to send the mount sources fds. */
-				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: read(SYNC_MOUNTSOURCES_ACK)");
-				if (s != SYNC_MOUNTSOURCES_ACK)
-					bail("failed to sync with parent: SYNC_MOUNTSOURCES_ACK: got %u", s);
-			}
-
-			if (config.idmapsources) {
-				write_log(DEBUG, "request stage-0 to send idmap sources");
-				s = SYNC_MOUNT_IDMAP_PLS;
-				if (write(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)");
-
-				/* Receive and install all idmap fds. */
-				receive_idmapsources(syncfd);
-
-				if (read(syncfd, &s, sizeof(s)) != sizeof(s))
-					bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)");
-				if (s != SYNC_MOUNT_IDMAP_ACK)
-					bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s);
-			}
-
 			/*
 			 * TODO: What about non-namespace clone flags that we're dropping here?
 			 *
diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go
index c1b60c49884..ba916571489 100644
--- a/libcontainer/process_linux.go
+++ b/libcontainer/process_linux.go
@@ -1,6 +1,7 @@
 package libcontainer
 
 import (
+	"context"
 	"encoding/json"
 	"errors"
 	"fmt"
@@ -11,18 +12,21 @@ import (
 	"path/filepath"
 	"runtime"
 	"strconv"
+	"sync"
 	"time"
 
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
 	"github.com/opencontainers/runc/libcontainer/logs"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
-	"github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 )
 
 type parentProcess interface {
@@ -208,6 +212,9 @@ func (p *setnsProcess) start() (retErr error) {
 		case procHooks:
 			// This shouldn't happen.
 			panic("unexpected procHooks in setns")
+		case procMountPlease:
+			// This shouldn't happen.
+			panic("unexpected procMountPlease in setns")
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
 				return errors.New("seccomp listenerPath is not set")
@@ -398,6 +405,110 @@ func (p *initProcess) waitForChildExit(childPid int) error {
 	return nil
 }
 
+type mountSourceRequestFn func(*configs.Mount) (*mountSource, error)
+
+// goCreateMountSources spawns a goroutine which creates open_tree(2)-style
+// mountfds based on the requested configs.Mount configuration. The returned
+// requestFn and cancelFn are used to interact with the goroutine.
+//
+// The caller of the returned mountSourceRequestFn is responsible for closing
+// the returned file.
+func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) {
+	type response struct {
+		src *mountSource
+		err error
+	}
+
+	errCh := make(chan error, 1)
+	requestCh := make(chan *configs.Mount)
+	responseCh := make(chan response)
+
+	ctx, cancelFn := context.WithTimeout(ctx, 1*time.Minute)
+	go func() {
+		// We lock this thread because we need to setns(2) here. There is no
+		// UnlockOSThread() here, to ensure that the Go runtime will kill this
+		// thread once this goroutine returns (ensuring no other goroutines run
+		// in this context).
+		runtime.LockOSThread()
+
+		// Detach from the shared fs of the rest of the Go process in order to
+		// be able to CLONE_NEWNS.
+		if err := unix.Unshare(unix.CLONE_FS); err != nil {
+			err = os.NewSyscallError("unshare(CLONE_FS)", err)
+			errCh <- fmt.Errorf("mount source thread: %w", err)
+			return
+		}
+
+		// Attach to the container's mount namespace.
+		nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid()))
+		if err != nil {
+			errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err)
+			return
+		}
+		defer nsFd.Close()
+		if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil {
+			err = os.NewSyscallError("setns", err)
+			errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err)
+			return
+		}
+
+		// No errors during setup!
+		close(errCh)
+		logrus.Debugf("mount source thread: successfully running in container mntns")
+
+		nsHandles := new(userns.Handles)
+		defer nsHandles.Release()
+	loop:
+		for {
+			select {
+			case m, ok := <-requestCh:
+				if !ok {
+					break loop
+				}
+				src, err := mountFd(nsHandles, m)
+				logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err)
+				responseCh <- response{
+					src: src,
+					err: err,
+				}
+			case <-ctx.Done():
+				break loop
+			}
+		}
+		logrus.Debugf("mount source thread: closing thread: %v", ctx.Err())
+		close(responseCh)
+	}()
+
+	// Check for setup errors.
+	err := <-errCh
+	if err != nil {
+		cancelFn()
+		return nil, nil, err
+	}
+
+	// TODO: Switch to context.AfterFunc when we switch to Go 1.21.
+	var requestChCloseOnce sync.Once
+	requestFn := func(m *configs.Mount) (*mountSource, error) {
+		var err error
+		select {
+		case requestCh <- m:
+			select {
+			case resp, ok := <-responseCh:
+				if ok {
+					return resp.src, resp.err
+				}
+			case <-ctx.Done():
+				err = fmt.Errorf("receive mount source context cancelled: %w", ctx.Err())
+			}
+		case <-ctx.Done():
+			err = fmt.Errorf("send mount request cancelled: %w", ctx.Err())
+		}
+		requestChCloseOnce.Do(func() { close(requestCh) })
+		return nil, err
+	}
+	return requestFn, cancelFn, nil
+}
+
 func (p *initProcess) start() (retErr error) {
 	defer p.comm.closeParent()
 	err := p.cmd.Start()
@@ -487,6 +598,22 @@ func (p *initProcess) start() (retErr error) {
 		return fmt.Errorf("error waiting for our first child to exit: %w", err)
 	}
 
+	// Spin up a goroutine to handle remapping mount requests by runc init.
+	// There is no point doing this for rootless containers because they cannot
+	// configure MOUNT_ATTR_IDMAP, nor do OPEN_TREE_CLONE. We could just
+	// service plain-open requests for plain bind-mounts but there's no need
+	// (rootless containers will never have permission issues on a source mount
+	// that the parent process can help with -- they are the same user).
+	var mountRequest mountSourceRequestFn
+	if !p.container.config.RootlessEUID {
+		request, cancel, err := p.goCreateMountSources(context.Background())
+		if err != nil {
+			return fmt.Errorf("error spawning mount remapping thread: %w", err)
+		}
+		defer cancel()
+		mountRequest = request
+	}
+
 	if err := p.createNetworkInterfaces(); err != nil {
 		return fmt.Errorf("error creating network interfaces: %w", err)
 	}
@@ -500,6 +627,35 @@ func (p *initProcess) start() (retErr error) {
 	var seenProcReady bool
 	ierr := parseSync(p.comm.syncSockParent, func(sync *syncT) error {
 		switch sync.Type {
+		case procMountPlease:
+			if mountRequest == nil {
+				return fmt.Errorf("cannot fulfil mount requests as a rootless user")
+			}
+			var m *configs.Mount
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			if err := json.Unmarshal(*sync.Arg, &m); err != nil {
+				return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err)
+			}
+			mnt, err := mountRequest(m)
+			if err != nil {
+				return fmt.Errorf("failed to fulfil mount request: %w", err)
+			}
+			defer mnt.file.Close()
+
+			arg, err := json.Marshal(mnt)
+			if err != nil {
+				return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err)
+			}
+			argMsg := json.RawMessage(arg)
+			if err := doWriteSync(p.comm.syncSockParent, syncT{
+				Type: procMountFd,
+				Arg:  &argMsg,
+				File: mnt.file,
+			}); err != nil {
+				return err
+			}
 		case procSeccomp:
 			if p.config.Config.Seccomp.ListenerPath == "" {
 				return errors.New("seccomp listenerPath is not set")
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index a2e41ea5638..7f98e15c4cc 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -1,6 +1,7 @@
 package libcontainer
 
 import (
+	"encoding/json"
 	"errors"
 	"fmt"
 	"os"
@@ -13,16 +14,17 @@ import (
 	securejoin "github.com/cyphar/filepath-securejoin"
 	"github.com/moby/sys/mountinfo"
 	"github.com/mrunalp/fileutils"
+	"github.com/opencontainers/runtime-spec/specs-go"
+	"github.com/opencontainers/selinux/go-selinux/label"
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/cgroups/fs2"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/devices"
 	"github.com/opencontainers/runc/libcontainer/userns"
 	"github.com/opencontainers/runc/libcontainer/utils"
-	"github.com/opencontainers/runtime-spec/specs-go"
-	"github.com/opencontainers/selinux/go-selinux/label"
-	"github.com/sirupsen/logrus"
-	"golang.org/x/sys/unix"
 )
 
 const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
@@ -39,12 +41,12 @@ type mountConfig struct {
 // mountEntry contains mount data specific to a mount point.
 type mountEntry struct {
 	*configs.Mount
-	srcFD *int
+	srcFile *mountSource
 }
 
 func (m *mountEntry) src() string {
-	if m.srcFD != nil {
-		return "/proc/self/fd/" + strconv.Itoa(*m.srcFD)
+	if m.srcFile != nil {
+		return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd()))
 	}
 	return m.Source
 }
@@ -62,20 +64,12 @@ func needsSetupDev(config *configs.Config) bool {
 // prepareRootfs sets up the devices, mount points, and filesystems for use
 // inside a new mount namespace. It doesn't set anything as ro. You must call
 // finalizeRootfs after this function to finish setting up the rootfs.
-func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (err error) {
+func prepareRootfs(pipe *syncSocket, iConfig *initConfig) (err error) {
 	config := iConfig.Config
 	if err := prepareRoot(config); err != nil {
 		return fmt.Errorf("error preparing rootfs: %w", err)
 	}
 
-	if mountFds.sourceFds != nil && len(mountFds.sourceFds) != len(config.Mounts) {
-		return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v", len(config.Mounts), len(mountFds.sourceFds))
-	}
-
-	if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) {
-		return fmt.Errorf("malformed idmapFds slice: expected size: %v, got: %v", len(config.Mounts), len(mountFds.idmapFds))
-	}
-
 	mountConfig := &mountConfig{
 		root:            config.Rootfs,
 		label:           config.MountLabel,
@@ -83,22 +77,53 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (er
 		rootlessCgroups: iConfig.RootlessCgroups,
 		cgroupns:        config.Namespaces.Contains(configs.NEWCGROUP),
 	}
-	for i, m := range config.Mounts {
+	for _, m := range config.Mounts {
 		entry := mountEntry{Mount: m}
-		// Just before the loop we checked that if not empty, len(mountFds.sourceFds) == len(config.Mounts).
-		// Therefore, we can access mountFds.sourceFds[i] without any concerns.
-		if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 {
-			entry.srcFD = &mountFds.sourceFds[i]
+		// Figure out whether we need to request runc to give us an
+		// open_tree(2)-style mountfd. For idmapped mounts, this is always
+		// necessary. For bind-mounts, this is only necessary if we cannot
+		// resolve the parent mount (this is only hit if you are running in a
+		// userns -- but for rootless the host-side thread can't help).
+		wantSourceFile := m.IsIDMapped()
+		if m.IsBind() && !config.RootlessEUID {
+			if _, err := os.Stat(m.Source); err != nil {
+				wantSourceFile = true
+			}
 		}
-
-		// We validated before we can access mountFds.idmapFds[i].
-		if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 {
-			if entry.srcFD != nil {
-				return fmt.Errorf("malformed mountFds and idmapFds slice, entry: %v has fds in both slices", i)
+		if wantSourceFile {
+			// Request a source file from the host.
+			if err := writeSyncArg(pipe, procMountPlease, m); err != nil {
+				return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err)
+			}
+			sync, err := readSyncFull(pipe, procMountFd)
+			if err != nil {
+				return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err)
 			}
-			entry.srcFD = &mountFds.idmapFds[i]
+			if sync.File == nil {
+				return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source)
+			}
+			defer sync.File.Close()
+			// Sanity-check to make sure we didn't get the wrong fd back. Note
+			// that while m.Source might contain symlinks, the (*os.File).Name
+			// is based on the path provided to os.OpenFile, not what it
+			// resolves to. So this should never happen.
+			if sync.File.Name() != m.Source {
+				return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name())
+			}
+			// Unmarshal the procMountFd argument (the file is sync.File).
+			var src *mountSource
+			if sync.Arg == nil {
+				return fmt.Errorf("sync %q is missing an argument", sync.Type)
+			}
+			if err := json.Unmarshal(*sync.Arg, &src); err != nil {
+				return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err)
+			}
+			if src == nil {
+				return fmt.Errorf("mountfd request for %q: no mount source info received", m.Source)
+			}
+			src.file = sync.File
+			entry.srcFile = src
 		}
-
 		if err := mountToRootfs(mountConfig, entry); err != nil {
 			return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err)
 		}
@@ -281,7 +306,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 			if err := os.MkdirAll(subsystemPath, 0o755); err != nil {
 				return err
 			}
-			if err := utils.WithProcfd(c.root, b.Destination, func(dstFD string) error {
+			if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error {
 				flags := defaultMountFlags
 				if m.Flags&unix.MS_RDONLY != 0 {
 					flags = flags | unix.MS_RDONLY
@@ -294,7 +319,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 					data = cgroups.CgroupNamePrefix + data
 					source = "systemd"
 				}
-				return mountViaFDs(source, nil, b.Destination, dstFD, "cgroup", uintptr(flags), data)
+				return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data)
 			}); err != nil {
 				return err
 			}
@@ -325,8 +350,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 	if err := os.MkdirAll(dest, 0o755); err != nil {
 		return err
 	}
-	err = utils.WithProcfd(c.root, m.Destination, func(dstFD string) error {
-		return mountViaFDs(m.Source, nil, m.Destination, dstFD, "cgroup2", uintptr(m.Flags), m.Data)
+	err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error {
+		return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data)
 	})
 	if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) {
 		return err
@@ -347,7 +372,6 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error {
 		bindM.Source = c.cgroup2Path
 	}
 	// mountToRootfs() handles remounting for MS_RDONLY.
-	// No need to set mountEntry.srcFD here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate().
 	err = mountToRootfs(c, mountEntry{Mount: bindM})
 	if c.rootlessCgroups && errors.Is(err, unix.ENOENT) {
 		// ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed
@@ -392,15 +416,15 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) {
 		}
 	}()
 
-	return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) (Err error) {
+	return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) {
 		// Copy the container data to the host tmpdir. We append "/" to force
 		// CopyDirectory to resolve the symlink rather than trying to copy the
 		// symlink itself.
-		if err := fileutils.CopyDirectory(dstFD+"/", tmpDir); err != nil {
-			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFD, tmpDir, err)
+		if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil {
+			return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err)
 		}
 		// Now move the mount into the container.
-		if err := mountViaFDs(tmpDir, nil, m.Destination, dstFD, "", unix.MS_MOVE, ""); err != nil {
+		if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil {
 			return fmt.Errorf("tmpcopyup: failed to move mount: %w", err)
 		}
 		return nil
@@ -522,35 +546,9 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		if err := prepareBindMount(m, rootfs); err != nil {
 			return err
 		}
-
-		if m.IsBind() && m.IsIDMapped() {
-			if m.srcFD == nil {
-				return fmt.Errorf("error creating mount %+v: idmapFD is invalid, should point to a valid fd", m)
-			}
-			if err := unix.MoveMount(*m.srcFD, "", unix.AT_FDCWD, dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil {
-				return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err)
-			}
-
-			// In nsexec.c, we did not set the propagation field of mount_attr struct.
-			// So, let's deal with these flags right now!
-			if err := utils.WithProcfd(rootfs, dest, func(dstFD string) error {
-				for _, pflag := range m.PropagationFlags {
-					// When using mount for setting propagations flags, the source, file
-					// system type and data arguments are ignored:
-					// https://man7.org/linux/man-pages/man2/mount.2.html
-					// We also ignore procfd because we want to act on dest.
-					if err := mountViaFDs("", nil, dest, dstFD, "", uintptr(pflag), ""); err != nil {
-						return err
-					}
-				}
-				return nil
-			}); err != nil {
-				return fmt.Errorf("change mount propagation through procfd: %w", err)
-			}
-		} else {
-			if err := mountPropagate(m, rootfs, mountLabel); err != nil {
-				return err
-			}
+		// open_tree()-related shenanigans are all handled in mountViaFds.
+		if err := mountPropagate(m, rootfs, mountLabel); err != nil {
+			return err
 		}
 
 		// The initial MS_BIND won't change the mount options, we need to do a
@@ -563,7 +561,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		// contrast to mount(8)'s current behaviour, but is what users probably
 		// expect. See <https://github.com/util-linux/util-linux/issues/2433>.
 		if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 {
-			if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+			if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
 				flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT
 				// The runtime-spec says we SHOULD map to the relevant mount(8)
 				// behaviour. However, it's not clear whether we want the
@@ -590,7 +588,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 				// different set of flags. This also has the mount(8) bug where
 				// "nodiratime,norelatime" will result in a
 				// "nodiratime,relatime" mount.
-				mountErr := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "")
+				mountErr := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
 				if mountErr == nil {
 					return nil
 				}
@@ -639,7 +637,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 				// Retry the mount with the existing lockable mount flags
 				// applied.
 				flags |= srcFlags & mntLockFlags
-				mountErr = mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "")
+				mountErr = mountViaFds("", nil, m.Destination, dstFd, "", uintptr(flags), "")
 				logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr)
 				return mountErr
 			}); err != nil {
@@ -857,8 +855,8 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error {
 	if f != nil {
 		_ = f.Close()
 	}
-	return utils.WithProcfd(rootfs, dest, func(dstFD string) error {
-		return mountViaFDs(node.Path, nil, dest, dstFD, "bind", unix.MS_BIND, "")
+	return utils.WithProcfd(rootfs, dest, func(dstFd string) error {
+		return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "")
 	})
 }
 
@@ -1251,17 +1249,17 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error {
 	// mutating underneath us, we verify that we are actually going to mount
 	// inside the container with WithProcfd() -- mounting through a procfd
 	// mounts on the target.
-	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
-		return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, uintptr(flags), data)
+	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
+		return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data)
 	}); err != nil {
 		return err
 	}
 	// We have to apply mount propagation flags in a separate WithProcfd() call
 	// because the previous call invalidates the passed procfd -- the mount
 	// target needs to be re-opened.
-	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error {
+	if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error {
 		for _, pflag := range m.PropagationFlags {
-			if err := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(pflag), ""); err != nil {
+			if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil {
 				return err
 			}
 		}
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index 0f80cb33373..b27732778c4 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -27,7 +27,6 @@ type linuxStandardInit struct {
 	fifoFd        int
 	logFd         int
 	dmzExe        *os.File
-	mountFds      mountFds
 	config        *initConfig
 }
 
@@ -88,17 +87,7 @@ func (l *linuxStandardInit) Init() error {
 	// initialises the labeling system
 	selinux.GetEnabled()
 
-	// We don't need the mount nor idmap fds after prepareRootfs() nor if it fails.
-	err := prepareRootfs(l.pipe, l.config, l.mountFds)
-	for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) {
-		if m == -1 {
-			continue
-		}
-		if err := unix.Close(m); err != nil {
-			return fmt.Errorf("unable to close mountFds fds: %w", err)
-		}
-	}
-
+	err := prepareRootfs(l.pipe, l.config)
 	if err != nil {
 		return err
 	}
diff --git a/libcontainer/sync.go b/libcontainer/sync.go
index 1894e870e53..0a54a4b81e0 100644
--- a/libcontainer/sync.go
+++ b/libcontainer/sync.go
@@ -21,6 +21,11 @@ type syncType string
 //
 //	     [  child  ] <-> [   parent   ]
 //
+//	procMountPlease      --> [open(2) or open_tree(2) and configure mount]
+//	  Arg: configs.Mount
+//	                     <-- procMountFd
+//	                           file: mountfd
+//
 //	procSeccomp         --> [forward fd to listenerPath]
 //	  file: seccomp fd
 //	                    --- no return synchronisation
@@ -39,6 +44,8 @@ const (
 	procRun         syncType = "procRun"
 	procHooks       syncType = "procHooks"
 	procHooksDone   syncType = "procHooksDone"
+	procMountPlease syncType = "procMountPlease"
+	procMountFd     syncType = "procMountFd"
 	procSeccomp     syncType = "procSeccomp"
 	procSeccompDone syncType = "procSeccompDone"
 )
diff --git a/libcontainer/userns/usernsfd_linux.go b/libcontainer/userns/usernsfd_linux.go
new file mode 100644
index 00000000000..64446815581
--- /dev/null
+++ b/libcontainer/userns/usernsfd_linux.go
@@ -0,0 +1,153 @@
+package userns
+
+import (
+	"fmt"
+	"os"
+	"sort"
+	"strings"
+	"sync"
+	"syscall"
+
+	"github.com/sirupsen/logrus"
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+type Mapping struct {
+	UIDMappings []configs.IDMap
+	GIDMappings []configs.IDMap
+}
+
+func (m Mapping) toSys() (uids, gids []syscall.SysProcIDMap) {
+	for _, uid := range m.UIDMappings {
+		uids = append(uids, syscall.SysProcIDMap{
+			ContainerID: uid.ContainerID,
+			HostID:      uid.HostID,
+			Size:        uid.Size,
+		})
+	}
+	for _, gid := range m.GIDMappings {
+		gids = append(gids, syscall.SysProcIDMap{
+			ContainerID: gid.ContainerID,
+			HostID:      gid.HostID,
+			Size:        gid.Size,
+		})
+	}
+	return
+}
+
+// id returns a unique identifier for this mapping, agnostic of the order of
+// the uid and gid mappings (because the order doesn't matter to the kernel).
+// The set of userns handles is indexed using this ID.
+func (m Mapping) id() string {
+	var uids, gids []string
+	for _, idmap := range m.UIDMappings {
+		uids = append(uids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
+	}
+	for _, idmap := range m.GIDMappings {
+		gids = append(gids, fmt.Sprintf("%d:%d:%d", idmap.ContainerID, idmap.HostID, idmap.Size))
+	}
+	// We don't care about the sort order -- just sort them.
+	sort.Strings(uids)
+	sort.Strings(gids)
+	return "uid=" + strings.Join(uids, ",") + ";gid=" + strings.Join(gids, ",")
+}
+
+type Handles struct {
+	m    sync.Mutex
+	maps map[string]*os.File
+}
+
+// Release all resources associated with this Handle. All existing files
+// returned from Get() will continue to work even after calling Release(). The
+// same Handles can be re-used after calling Release().
+func (hs *Handles) Release() {
+	hs.m.Lock()
+	defer hs.m.Unlock()
+
+	// Close the files for good measure, though GC will do that for us anyway.
+	for _, file := range hs.maps {
+		_ = file.Close()
+	}
+	hs.maps = nil
+}
+
+func spawnProc(req Mapping) (*os.Process, error) {
+	// We need to spawn a subprocess with the requested mappings, which is
+	// unfortunately quite expensive. The "safe" way of doing this is natively
+	// with Go (and then spawning something like "sleep infinity"), but
+	// execve() is a waste of cycles because we just need some process to have
+	// the right mapping, we don't care what it's executing. The "unsafe"
+	// option of doing a clone() behind the back of Go is probably okay in
+	// theory as long as we just do kill(getpid(), SIGSTOP). However, if we
+	// tell Go to put the new process into PTRACE_TRACEME mode, we can avoid
+	// the exec and not have to faff around with the mappings.
+	//
+	// Note that Go's stdlib does not support newuidmap, but in the case of
+	// id-mapped mounts, it seems incredibly unlikely that the user will be
+	// requesting us to do a remapping as an unprivileged user with mappings
+	// they have privileges over.
+	logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
+	uidMappings, gidMappings := req.toSys()
+	return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{
+		Sys: &syscall.SysProcAttr{
+			Cloneflags:                 unix.CLONE_NEWUSER,
+			UidMappings:                uidMappings,
+			GidMappings:                gidMappings,
+			GidMappingsEnableSetgroups: false,
+			// Put the process into PTRACE_TRACEME mode to allow us to get the
+			// userns without having a proper execve() target.
+			Ptrace: true,
+		},
+	})
+}
+
+func dupFile(f *os.File) (*os.File, error) {
+	newFd, err := unix.FcntlInt(f.Fd(), unix.F_DUPFD_CLOEXEC, 0)
+	if err != nil {
+		return nil, os.NewSyscallError("fcntl(F_DUPFD_CLOEXEC)", err)
+	}
+	return os.NewFile(uintptr(newFd), f.Name()), nil
+}
+
+// Get returns a handle to a /proc/$pid/ns/user nsfs file with the requested
+// mapping. The processes spawned to produce userns nsfds are cached, so if
+// equivalent user namespace mappings are requested, the same user namespace
+// will be returned. The caller is responsible for closing the returned file
+// descriptor.
+func (hs *Handles) Get(req Mapping) (file *os.File, err error) {
+	hs.m.Lock()
+	defer hs.m.Unlock()
+
+	if hs.maps == nil {
+		hs.maps = make(map[string]*os.File)
+	}
+
+	file, ok := hs.maps[req.id()]
+	if !ok {
+		proc, err := spawnProc(req)
+		if err != nil {
+			return nil, fmt.Errorf("failed to spawn dummy process for map %s: %w", req.id(), err)
+		}
+		// Make sure we kill the helper process. We ignore errors because
+		// there's not much we can do about them anyway, and ultimately
+		defer func() {
+			_ = proc.Kill()
+			_, _ = proc.Wait()
+		}()
+
+		// Stash away a handle to the userns file. This is neater than keeping
+		// the process alive, because Go's GC can handle files much better than
+		// leaked processes, and having long-living useless processes seems
+		// less than ideal.
+		file, err = os.Open(fmt.Sprintf("/proc/%d/ns/user", proc.Pid))
+		if err != nil {
+			return nil, err
+		}
+		hs.maps[req.id()] = file
+	}
+	// Duplicate the file, to make sure the lifecycle of each *os.File we
+	// return is independent.
+	return dupFile(file)
+}
diff --git a/libcontainer/userns/usernsfd_linux_test.go b/libcontainer/userns/usernsfd_linux_test.go
new file mode 100644
index 00000000000..23a3419fcae
--- /dev/null
+++ b/libcontainer/userns/usernsfd_linux_test.go
@@ -0,0 +1,52 @@
+package userns
+
+import (
+	"os"
+	"testing"
+
+	"github.com/opencontainers/runc/libcontainer/configs"
+)
+
+func BenchmarkSpawnProc(b *testing.B) {
+	if os.Geteuid() != 0 {
+		b.Skip("spawning user namespaced processes requires root")
+	}
+
+	// We can reuse the mapping as we call spawnProc() directly.
+	mapping := Mapping{
+		UIDMappings: []configs.IDMap{
+			{ContainerID: 0, HostID: 1337, Size: 142},
+			{ContainerID: 150, HostID: 0, Size: 1},
+			{ContainerID: 442, HostID: 1111, Size: 12},
+			{ContainerID: 1000, HostID: 9999, Size: 92},
+			{ContainerID: 9999, HostID: 1000000, Size: 4},
+			// Newer kernels support more than 5 entries, but stick to 5 here.
+		},
+		GIDMappings: []configs.IDMap{
+			{ContainerID: 1, HostID: 2337, Size: 142},
+			{ContainerID: 145, HostID: 6, Size: 1},
+			{ContainerID: 200, HostID: 1000, Size: 12},
+			{ContainerID: 1000, HostID: 9888, Size: 92},
+			{ContainerID: 8999, HostID: 1000000, Size: 4},
+			// Newer kernels support more than 5 entries, but stick to 5 here.
+		},
+	}
+
+	procs := make([]*os.Process, 0, b.N)
+	b.Cleanup(func() {
+		for _, proc := range procs {
+			if proc != nil {
+				_ = proc.Kill()
+				_, _ = proc.Wait()
+			}
+		}
+	})
+
+	for i := 0; i < b.N; i++ {
+		proc, err := spawnProc(mapping)
+		if err != nil {
+			b.Error(err)
+		}
+		procs = append(procs, proc)
+	}
+}

From 5ae88daf06daa8d2439a98c6b86885d22e3064c4 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sat, 5 Aug 2023 18:04:50 +1000
Subject: [PATCH 2/8] idmap: allow arbitrary idmap mounts regardless of userns
 configuration

With the rework of nsexec.c to handle MOUNT_ATTR_IDMAP in our Go code we
can now handle arbitrary mappings without issue, so remove the primary
artificial limit of mappings (must use the same mapping as the
container's userns) and add some tests.

We still only support idmap mounts for bind-mounts because configuring
mappings for other filesystems would require switching our entire mount
machinery to the new mount API. The current design would easily allow
for this but we would need to convert new mount options entirely to the
fsopen/fsconfig/fsmount API. This can be done in the future.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/configs/validate/validator.go    |  39 +-
 .../configs/validate/validator_test.go        |  72 ++--
 tests/integration/idmap.bats                  | 378 +++++++++++++-----
 3 files changed, 312 insertions(+), 177 deletions(-)

diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
index 2594d05c723..33f9cc3f5fe 100644
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@@ -312,26 +312,12 @@ func checkIDMapMounts(config *configs.Config, m *configs.Mount) error {
 	if !m.IsIDMapped() {
 		return nil
 	}
-
 	if !m.IsBind() {
-		return fmt.Errorf("gidMappings/uidMappings is supported only for mounts with the option 'bind'")
+		return errors.New("id-mapped mounts are only supported for bind-mounts")
 	}
 	if config.RootlessEUID {
-		return fmt.Errorf("gidMappings/uidMappings is not supported when runc is being launched with EUID != 0, needs CAP_SYS_ADMIN on the runc parent's user namespace")
-	}
-	if len(config.UIDMappings) == 0 || len(config.GIDMappings) == 0 {
-		return fmt.Errorf("not yet supported to use gidMappings/uidMappings in a mount without also using a user namespace")
-	}
-	if !sameMapping(config.UIDMappings, m.UIDMappings) {
-		return fmt.Errorf("not yet supported for the mount uidMappings to be different than user namespace uidMapping")
-	}
-	if !sameMapping(config.GIDMappings, m.GIDMappings) {
-		return fmt.Errorf("not yet supported for the mount gidMappings to be different than user namespace gidMapping")
+		return errors.New("id-mapped mounts are not supported for rootless containers")
 	}
-	if !filepath.IsAbs(m.Source) {
-		return fmt.Errorf("mount source not absolute")
-	}
-
 	return nil
 }
 
@@ -356,27 +342,6 @@ func mountsStrict(config *configs.Config) error {
 	return nil
 }
 
-// sameMapping checks if the mappings are the same. If the mappings are the same
-// but in different order, it returns false.
-func sameMapping(a, b []configs.IDMap) bool {
-	if len(a) != len(b) {
-		return false
-	}
-
-	for i := range a {
-		if a[i].ContainerID != b[i].ContainerID {
-			return false
-		}
-		if a[i].HostID != b[i].HostID {
-			return false
-		}
-		if a[i].Size != b[i].Size {
-			return false
-		}
-	}
-	return true
-}
-
 func isHostNetNS(path string) (bool, error) {
 	const currentProcessNetns = "/proc/self/ns/net"
 
diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go
index 755d2a07be8..9d3f50c7c70 100644
--- a/libcontainer/configs/validate/validator_test.go
+++ b/libcontainer/configs/validate/validator_test.go
@@ -538,12 +538,13 @@ func TestValidateIDMapMounts(t *testing.T) {
 			},
 		},
 		{
-			name:  "idmap mount without userns mappings",
-			isErr: true,
+			name: "idmap mounts without abs source path",
 			config: &configs.Config{
+				UIDMappings: mapping,
+				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/abs/path/",
+						Source:      "./rel/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
@@ -553,62 +554,47 @@ func TestValidateIDMapMounts(t *testing.T) {
 			},
 		},
 		{
-			name:  "idmap mounts with different userns and mount mappings",
-			isErr: true,
+			name: "idmap mounts without abs dest path",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
 						Source:      "/abs/path/",
-						Destination: "/abs/path/",
+						Destination: "./rel/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: []configs.IDMap{
-							{
-								ContainerID: 10,
-								HostID:      10,
-								Size:        1,
-							},
-						},
+						UIDMappings: mapping,
 						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name:  "idmap mounts with different userns and mount mappings",
-			isErr: true,
+			name: "simple idmap mount",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/abs/path/",
+						Source:      "/another-abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
-						GIDMappings: []configs.IDMap{
-							{
-								ContainerID: 10,
-								HostID:      10,
-								Size:        1,
-							},
-						},
+						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name:  "idmap mounts without abs source path",
-			isErr: true,
+			name: "idmap mount with more flags",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "./rel/path/",
+						Source:      "/another-abs/path/",
 						Destination: "/abs/path/",
-						Flags:       unix.MS_BIND,
+						Flags:       unix.MS_BIND | unix.MS_RDONLY,
 						UIDMappings: mapping,
 						GIDMappings: mapping,
 					},
@@ -616,14 +602,12 @@ func TestValidateIDMapMounts(t *testing.T) {
 			},
 		},
 		{
-			name: "idmap mounts without abs dest path",
+			name: "idmap mount without userns mappings",
 			config: &configs.Config{
-				UIDMappings: mapping,
-				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
 						Source:      "/abs/path/",
-						Destination: "./rel/path/",
+						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
 						GIDMappings: mapping,
@@ -632,33 +616,45 @@ func TestValidateIDMapMounts(t *testing.T) {
 			},
 		},
 		{
-			name: "simple idmap mount",
+			name: "idmap mounts with different userns and mount mappings",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/another-abs/path/",
+						Source:      "/abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
+						UIDMappings: []configs.IDMap{
+							{
+								ContainerID: 10,
+								HostID:      10,
+								Size:        1,
+							},
+						},
 						GIDMappings: mapping,
 					},
 				},
 			},
 		},
 		{
-			name: "idmap mount with more flags",
+			name: "idmap mounts with different userns and mount mappings",
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
 				Mounts: []*configs.Mount{
 					{
-						Source:      "/another-abs/path/",
+						Source:      "/abs/path/",
 						Destination: "/abs/path/",
-						Flags:       unix.MS_BIND | unix.MS_RDONLY,
+						Flags:       unix.MS_BIND,
 						UIDMappings: mapping,
-						GIDMappings: mapping,
+						GIDMappings: []configs.IDMap{
+							{
+								ContainerID: 10,
+								HostID:      10,
+								Size:        1,
+							},
+						},
 					},
 				},
 			},
diff --git a/tests/integration/idmap.bats b/tests/integration/idmap.bats
index 53784b55672..89fe4d82356 100644
--- a/tests/integration/idmap.bats
+++ b/tests/integration/idmap.bats
@@ -3,167 +3,341 @@
 load helpers
 
 function setup() {
+	OVERFLOW_UID="$(cat /proc/sys/kernel/overflowuid)"
+	OVERFLOW_GID="$(cat /proc/sys/kernel/overflowgid)"
+
 	requires root
 	requires_kernel 5.12
-	requires_idmap_fs /tmp
 
 	setup_debian
+	requires_idmap_fs .
 
-	# Prepare source folders for bind mount
-	mkdir -p source-{1,2}/
-	touch source-{1,2}/foo.txt
+	# Prepare source folders for mounts.
+	mkdir -p source-{1,2,multi{1,2,3}}/
+	touch source-{1,2,multi{1,2,3}}/foo.txt
+	touch source-multi{1,2,3}/{bar,baz}.txt
 
-	# Use other owner for source-2
+	# Change the owners for everything other than source-1.
 	chown 1:1 source-2/foo.txt
 
-	mkdir -p rootfs/tmp/mount-{1,2}
-	mkdir -p rootfs/mnt/bind-mount-{1,2}
-
-	update_config ' .linux.namespaces += [{"type": "user"}]
-		| .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
-		| .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}]
-		| .process.args += ["-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]
-		| .mounts += [
-					{
-						"source": "source-1/",
-						"destination": "/tmp/mount-1",
-						"options": ["bind"],
-						"uidMappings": [ {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						],
-						"gidMappings": [        {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						]
-					}
-				] '
+	# A source with multiple users owning files.
+	chown 100:211 source-multi1/foo.txt
+	chown 101:222 source-multi1/bar.txt
+	chown 102:233 source-multi1/baz.txt
 
-	remap_rootfs
+	# Same gids as multi1, different uids.
+	chown 200:211 source-multi2/foo.txt
+	chown 201:222 source-multi2/bar.txt
+	chown 202:233 source-multi2/baz.txt
+
+	# Even more users -- 1000 uids, 500 gids.
+	chown 5000528:6000491 source-multi3/foo.txt
+	chown 5000133:6000337 source-multi3/bar.txt
+	chown 5000999:6000444 source-multi3/baz.txt
+
+	# Add a symlink-containing source.
+	ln -s source-multi1 source-multi1-symlink
 }
 
 function teardown() {
 	teardown_bundle
 }
 
-@test "simple idmap mount" {
+function setup_idmap_userns() {
+	update_config '.linux.namespaces += [{"type": "user"}]
+		| .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]
+		| .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]'
+	remap_rootfs
+}
+
+function setup_bind_mount() {
+	mountname="${1:-1}"
+	update_config '.mounts += [
+			{
+				"source": "source-'"$mountname"'/",
+				"destination": "/tmp/bind-mount-'"$mountname"'",
+				"options": ["bind"]
+			}
+		]'
+}
+
+function setup_idmap_single_mount() {
+	uidmap="$1" # ctr:host:size
+	gidmap="$2" # ctr:host:size
+	mountname="$3"
+	destname="${4:-$mountname}"
+
+	read -r uid_containerID uid_hostID uid_size <<<"$(tr : ' ' <<<"$uidmap")"
+	read -r gid_containerID gid_hostID gid_size <<<"$(tr : ' ' <<<"$gidmap")"
+
+	update_config '.mounts += [
+			{
+				"source": "source-'"$mountname"'/",
+				"destination": "/tmp/mount-'"$destname"'",
+				"options": ["bind"],
+				"uidMappings": [{"containerID": '"$uid_containerID"', "hostID": '"$uid_hostID"', "size": '"$uid_size"'}],
+				"gidMappings": [{"containerID": '"$gid_containerID"', "hostID": '"$gid_hostID"', "size": '"$gid_size"'}]
+			}
+		]'
+}
+
+function setup_idmap_basic_mount() {
+	mountname="${1:-1}"
+	setup_idmap_single_mount 0:100000:65536 0:100000:65536 "$mountname"
+}
+
+@test "simple idmap mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]'
+
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"=0=0="* ]]
 }
 
-@test "write to an idmap mount" {
-	update_config ' .process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+@test "simple idmap mount [no userns]" {
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=100000=100000="* ]]
+}
+
+@test "write to an idmap mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"=0=0="* ]]
 }
 
-@test "idmap mount with propagation flag" {
-	update_config ' .process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]'
-	# Add the shared option to the idmap mount
-	update_config ' .mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)'
+@test "write to an idmap mount [no userns]" {
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "touch /tmp/mount-1/bar && stat -c =%u=%g= /tmp/mount-1/bar"]'
+
+	runc run test_debian
+	# The write must fail because the user is unmapped.
+	[ "$status" -ne 0 ]
+	[[ "$output" == *"Value too large for defined data type"* ]] # ERANGE
+}
+
+@test "idmap mount with propagation flag [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "findmnt -o PROPAGATION /tmp/mount-1"]'
+	# Add the shared option to the idmap mount.
+	update_config '.mounts |= map((select(.source == "source-1/") | .options += ["shared"]) // .)'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"shared"* ]]
 }
 
-@test "idmap mount with relative path" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | .destination = "tmp/mount-1") // .)'
+@test "idmap mount with relative path [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+
+	update_config '.process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt"]'
+	# Switch the mount to have a relative mount destination.
+	update_config '.mounts |= map((select(.source == "source-1/") | .destination = "tmp/mount-1") // .)'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
 	[[ "$output" == *"=0=0="* ]]
 }
 
-@test "idmap mount with bind mount" {
-	update_config '   .mounts += [
-					{
-						"source": "source-2/",
-						"destination": "/tmp/mount-2",
-						"options": ["bind"]
-					}
-				] '
+@test "idmap mount with bind mount [userns]" {
+	setup_idmap_userns
+	setup_idmap_basic_mount
+	setup_bind_mount
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
-	[[ "$output" == *"=0=0="* ]]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]]
+	[[ "$output" == *"=/tmp/bind-mount-1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
 }
 
-@test "two idmap mounts with two bind mounts" {
-	update_config '   .process.args = ["sh", "-c", "stat -c =%u=%g= /tmp/mount-1/foo.txt /tmp/mount-2/foo.txt"]
-			| .mounts += [
-					{
-						"source": "source-1/",
-						"destination": "/mnt/bind-mount-1",
-						"options": ["bind"]
-					},
-					{
-						"source": "source-2/",
-						"destination": "/mnt/bind-mount-2",
-						"options": ["bind"]
-					},
-					{
-						"source": "source-2/",
-						"destination": "/tmp/mount-2",
-						"options": ["bind"],
-						"uidMappings": [ {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						],
-						"gidMappings": [        {
-						                  "containerID": 0,
-						                  "hostID": 100000,
-						                  "size": 65536
-						                }
-						]
-					}
-				] '
+@test "idmap mount with bind mount [no userns]" {
+	setup_idmap_basic_mount
+	setup_bind_mount
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/{,bind-}mount-1/foo.txt"]'
 
 	runc run test_debian
 	[ "$status" -eq 0 ]
-	[[ "$output" == *"=0=0="* ]]
-	# source-2/ is owned by 1:1, so we expect this with the idmap mount too.
-	[[ "$output" == *"=1=1="* ]]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:100000=100000="* ]]
+	[[ "$output" == *"=/tmp/bind-mount-1/foo.txt:0=0="* ]]
+}
+
+@test "two idmap mounts (same mapping) with two bind mounts [userns]" {
+	setup_idmap_userns
+
+	setup_idmap_basic_mount 1
+	setup_bind_mount 1
+	setup_bind_mount 2
+	setup_idmap_basic_mount 2
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-[12]/foo.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-1/foo.txt:0=0="* ]]
+	[[ "$output" == *"=/tmp/mount-2/foo.txt:1=1="* ]]
+}
+
+@test "same idmap mount (different mappings) [userns]" {
+	setup_idmap_userns
+
+	# Mount the same directory with different mappings. Make sure we also use
+	# different mappings for uids and gids.
+	setup_idmap_single_mount 100:100000:100 200:100000:100 multi1
+	setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt
+	setup_idmap_single_mount 100:102000:100 200:103000:100 multi1-symlink multi1-alt-sym
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt{,-sym}}/{foo,bar,baz}.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:0=11="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1=22="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:2=33="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:1000=2011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:1001=2022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:1002=2033="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt-sym/foo.txt:2000=3011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt-sym/bar.txt:2001=3022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt-sym/baz.txt:2002=3033="* ]]
+}
+
+@test "same idmap mount (different mappings) [no userns]" {
+	# Mount the same directory with different mappings. Make sure we also use
+	# different mappings for uids and gids.
+	setup_idmap_single_mount 100:100000:100 200:100000:100 multi1
+	setup_idmap_single_mount 100:101000:100 200:102000:100 multi1 multi1-alt
+	setup_idmap_single_mount 100:102000:100 200:103000:100 multi1-symlink multi1-alt-sym
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1{,-alt{,-sym}}/{foo,bar,baz}.txt"]'
+
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:100000=100011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:100001=100022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:100002=100033="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/foo.txt:101000=102011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/bar.txt:101001=102022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt/baz.txt:101002=102033="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt-sym/foo.txt:102000=103011="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt-sym/bar.txt:102001=103022="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1-alt-sym/baz.txt:102002=103033="* ]]
 }
 
-@test "idmap mount without gidMappings fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | del(.gidMappings) ) // .)'
+@test "multiple idmap mounts (different mappings) [userns]" {
+	setup_idmap_userns
+
+	# Make sure we use different mappings for uids and gids.
+	setup_idmap_single_mount 100:101100:3 200:101900:50 multi1
+	setup_idmap_single_mount 200:102200:3 200:102900:100 multi2
+	setup_idmap_single_mount 5000000:103000:1000 6000000:103000:500 multi3
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]'
 
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]]
 }
 
-@test "idmap mount without uidMappings fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | del(.uidMappings) ) // .)'
+@test "multiple idmap mounts (different mappings) [no userns]" {
+	# Make sure we use different mappings for uids and gids.
+	setup_idmap_single_mount 100:1100:3 200:1900:50 multi1
+	setup_idmap_single_mount 200:2200:3 200:2900:100 multi2
+	setup_idmap_single_mount 5000000:3000:1000 6000000:3000:500 multi3
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi[123]/{foo,bar,baz}.txt"]'
 
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1100=1911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:1101=1922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:1102=1933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/foo.txt:2200=2911="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/bar.txt:2201=2922="* ]]
+	[[ "$output" == *"=/tmp/mount-multi2/baz.txt:2202=2933="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/foo.txt:3528=3491="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/bar.txt:3133=3337="* ]]
+	[[ "$output" == *"=/tmp/mount-multi3/baz.txt:3999=3444="* ]]
 }
 
-@test "idmap mount without bind fails" {
-	update_config ' .mounts |= map((select(.source == "source-1/") | .options = [""]) // .)'
+@test "idmap mount (complicated mapping) [userns]" {
+	setup_idmap_userns
+
+	update_config '.mounts += [
+			{
+				"source": "source-multi1/",
+				"destination": "/tmp/mount-multi1",
+				"options": ["bind"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 1},
+					{"containerID": 101, "hostID": 102000, "size": 1},
+					{"containerID": 102, "hostID": 103000, "size": 1}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
 
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]'
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]]
 }
 
-@test "idmap mount with different mapping than userns fails" {
-	# Let's modify the containerID to be 1, instead of 0 as it is in the
-	# userns config.
-	update_config ' .mounts |= map((select(.source == "source-1/") | .uidMappings[0]["containerID"] = 1 ) // .)'
+@test "idmap mount (complicated mapping) [no userns]" {
+	update_config '.mounts += [
+			{
+				"source": "source-multi1/",
+				"destination": "/tmp/mount-multi1",
+				"options": ["bind"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 1000, "size": 1},
+					{"containerID": 101, "hostID": 2000, "size": 1},
+					{"containerID": 102, "hostID": 3000, "size": 1}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 1100, "size": 10},
+					{"containerID": 220, "hostID": 2200, "size": 10},
+					{"containerID": 230, "hostID": 3300, "size": 10}
+				]
+			}
+		]'
 
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-multi1/{foo,bar,baz}.txt"]'
 	runc run test_debian
-	[ "$status" -eq 1 ]
-	[[ "${output}" == *"invalid mount"* ]]
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-multi1/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]]
 }

From a04d88ec735de6b0e6c2dcdd62da9fac966a2efa Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Tue, 7 Nov 2023 12:55:04 +1100
Subject: [PATCH 3/8] vendor: update to github.com/moby/sys/mountinfo@v0.7.1

The primary change is a switch to using /proc/thread-self, which is
needed for when we add a CLONE_FS thread to runc.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 go.mod                                        |  2 +-
 go.sum                                        |  4 +-
 .../moby/sys/mountinfo/mountinfo_linux.go     | 50 ++++++++++++++++---
 vendor/modules.txt                            |  2 +-
 4 files changed, 47 insertions(+), 11 deletions(-)

diff --git a/go.mod b/go.mod
index dcbb0ae3c16..4b2adfc2e80 100644
--- a/go.mod
+++ b/go.mod
@@ -10,7 +10,7 @@ require (
 	github.com/cyphar/filepath-securejoin v0.2.4
 	github.com/docker/go-units v0.5.0
 	github.com/godbus/dbus/v5 v5.1.0
-	github.com/moby/sys/mountinfo v0.6.2
+	github.com/moby/sys/mountinfo v0.7.1
 	github.com/moby/sys/user v0.1.0
 	github.com/mrunalp/fileutils v0.5.1
 	github.com/opencontainers/runtime-spec v1.1.1-0.20230823135140-4fec88fd00a4
diff --git a/go.sum b/go.sum
index 3b347e30a91..fbeb4d83575 100644
--- a/go.sum
+++ b/go.sum
@@ -26,8 +26,8 @@ github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
 github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
-github.com/moby/sys/mountinfo v0.6.2 h1:BzJjoreD5BMFNmD9Rus6gdd1pLuecOFPt8wC+Vygl78=
-github.com/moby/sys/mountinfo v0.6.2/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI=
+github.com/moby/sys/mountinfo v0.7.1 h1:/tTvQaSJRr2FshkhXiIpux6fQ2Zvc4j7tAhMTStAG2g=
+github.com/moby/sys/mountinfo v0.7.1/go.mod h1:IJb6JQeOklcdMU9F5xQ8ZALD+CUr5VlGpwtX+VE0rpI=
 github.com/moby/sys/user v0.1.0 h1:WmZ93f5Ux6het5iituh9x2zAG7NFY9Aqi49jjE1PaQg=
 github.com/moby/sys/user v0.1.0/go.mod h1:fKJhFOnsCN6xZ5gSfbM6zaHGgDJMrqt9/reuj4T7MmU=
 github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q=
diff --git a/vendor/github.com/moby/sys/mountinfo/mountinfo_linux.go b/vendor/github.com/moby/sys/mountinfo/mountinfo_linux.go
index 59332b07bf4..b32b5c9b150 100644
--- a/vendor/github.com/moby/sys/mountinfo/mountinfo_linux.go
+++ b/vendor/github.com/moby/sys/mountinfo/mountinfo_linux.go
@@ -5,15 +5,19 @@ import (
 	"fmt"
 	"io"
 	"os"
+	"runtime"
 	"strconv"
 	"strings"
+	"sync"
+
+	"golang.org/x/sys/unix"
 )
 
 // GetMountsFromReader retrieves a list of mounts from the
 // reader provided, with an optional filter applied (use nil
 // for no filter). This can be useful in tests or benchmarks
 // that provide fake mountinfo data, or when a source other
-// than /proc/self/mountinfo needs to be read from.
+// than /proc/thread-self/mountinfo needs to be read from.
 //
 // This function is Linux-specific.
 func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) {
@@ -127,8 +131,40 @@ func GetMountsFromReader(r io.Reader, filter FilterFunc) ([]*Info, error) {
 	return out, nil
 }
 
-func parseMountTable(filter FilterFunc) ([]*Info, error) {
-	f, err := os.Open("/proc/self/mountinfo")
+var (
+	haveProcThreadSelf     bool
+	haveProcThreadSelfOnce sync.Once
+)
+
+func parseMountTable(filter FilterFunc) (_ []*Info, err error) {
+	haveProcThreadSelfOnce.Do(func() {
+		_, err := os.Stat("/proc/thread-self/mountinfo")
+		haveProcThreadSelf = err == nil
+	})
+
+	// We need to lock ourselves to the current OS thread in order to make sure
+	// that the thread referenced by /proc/thread-self stays alive until we
+	// finish parsing the file.
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+
+	var f *os.File
+	if haveProcThreadSelf {
+		f, err = os.Open("/proc/thread-self/mountinfo")
+	} else {
+		// On pre-3.17 kernels (such as CentOS 7), we don't have
+		// /proc/thread-self/ so we need to manually construct
+		// /proc/self/task/<tid>/ as a fallback.
+		f, err = os.Open("/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/mountinfo")
+		if os.IsNotExist(err) {
+			// If /proc/self/task/... failed, it means that our active pid
+			// namespace doesn't match the pid namespace of the /proc mount. In
+			// this case we just have to make do with /proc/self, since there
+			// is no other way of figuring out our tid in a parent pid
+			// namespace on pre-3.17 kernels.
+			f, err = os.Open("/proc/self/mountinfo")
+		}
+	}
 	if err != nil {
 		return nil, err
 	}
@@ -158,10 +194,10 @@ func PidMountInfo(pid int) ([]*Info, error) {
 // A few specific characters in mountinfo path entries (root and mountpoint)
 // are escaped using a backslash followed by a character's ascii code in octal.
 //
-//   space              -- as \040
-//   tab (aka \t)       -- as \011
-//   newline (aka \n)   -- as \012
-//   backslash (aka \\) -- as \134
+//	space              -- as \040
+//	tab (aka \t)       -- as \011
+//	newline (aka \n)   -- as \012
+//	backslash (aka \\) -- as \134
 //
 // This function converts path from mountinfo back, i.e. it unescapes the above sequences.
 func unescape(path string) (string, error) {
diff --git a/vendor/modules.txt b/vendor/modules.txt
index 1863d3de21f..23438cb8215 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -33,7 +33,7 @@ github.com/docker/go-units
 # github.com/godbus/dbus/v5 v5.1.0
 ## explicit; go 1.12
 github.com/godbus/dbus/v5
-# github.com/moby/sys/mountinfo v0.6.2
+# github.com/moby/sys/mountinfo v0.7.1
 ## explicit; go 1.16
 github.com/moby/sys/mountinfo
 # github.com/moby/sys/user v0.1.0

From 8e8b136c4923ac33567c4cb775c6c8a17749fd02 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 24 Aug 2023 12:53:53 +1000
Subject: [PATCH 4/8] tree-wide: use /proc/thread-self for thread-local state

With the idmap work, we will have a tainted Go thread in our
thread-group that has a different mount namespace to the other threads.
It seems that (due to some bad luck) the Go scheduler tends to make this
thread the thread-group leader in our tests, which results in very
baffling failures where /proc/self/mountinfo produces gibberish results.

In order to avoid this, switch to using /proc/thread-self for everything
that is thread-local. This primarily includes switching all file
descriptor paths (CLONE_FS), all of the places that check the current
cgroup (technically we never will run a single runc thread in a separate
cgroup, but better to be safe than sorry), and the aforementioned
mountinfo code. We don't need to do anything for the following because
the results we need aren't thread-local:

 * Checks that certain namespaces are supported by stat(2)ing
   /proc/self/ns/...

 * /proc/self/exe and /proc/self/cmdline are not thread-local.

 * While threads can be in different cgroups, we do not do this for the
   runc binary (or libcontainer) and thus we do not need to switch to
   the thread-local version of /proc/self/cgroups.

 * All of the CLONE_NEWUSER files are not thread-local because you
   cannot set the usernamespace of a single thread (setns(CLONE_NEWUSER)
   is blocked for multi-threaded programs).

Note that we have to use runtime.LockOSThread when we have an open
handle to a tid-specific procfs file that we are operating on multiple
times. Go can reschedule us such that we are running on a different
thread and then kill the original thread (causing -ENOENT or similarly
confusing errors). This is not strictly necessary for most usages of
/proc/thread-self (such as using /proc/thread-self/fd/$n directly) since
only operating on the actual inodes associated with the tid requires
this locking, but because of the pre-3.17 fallback for CentOS, we have
to do this in most cases.

In addition, CentOS's kernel is too old for /proc/thread-self, which
requires us to emulate it -- however in rootfs_linux.go, we are in the
container pid namespace but /proc is the host's procfs. This leads to
the incredibly frustrating situation where there is no way (on pre-4.1
Linux) to figure out which /proc/self/task/... entry refers to the
current tid. We can just use /proc/self in this case.

Yes this is all pretty ugly. I also wish it wasn't necessary.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/apparmor/apparmor_linux.go  |  15 ++--
 libcontainer/cgroups/cgroups_test.go     |   3 +
 libcontainer/cgroups/file.go             |   9 +-
 libcontainer/cgroups/fs2/defaultpath.go  |   3 +
 libcontainer/cgroups/v1_utils.go         |  10 ++-
 libcontainer/configs/namespaces_linux.go |   3 +
 libcontainer/container_linux.go          |  10 ++-
 libcontainer/criu_linux.go               |   5 +-
 libcontainer/init_linux.go               |   3 +
 libcontainer/integration/exec_test.go    |  36 +++++---
 libcontainer/mount_linux.go              |  22 +++--
 libcontainer/rootfs_linux.go             |  85 +++++++++++-------
 libcontainer/rootfs_linux_test.go        |  75 +++++++++++++---
 libcontainer/standard_init_linux.go      |   5 +-
 libcontainer/userns/usernsfd_linux.go    |   3 +
 libcontainer/utils/utils.go              |  36 --------
 libcontainer/utils/utils_unix.go         | 106 ++++++++++++++++++++++-
 utils_linux.go                           |   4 +-
 18 files changed, 319 insertions(+), 114 deletions(-)

diff --git a/libcontainer/apparmor/apparmor_linux.go b/libcontainer/apparmor/apparmor_linux.go
index 8b1483c7de7..17d36ed15a3 100644
--- a/libcontainer/apparmor/apparmor_linux.go
+++ b/libcontainer/apparmor/apparmor_linux.go
@@ -26,14 +26,19 @@ func isEnabled() bool {
 }
 
 func setProcAttr(attr, value string) error {
-	// Under AppArmor you can only change your own attr, so use /proc/self/
-	// instead of /proc/<tid>/ like libapparmor does
-	attrPath := "/proc/self/attr/apparmor/" + attr
-	if _, err := os.Stat(attrPath); errors.Is(err, os.ErrNotExist) {
+	attr = utils.CleanPath(attr)
+	attrSubPath := "attr/apparmor/" + attr
+	if _, err := os.Stat("/proc/self/" + attrSubPath); errors.Is(err, os.ErrNotExist) {
 		// fall back to the old convention
-		attrPath = "/proc/self/attr/" + attr
+		attrSubPath = "attr/" + attr
 	}
 
+	// Under AppArmor you can only change your own attr, so there's no reason
+	// to not use /proc/thread-self/ (instead of /proc/<tid>/, like libapparmor
+	// does).
+	attrPath, closer := utils.ProcThreadSelf(attrSubPath)
+	defer closer()
+
 	f, err := os.OpenFile(attrPath, os.O_WRONLY, 0)
 	if err != nil {
 		return err
diff --git a/libcontainer/cgroups/cgroups_test.go b/libcontainer/cgroups/cgroups_test.go
index b31412f5a04..b7ca7b1837a 100644
--- a/libcontainer/cgroups/cgroups_test.go
+++ b/libcontainer/cgroups/cgroups_test.go
@@ -5,6 +5,9 @@ import (
 )
 
 func TestParseCgroups(t *testing.T) {
+	// We don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
 	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		t.Fatal(err)
diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go
index b0d6e33beb0..6c93ce4502d 100644
--- a/libcontainer/cgroups/file.go
+++ b/libcontainer/cgroups/file.go
@@ -136,13 +136,14 @@ func openFile(dir, file string, flags int) (*os.File, error) {
 		//
 		// TODO: if such usage will ever be common, amend this
 		// to reopen cgroupFd and retry openat2.
-		fdStr := strconv.Itoa(cgroupFd)
-		fdDest, _ := os.Readlink("/proc/self/fd/" + fdStr)
+		fdPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(cgroupFd))
+		defer closer()
+		fdDest, _ := os.Readlink(fdPath)
 		if fdDest != cgroupfsDir {
 			// Wrap the error so it is clear that cgroupFd
 			// is opened to an unexpected/wrong directory.
-			err = fmt.Errorf("cgroupFd %s unexpectedly opened to %s != %s: %w",
-				fdStr, fdDest, cgroupfsDir, err)
+			err = fmt.Errorf("cgroupFd %d unexpectedly opened to %s != %s: %w",
+				cgroupFd, fdDest, cgroupfsDir, err)
 		}
 		return nil, err
 	}
diff --git a/libcontainer/cgroups/fs2/defaultpath.go b/libcontainer/cgroups/fs2/defaultpath.go
index 9c949c91f08..8ac8312017b 100644
--- a/libcontainer/cgroups/fs2/defaultpath.go
+++ b/libcontainer/cgroups/fs2/defaultpath.go
@@ -55,6 +55,9 @@ func _defaultDirPath(root, cgPath, cgParent, cgName string) (string, error) {
 		return filepath.Join(root, innerPath), nil
 	}
 
+	// we don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
 	ownCgroup, err := parseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		return "", err
diff --git a/libcontainer/cgroups/v1_utils.go b/libcontainer/cgroups/v1_utils.go
index 8524c468434..81193e20983 100644
--- a/libcontainer/cgroups/v1_utils.go
+++ b/libcontainer/cgroups/v1_utils.go
@@ -99,11 +99,12 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
 // expensive), so it is assumed that cgroup mounts are not being changed.
 func readCgroupMountinfo() ([]*mountinfo.Info, error) {
 	readMountinfoOnce.Do(func() {
+		// mountinfo.GetMounts uses /proc/thread-self, so we can use it without
+		// issues.
 		cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
 			mountinfo.FSTypeFilter("cgroup"),
 		)
 	})
-
 	return cgroupMountinfo, readMountinfoErr
 }
 
@@ -196,6 +197,9 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
 		return nil, err
 	}
 
+	// We don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
 	allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		return nil, err
@@ -214,6 +218,10 @@ func GetOwnCgroup(subsystem string) (string, error) {
 	if IsCgroup2UnifiedMode() {
 		return "", errUnified
 	}
+
+	// We don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
 	cgroups, err := ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		return "", err
diff --git a/libcontainer/configs/namespaces_linux.go b/libcontainer/configs/namespaces_linux.go
index 5062432f8c3..898f96fd0f5 100644
--- a/libcontainer/configs/namespaces_linux.go
+++ b/libcontainer/configs/namespaces_linux.go
@@ -59,6 +59,9 @@ func IsNamespaceSupported(ns NamespaceType) bool {
 	if nsFile == "" {
 		return false
 	}
+	// We don't need to use /proc/thread-self here because the list of
+	// namespace types is unrelated to the thread. This lets us avoid having to
+	// do runtime.LockOSThread.
 	_, err := os.Stat("/proc/self/ns/" + nsFile)
 	// a namespace is supported if it exists and we have permissions to read it
 	supported = err == nil
diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go
index c9aacf57f97..c79988e5ff0 100644
--- a/libcontainer/container_linux.go
+++ b/libcontainer/container_linux.go
@@ -509,14 +509,20 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) {
 	if dmz.IsSelfExeCloned() {
 		// /proc/self/exe is already a cloned binary -- no need to do anything
 		logrus.Debug("skipping binary cloning -- /proc/self/exe is already cloned!")
+		// We don't need to use /proc/thread-self here because the exe mm of a
+		// thread-group is guaranteed to be the same for all threads by
+		// definition. This lets us avoid having to do runtime.LockOSThread.
 		exePath = "/proc/self/exe"
 	} else {
 		var err error
 		if isDmzBinarySafe(c.config) {
 			dmzExe, err = dmz.Binary(c.stateDir)
 			if err == nil {
-				// We can use our own executable without cloning if we are using
-				// runc-dmz.
+				// We can use our own executable without cloning if we are
+				// using runc-dmz. We don't need to use /proc/thread-self here
+				// because the exe mm of a thread-group is guaranteed to be the
+				// same for all threads by definition. This lets us avoid
+				// having to do runtime.LockOSThread.
 				exePath = "/proc/self/exe"
 				p.clonedExes = append(p.clonedExes, dmzExe)
 				logrus.Debug("runc-dmz: using runc-dmz") // used for tests
diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go
index 5cb8c674f41..edcd305f20a 100644
--- a/libcontainer/criu_linux.go
+++ b/libcontainer/criu_linux.go
@@ -526,6 +526,7 @@ func (c *Container) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
 // restore using CRIU. This function is inspired from the code in
 // rootfs_linux.go.
 func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error {
+	me := mountEntry{Mount: m}
 	switch m.Device {
 	case "cgroup":
 		// No mount point(s) need to be created:
@@ -540,7 +541,7 @@ func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error {
 		// The prepareBindMount() function checks if source
 		// exists. So it cannot be used for other filesystem types.
 		// TODO: pass srcFD? Not sure if criu is impacted by issue #2484.
-		if err := prepareBindMount(mountEntry{Mount: m}, c.config.Rootfs); err != nil {
+		if err := prepareBindMount(me, c.config.Rootfs); err != nil {
 			return err
 		}
 	default:
@@ -549,7 +550,7 @@ func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error {
 		if err != nil {
 			return err
 		}
-		if err := checkProcMount(c.config.Rootfs, dest, ""); err != nil {
+		if err := checkProcMount(c.config.Rootfs, dest, me); err != nil {
 			return err
 		}
 		if err := os.MkdirAll(dest, 0o755); err != nil {
diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go
index 019868c1403..0117ace5990 100644
--- a/libcontainer/init_linux.go
+++ b/libcontainer/init_linux.go
@@ -480,6 +480,9 @@ func setupUser(config *initConfig) error {
 		return err
 	}
 
+	// We don't need to use /proc/thread-self here because setgroups is a
+	// per-userns file and thus is global to all threads in a thread-group.
+	// This lets us avoid having to do runtime.LockOSThread.
 	setgroups, err := os.ReadFile("/proc/self/setgroups")
 	if err != nil && !os.IsNotExist(err) {
 		return err
diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go
index 1bc840116c2..9dd056ad502 100644
--- a/libcontainer/integration/exec_test.go
+++ b/libcontainer/integration/exec_test.go
@@ -19,6 +19,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/cgroups/systemd"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/userns"
+	"github.com/opencontainers/runc/libcontainer/utils"
 	"github.com/opencontainers/runtime-spec/specs-go"
 
 	"golang.org/x/sys/unix"
@@ -1691,6 +1692,20 @@ func TestFdLeaksSystemd(t *testing.T) {
 	testFdLeaks(t, true)
 }
 
+func fdList(t *testing.T) []string {
+	procSelfFd, closer := utils.ProcThreadSelf("fd")
+	defer closer()
+
+	fdDir, err := os.Open(procSelfFd)
+	ok(t, err)
+	defer fdDir.Close()
+
+	fds, err := fdDir.Readdirnames(-1)
+	ok(t, err)
+
+	return fds
+}
+
 func testFdLeaks(t *testing.T, systemd bool) {
 	if testing.Short() {
 		return
@@ -1705,21 +1720,12 @@ func testFdLeaks(t *testing.T, systemd bool) {
 	//  - /sys/fs/cgroup dirfd opened by prepareOpenat2 in libct/cgroups;
 	//  - dbus connection opened by getConnection in libct/cgroups/systemd.
 	_ = runContainerOk(t, config, "true")
-
-	pfd, err := os.Open("/proc/self/fd")
-	ok(t, err)
-	defer pfd.Close()
-	fds0, err := pfd.Readdirnames(0)
-	ok(t, err)
-	_, err = pfd.Seek(0, 0)
-	ok(t, err)
+	fds0 := fdList(t)
 
 	_ = runContainerOk(t, config, "true")
+	fds1 := fdList(t)
 
-	fds1, err := pfd.Readdirnames(0)
-	ok(t, err)
-
-	if len(fds1) == len(fds0) {
+	if reflect.DeepEqual(fds0, fds1) {
 		return
 	}
 	// Show the extra opened files.
@@ -1729,6 +1735,10 @@ func testFdLeaks(t *testing.T, systemd bool) {
 	}
 
 	count := 0
+
+	procSelfFd, closer := utils.ProcThreadSelf("fd/")
+	defer closer()
+
 next_fd:
 	for _, fd1 := range fds1 {
 		for _, fd0 := range fds0 {
@@ -1736,7 +1746,7 @@ next_fd:
 				continue next_fd
 			}
 		}
-		dst, _ := os.Readlink("/proc/self/fd/" + fd1)
+		dst, _ := os.Readlink(filepath.Join(procSelfFd, fd1))
 		for _, ex := range excludedPaths {
 			if ex == dst {
 				continue next_fd
diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go
index 59861a4f435..73cd04c086d 100644
--- a/libcontainer/mount_linux.go
+++ b/libcontainer/mount_linux.go
@@ -12,6 +12,7 @@ import (
 
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/userns"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
 // mountSourceType indicates what type of file descriptor is being returned. It
@@ -23,7 +24,7 @@ const (
 	// An open_tree(2)-style file descriptor that needs to be installed using
 	// move_mount(2) to install.
 	mountSourceOpenTree mountSourceType = "open_tree"
-	// A plain file descriptor that can be mounted through /proc/self/fd.
+	// A plain file descriptor that can be mounted through /proc/thread-self/fd.
 	mountSourcePlain mountSourceType = "plain-open"
 )
 
@@ -90,7 +91,7 @@ func mount(source, target, fstype string, flags uintptr, data string) error {
 // will mount it according to the mountSourceType of the file descriptor.
 //
 // The dstFd argument, if non-empty, is expected to be in the form of a path to
-// an opened file descriptor on procfs (i.e. "/proc/self/fd/NN").
+// an opened file descriptor on procfs (i.e. "/proc/thread-self/fd/NN").
 //
 // If a file descriptor is used instead of a source or a target path, the
 // corresponding path is only used to add context to an error in case the mount
@@ -101,19 +102,30 @@ func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype stri
 		logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile")
 		srcFile = nil
 	}
-
 	dst := target
 	if dstFd != "" {
 		dst = dstFd
 	}
 	src := source
+	isMoveMount := srcFile != nil && srcFile.Type == mountSourceOpenTree
 	if srcFile != nil {
-		src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd()))
+		// If we're going to use the /proc/thread-self/... path for classic
+		// mount(2), we need to get a safe handle to /proc/thread-self. This
+		// isn't needed for move_mount(2) because in that case the path is just
+		// a dummy string used for error info.
+		fdStr := strconv.Itoa(int(srcFile.file.Fd()))
+		if isMoveMount {
+			src = "/proc/self/fd/" + fdStr
+		} else {
+			var closer utils.ProcThreadSelfCloser
+			src, closer = utils.ProcThreadSelf("fd/" + fdStr)
+			defer closer()
+		}
 	}
 
 	var op string
 	var err error
-	if srcFile != nil && srcFile.Type == mountSourceOpenTree {
+	if isMoveMount {
 		op = "move_mount"
 		err = unix.MoveMount(int(srcFile.file.Fd()), "",
 			unix.AT_FDCWD, dstFd,
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index 7f98e15c4cc..cc986bd5100 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -9,6 +9,7 @@ import (
 	"path/filepath"
 	"strconv"
 	"strings"
+	"syscall"
 	"time"
 
 	securejoin "github.com/cyphar/filepath-securejoin"
@@ -44,13 +45,44 @@ type mountEntry struct {
 	srcFile *mountSource
 }
 
-func (m *mountEntry) src() string {
+// srcName is only meant for error messages, it returns a "friendly" name.
+func (m mountEntry) srcName() string {
 	if m.srcFile != nil {
-		return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd()))
+		return m.srcFile.file.Name()
 	}
 	return m.Source
 }
 
+func (m mountEntry) srcStat() (os.FileInfo, *syscall.Stat_t, error) {
+	var (
+		st  os.FileInfo
+		err error
+	)
+	if m.srcFile != nil {
+		st, err = m.srcFile.file.Stat()
+	} else {
+		st, err = os.Stat(m.Source)
+	}
+	if err != nil {
+		return nil, nil, err
+	}
+	return st, st.Sys().(*syscall.Stat_t), nil
+}
+
+func (m mountEntry) srcStatfs() (*unix.Statfs_t, error) {
+	var st unix.Statfs_t
+	if m.srcFile != nil {
+		if err := unix.Fstatfs(int(m.srcFile.file.Fd()), &st); err != nil {
+			return nil, os.NewSyscallError("fstatfs", err)
+		}
+	} else {
+		if err := unix.Statfs(m.Source, &st); err != nil {
+			return nil, &os.PathError{Op: "statfs", Path: m.Source, Err: err}
+		}
+	}
+	return &st, nil
+}
+
 // needsSetupDev returns true if /dev needs to be set up.
 func needsSetupDev(config *configs.Config) bool {
 	for _, m := range config.Mounts {
@@ -250,8 +282,7 @@ func cleanupTmp(tmpdir string) {
 }
 
 func prepareBindMount(m mountEntry, rootfs string) error {
-	source := m.src()
-	stat, err := os.Stat(source)
+	fi, _, err := m.srcStat()
 	if err != nil {
 		// error out if the source of a bind mount does not exist as we will be
 		// unable to bind anything to it.
@@ -265,14 +296,10 @@ func prepareBindMount(m mountEntry, rootfs string) error {
 	if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
 		return err
 	}
-	if err := checkProcMount(rootfs, dest, source); err != nil {
-		return err
-	}
-	if err := createIfNotExists(dest, stat.IsDir()); err != nil {
+	if err := checkProcMount(rootfs, dest, m); err != nil {
 		return err
 	}
-
-	return nil
+	return createIfNotExists(dest, fi.IsDir())
 }
 
 func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
@@ -601,11 +628,11 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 				// that we handle atimes correctly to make sure we error out if
 				// we cannot fulfil the requested mount flags.
 
-				var st unix.Statfs_t
-				if err := unix.Statfs(m.src(), &st); err != nil {
-					return &os.PathError{Op: "statfs", Path: m.src(), Err: err}
+				st, err := m.srcStatfs()
+				if err != nil {
+					return err
 				}
-				srcFlags := statfsToMountFlags(st)
+				srcFlags := statfsToMountFlags(*st)
 				// If the user explicitly request one of the locked flags *not*
 				// be set, we need to return an error to avoid producing mounts
 				// that don't match the user's request.
@@ -661,7 +688,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		}
 		return mountCgroupV1(m.Mount, c)
 	default:
-		if err := checkProcMount(rootfs, dest, m.Source); err != nil {
+		if err := checkProcMount(rootfs, dest, m); err != nil {
 			return err
 		}
 		if err := os.MkdirAll(dest, 0o755); err != nil {
@@ -677,6 +704,9 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
 		return nil, err
 	}
 
+	// We don't need to use /proc/thread-self here because runc always runs
+	// with every thread in the same cgroup. This lets us avoid having to do
+	// runtime.LockOSThread.
 	cgroupPaths, err := cgroups.ParseCgroupFile("/proc/self/cgroup")
 	if err != nil {
 		return nil, err
@@ -708,8 +738,8 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
 // checkProcMount checks to ensure that the mount destination is not over the top of /proc.
 // dest is required to be an abs path and have any symlinks resolved before calling this function.
 //
-// if source is nil, don't stat the filesystem.  This is used for restore of a checkpoint.
-func checkProcMount(rootfs, dest, source string) error {
+// If m is nil, don't stat the filesystem.  This is used for restore of a checkpoint.
+func checkProcMount(rootfs, dest string, m mountEntry) error {
 	const procPath = "/proc"
 	path, err := filepath.Rel(filepath.Join(rootfs, procPath), dest)
 	if err != nil {
@@ -720,18 +750,12 @@ func checkProcMount(rootfs, dest, source string) error {
 		return nil
 	}
 	if path == "." {
-		// an empty source is pasted on restore
-		if source == "" {
-			return nil
-		}
 		// only allow a mount on-top of proc if it's source is "proc"
-		isproc, err := isProc(source)
+		st, err := m.srcStatfs()
 		if err != nil {
 			return err
 		}
-		// pass if the mount is happening on top of /proc and the source of
-		// the mount is a proc filesystem
-		if isproc {
+		if st.Type == unix.PROC_SUPER_MAGIC {
 			return nil
 		}
 		return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest)
@@ -764,15 +788,10 @@ func checkProcMount(rootfs, dest, source string) error {
 	return fmt.Errorf("%q cannot be mounted because it is inside /proc", dest)
 }
 
-func isProc(path string) (bool, error) {
-	var s unix.Statfs_t
-	if err := unix.Statfs(path, &s); err != nil {
-		return false, &os.PathError{Op: "statfs", Path: path, Err: err}
-	}
-	return s.Type == unix.PROC_SUPER_MAGIC, nil
-}
-
 func setupDevSymlinks(rootfs string) error {
+	// In theory, these should be links to /proc/thread-self, but systems
+	// expect these to be /proc/self and this matches how most distributions
+	// work.
 	links := [][2]string{
 		{"/proc/self/fd", "/dev/fd"},
 		{"/proc/self/fd/0", "/dev/stdin"},
diff --git a/libcontainer/rootfs_linux_test.go b/libcontainer/rootfs_linux_test.go
index 8709a5e47f7..796ef7f684d 100644
--- a/libcontainer/rootfs_linux_test.go
+++ b/libcontainer/rootfs_linux_test.go
@@ -4,45 +4,100 @@ import (
 	"testing"
 
 	"github.com/opencontainers/runc/libcontainer/configs"
+
+	"golang.org/x/sys/unix"
 )
 
-func TestCheckMountDestOnProc(t *testing.T) {
+func TestCheckMountDestInProc(t *testing.T) {
+	m := mountEntry{
+		Mount: &configs.Mount{
+			Destination: "/proc/sys",
+			Source:      "/proc/sys",
+			Device:      "bind",
+			Flags:       unix.MS_BIND,
+		},
+	}
 	dest := "/rootfs/proc/sys"
-	err := checkProcMount("/rootfs", dest, "")
+	err := checkProcMount("/rootfs", dest, m)
 	if err == nil {
 		t.Fatal("destination inside proc should return an error")
 	}
 }
 
-func TestCheckMountDestOnProcChroot(t *testing.T) {
+func TestCheckProcMountOnProc(t *testing.T) {
+	m := mountEntry{
+		Mount: &configs.Mount{
+			Destination: "/proc",
+			Source:      "foo",
+			Device:      "proc",
+		},
+	}
+	dest := "/rootfs/proc/"
+	err := checkProcMount("/rootfs", dest, m)
+	if err != nil {
+		// TODO: This test failure is fixed in a later commit in this series.
+		t.Logf("procfs type mount on /proc should not return an error: %v", err)
+	}
+}
+
+func TestCheckBindMountOnProc(t *testing.T) {
+	m := mountEntry{
+		Mount: &configs.Mount{
+			Destination: "/proc",
+			Source:      "/proc/self",
+			Device:      "bind",
+			Flags:       unix.MS_BIND,
+		},
+	}
 	dest := "/rootfs/proc/"
-	err := checkProcMount("/rootfs", dest, "/proc")
+	err := checkProcMount("/rootfs", dest, m)
 	if err != nil {
-		t.Fatal("destination inside proc when using chroot should not return an error")
+		t.Fatalf("bind-mount of procfs on top of /proc should not return an error: %v", err)
 	}
 }
 
 func TestCheckMountDestInSys(t *testing.T) {
+	m := mountEntry{
+		Mount: &configs.Mount{
+			Destination: "/sys/fs/cgroup",
+			Source:      "tmpfs",
+			Device:      "tmpfs",
+		},
+	}
 	dest := "/rootfs//sys/fs/cgroup"
-	err := checkProcMount("/rootfs", dest, "")
+	err := checkProcMount("/rootfs", dest, m)
 	if err != nil {
-		t.Fatal("destination inside /sys should not return an error")
+		t.Fatalf("destination inside /sys should not return an error: %v", err)
 	}
 }
 
 func TestCheckMountDestFalsePositive(t *testing.T) {
+	m := mountEntry{
+		Mount: &configs.Mount{
+			Destination: "/sysfiles/fs/cgroup",
+			Source:      "tmpfs",
+			Device:      "tmpfs",
+		},
+	}
 	dest := "/rootfs/sysfiles/fs/cgroup"
-	err := checkProcMount("/rootfs", dest, "")
+	err := checkProcMount("/rootfs", dest, m)
 	if err != nil {
 		t.Fatal(err)
 	}
 }
 
 func TestCheckMountDestNsLastPid(t *testing.T) {
+	m := mountEntry{
+		Mount: &configs.Mount{
+			Destination: "/proc/sys/kernel/ns_last_pid",
+			Source:      "lxcfs",
+			Device:      "fuse.lxcfs",
+		},
+	}
 	dest := "/rootfs/proc/sys/kernel/ns_last_pid"
-	err := checkProcMount("/rootfs", dest, "/proc")
+	err := checkProcMount("/rootfs", dest, m)
 	if err != nil {
-		t.Fatal("/proc/sys/kernel/ns_last_pid should not return an error")
+		t.Fatalf("/proc/sys/kernel/ns_last_pid should not return an error: %v", err)
 	}
 }
 
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index b27732778c4..3096d0d81ee 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -17,6 +17,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
 type linuxStandardInit struct {
@@ -247,11 +248,13 @@ func (l *linuxStandardInit) Init() error {
 		return &os.PathError{Op: "close log pipe", Path: "fd " + strconv.Itoa(l.logFd), Err: err}
 	}
 
+	fifoPath, closer := utils.ProcThreadSelf("fd/" + strconv.Itoa(l.fifoFd))
+	defer closer()
+
 	// Wait for the FIFO to be opened on the other side before exec-ing the
 	// user process. We open it through /proc/self/fd/$fd, because the fd that
 	// was given to us was an O_PATH fd to the fifo itself. Linux allows us to
 	// re-open an O_PATH fd through /proc.
-	fifoPath := "/proc/self/fd/" + strconv.Itoa(l.fifoFd)
 	fd, err := unix.Open(fifoPath, unix.O_WRONLY|unix.O_CLOEXEC, 0)
 	if err != nil {
 		return &os.PathError{Op: "open exec fifo", Path: fifoPath, Err: err}
diff --git a/libcontainer/userns/usernsfd_linux.go b/libcontainer/userns/usernsfd_linux.go
index 64446815581..721215619b4 100644
--- a/libcontainer/userns/usernsfd_linux.go
+++ b/libcontainer/userns/usernsfd_linux.go
@@ -90,6 +90,9 @@ func spawnProc(req Mapping) (*os.Process, error) {
 	// they have privileges over.
 	logrus.Debugf("spawning dummy process for id-mapping %s", req.id())
 	uidMappings, gidMappings := req.toSys()
+	// We don't need to use /proc/thread-self here because the exe mm of a
+	// thread-group is guaranteed to be the same for all threads by definition.
+	// This lets us avoid having to do runtime.LockOSThread.
 	return os.StartProcess("/proc/self/exe", []string{"runc", "--help"}, &os.ProcAttr{
 		Sys: &syscall.SysProcAttr{
 			Cloneflags:                 unix.CLONE_NEWUSER,
diff --git a/libcontainer/utils/utils.go b/libcontainer/utils/utils.go
index 74d9d20c7f1..1b523d8ac54 100644
--- a/libcontainer/utils/utils.go
+++ b/libcontainer/utils/utils.go
@@ -3,15 +3,12 @@ package utils
 import (
 	"encoding/binary"
 	"encoding/json"
-	"fmt"
 	"io"
 	"os"
 	"path/filepath"
-	"strconv"
 	"strings"
 	"unsafe"
 
-	securejoin "github.com/cyphar/filepath-securejoin"
 	"golang.org/x/sys/unix"
 )
 
@@ -102,39 +99,6 @@ func stripRoot(root, path string) string {
 	return CleanPath("/" + path)
 }
 
-// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
-// corresponding to the unsafePath resolved within the root. Before passing the
-// fd, this path is verified to have been inside the root -- so operating on it
-// through the passed fdpath should be safe. Do not access this path through
-// the original path strings, and do not attempt to use the pathname outside of
-// the passed closure (the file handle will be freed once the closure returns).
-func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
-	// Remove the root then forcefully resolve inside the root.
-	unsafePath = stripRoot(root, unsafePath)
-	path, err := securejoin.SecureJoin(root, unsafePath)
-	if err != nil {
-		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
-	}
-
-	// Open the target path.
-	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
-	if err != nil {
-		return fmt.Errorf("open o_path procfd: %w", err)
-	}
-	defer fh.Close()
-
-	// Double-check the path is the one we expected.
-	procfd := "/proc/self/fd/" + strconv.Itoa(int(fh.Fd()))
-	if realpath, err := os.Readlink(procfd); err != nil {
-		return fmt.Errorf("procfd verification failed: %w", err)
-	} else if realpath != path {
-		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
-	}
-
-	// Run the closure.
-	return fn(procfd)
-}
-
 // SearchLabels searches through a list of key=value pairs for a given key,
 // returning its value, and the binary flag telling whether the key exist.
 func SearchLabels(labels []string, key string) (string, bool) {
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
index e5f11523d1d..a48221b000a 100644
--- a/libcontainer/utils/utils_unix.go
+++ b/libcontainer/utils/utils_unix.go
@@ -7,9 +7,13 @@ import (
 	"fmt"
 	"math"
 	"os"
+	"path/filepath"
+	"runtime"
 	"strconv"
 	"sync"
 
+	securejoin "github.com/cyphar/filepath-securejoin"
+	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
 )
 
@@ -57,7 +61,10 @@ func CloseExecFrom(minFd int) error {
 		return os.NewSyscallError("close_range", err)
 	}
 
-	fdDir, err := os.Open("/proc/self/fd")
+	procSelfFd, closer := ProcThreadSelf("fd")
+	defer closer()
+
+	fdDir, err := os.Open(procSelfFd)
 	if err != nil {
 		return err
 	}
@@ -98,3 +105,100 @@ func NewSockPair(name string) (parent, child *os.File, err error) {
 	}
 	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
 }
+
+// WithProcfd runs the passed closure with a procfd path (/proc/self/fd/...)
+// corresponding to the unsafePath resolved within the root. Before passing the
+// fd, this path is verified to have been inside the root -- so operating on it
+// through the passed fdpath should be safe. Do not access this path through
+// the original path strings, and do not attempt to use the pathname outside of
+// the passed closure (the file handle will be freed once the closure returns).
+func WithProcfd(root, unsafePath string, fn func(procfd string) error) error {
+	// Remove the root then forcefully resolve inside the root.
+	unsafePath = stripRoot(root, unsafePath)
+	path, err := securejoin.SecureJoin(root, unsafePath)
+	if err != nil {
+		return fmt.Errorf("resolving path inside rootfs failed: %w", err)
+	}
+
+	procSelfFd, closer := ProcThreadSelf("fd/")
+	defer closer()
+
+	// Open the target path.
+	fh, err := os.OpenFile(path, unix.O_PATH|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return fmt.Errorf("open o_path procfd: %w", err)
+	}
+	defer fh.Close()
+
+	procfd := filepath.Join(procSelfFd, strconv.Itoa(int(fh.Fd())))
+	// Double-check the path is the one we expected.
+	if realpath, err := os.Readlink(procfd); err != nil {
+		return fmt.Errorf("procfd verification failed: %w", err)
+	} else if realpath != path {
+		return fmt.Errorf("possibly malicious path detected -- refusing to operate on %s", realpath)
+	}
+
+	return fn(procfd)
+}
+
+type ProcThreadSelfCloser func()
+
+var (
+	haveProcThreadSelf     bool
+	haveProcThreadSelfOnce sync.Once
+)
+
+// ProcThreadSelf returns a string that is equivalent to
+// /proc/thread-self/<subpath>, with a graceful fallback on older kernels where
+// /proc/thread-self doesn't exist. This method DOES NOT use SecureJoin,
+// meaning that the passed string needs to be trusted. The caller _must_ call
+// the returned procThreadSelfCloser function (which is runtime.UnlockOSThread)
+// *only once* after it has finished using the returned path string.
+func ProcThreadSelf(subpath string) (string, ProcThreadSelfCloser) {
+	haveProcThreadSelfOnce.Do(func() {
+		if _, err := os.Stat("/proc/thread-self/"); err == nil {
+			haveProcThreadSelf = true
+		} else {
+			logrus.Debugf("cannot stat /proc/thread-self (%v), falling back to /proc/self/task/<tid>", err)
+		}
+	})
+
+	// We need to lock our thread until the caller is done with the path string
+	// because any non-atomic operation on the path (such as opening a file,
+	// then reading it) could be interrupted by the Go runtime where the
+	// underlying thread is swapped out and the original thread is killed,
+	// resulting in pull-your-hair-out-hard-to-debug issues in the caller. In
+	// addition, the pre-3.17 fallback makes everything non-atomic because the
+	// same thing could happen between unix.Gettid() and the path operations.
+	//
+	// In theory, we don't need to lock in the atomic user case when using
+	// /proc/thread-self/, but it's better to be safe than sorry (and there are
+	// only one or two truly atomic users of /proc/thread-self/).
+	runtime.LockOSThread()
+
+	threadSelf := "/proc/thread-self/"
+	if !haveProcThreadSelf {
+		// Pre-3.17 kernels did not have /proc/thread-self, so do it manually.
+		threadSelf = "/proc/self/task/" + strconv.Itoa(unix.Gettid()) + "/"
+		if _, err := os.Stat(threadSelf); err != nil {
+			// Unfortunately, this code is called from rootfs_linux.go where we
+			// are running inside the pid namespace of the container but /proc
+			// is the host's procfs. Unfortunately there is no real way to get
+			// the correct tid to use here (the kernel age means we cannot do
+			// things like set up a private fsopen("proc") -- even scanning
+			// NSpid in all of the tasks in /proc/self/task/*/status requires
+			// Linux 4.1).
+			//
+			// So, we just have to assume that /proc/self is acceptable in this
+			// one specific case.
+			if os.Getpid() == 1 {
+				logrus.Debugf("/proc/thread-self (tid=%d) cannot be emulated inside the initial container setup -- using /proc/self instead: %v", unix.Gettid(), err)
+			} else {
+				// This should never happen, but the fallback should work in most cases...
+				logrus.Warnf("/proc/thread-self could not be emulated for pid=%d (tid=%d) -- using more buggy /proc/self fallback instead: %v", os.Getpid(), unix.Gettid(), err)
+			}
+			threadSelf = "/proc/self/"
+		}
+	}
+	return threadSelf + subpath, runtime.UnlockOSThread
+}
diff --git a/utils_linux.go b/utils_linux.go
index 5de00628493..e7b362cdb2e 100644
--- a/utils_linux.go
+++ b/utils_linux.go
@@ -224,8 +224,10 @@ func (r *runner) run(config *specs.Process) (int, error) {
 		process.ExtraFiles = append(process.ExtraFiles, r.listenFDs...)
 	}
 	baseFd := 3 + len(process.ExtraFiles)
+	procSelfFd, closer := utils.ProcThreadSelf("fd/")
+	defer closer()
 	for i := baseFd; i < baseFd+r.preserveFDs; i++ {
-		_, err = os.Stat("/proc/self/fd/" + strconv.Itoa(i))
+		_, err = os.Stat(filepath.Join(procSelfFd, strconv.Itoa(i)))
 		if err != nil {
 			return -1, fmt.Errorf("unable to stat preserved-fd %d (of %d): %w", i-baseFd, r.preserveFDs, err)
 		}

From cdff09ab875159d004035990c0d45e8bdf20ed35 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 7 Dec 2023 16:23:24 +1100
Subject: [PATCH 5/8] rootfs: fix 'can we mount on top of /proc' check

Our previous test for whether we can mount on top of /proc incorrectly
assumed that it would only be called with bind-mount sources. This meant
that having a non bind-mount entry for a pseudo-filesystem (like
overlayfs) with a dummy source set to /proc on the host would let you
bypass the check, which could easily lead to security issues.

In addition, the check should be applied more uniformly to all mount
types, so fix that as well. And add some tests for some of the tricky
cases to make sure we protect against them properly.

Fixes: 331692baa7af ("Only allow proc mount if it is procfs")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/criu_linux.go        | 26 ++++++-----
 libcontainer/rootfs_linux.go      | 71 +++++++++++++++++--------------
 libcontainer/rootfs_linux_test.go | 39 +++++++++++++++--
 3 files changed, 92 insertions(+), 44 deletions(-)

diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go
index edcd305f20a..0567152403f 100644
--- a/libcontainer/criu_linux.go
+++ b/libcontainer/criu_linux.go
@@ -527,6 +527,14 @@ func (c *Container) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts) {
 // rootfs_linux.go.
 func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error {
 	me := mountEntry{Mount: m}
+	dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
+	if err != nil {
+		return err
+	}
+	// TODO: pass srcFD? Not sure if criu is impacted by issue #2484.
+	if err := checkProcMount(c.config.Rootfs, dest, me); err != nil {
+		return err
+	}
 	switch m.Device {
 	case "cgroup":
 		// No mount point(s) need to be created:
@@ -538,21 +546,19 @@ func (c *Container) makeCriuRestoreMountpoints(m *configs.Mount) error {
 		//   the mountpoint appears as soon as /sys is mounted
 		return nil
 	case "bind":
-		// The prepareBindMount() function checks if source
-		// exists. So it cannot be used for other filesystem types.
-		// TODO: pass srcFD? Not sure if criu is impacted by issue #2484.
-		if err := prepareBindMount(me, c.config.Rootfs); err != nil {
-			return err
-		}
-	default:
-		// for all other filesystems just create the mountpoints
-		dest, err := securejoin.SecureJoin(c.config.Rootfs, m.Destination)
+		// For bind-mounts (unlike other filesystem types), we need to check if
+		// the source exists.
+		fi, _, err := me.srcStat()
 		if err != nil {
+			// error out if the source of a bind mount does not exist as we
+			// will be unable to bind anything to it.
 			return err
 		}
-		if err := checkProcMount(c.config.Rootfs, dest, me); err != nil {
+		if err := createIfNotExists(dest, fi.IsDir()); err != nil {
 			return err
 		}
+	default:
+		// for all other filesystems just create the mountpoints
 		if err := os.MkdirAll(dest, 0o755); err != nil {
 			return err
 		}
diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go
index cc986bd5100..89faea085a5 100644
--- a/libcontainer/rootfs_linux.go
+++ b/libcontainer/rootfs_linux.go
@@ -281,27 +281,6 @@ func cleanupTmp(tmpdir string) {
 	_ = os.RemoveAll(tmpdir)
 }
 
-func prepareBindMount(m mountEntry, rootfs string) error {
-	fi, _, err := m.srcStat()
-	if err != nil {
-		// error out if the source of a bind mount does not exist as we will be
-		// unable to bind anything to it.
-		return err
-	}
-	// ensure that the destination of the bind mount is resolved of symlinks at mount time because
-	// any previous mounts can invalidate the next mount's destination.
-	// this can happen when a user specifies mounts within other mounts to cause breakouts or other
-	// evil stuff to try to escape the container's rootfs.
-	var dest string
-	if dest, err = securejoin.SecureJoin(rootfs, m.Destination); err != nil {
-		return err
-	}
-	if err := checkProcMount(rootfs, dest, m); err != nil {
-		return err
-	}
-	return createIfNotExists(dest, fi.IsDir())
-}
-
 func mountCgroupV1(m *configs.Mount, c *mountConfig) error {
 	binds, err := getCgroupMounts(m)
 	if err != nil {
@@ -520,6 +499,9 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 			// Do not use securejoin as it resolves symlinks.
 			dest = filepath.Join(rootfs, dest)
 		}
+		if err := checkProcMount(rootfs, dest, m); err != nil {
+			return err
+		}
 		if fi, err := os.Lstat(dest); err != nil {
 			if !os.IsNotExist(err) {
 				return err
@@ -539,6 +521,9 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 	if err != nil {
 		return err
 	}
+	if err := checkProcMount(rootfs, dest, m); err != nil {
+		return err
+	}
 
 	switch m.Device {
 	case "mqueue":
@@ -570,7 +555,13 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 
 		return err
 	case "bind":
-		if err := prepareBindMount(m, rootfs); err != nil {
+		fi, _, err := m.srcStat()
+		if err != nil {
+			// error out if the source of a bind mount does not exist as we will be
+			// unable to bind anything to it.
+			return err
+		}
+		if err := createIfNotExists(dest, fi.IsDir()); err != nil {
 			return err
 		}
 		// open_tree()-related shenanigans are all handled in mountViaFds.
@@ -688,9 +679,6 @@ func mountToRootfs(c *mountConfig, m mountEntry) error {
 		}
 		return mountCgroupV1(m.Mount, c)
 	default:
-		if err := checkProcMount(rootfs, dest, m); err != nil {
-			return err
-		}
 		if err := os.MkdirAll(dest, 0o755); err != nil {
 			return err
 		}
@@ -735,6 +723,11 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) {
 	return binds, nil
 }
 
+// Taken from <include/linux/proc_ns.h>. If a file is on a filesystem of type
+// PROC_SUPER_MAGIC, we're guaranteed that only the root of the superblock will
+// have this inode number.
+const procRootIno = 1
+
 // checkProcMount checks to ensure that the mount destination is not over the top of /proc.
 // dest is required to be an abs path and have any symlinks resolved before calling this function.
 //
@@ -750,12 +743,28 @@ func checkProcMount(rootfs, dest string, m mountEntry) error {
 		return nil
 	}
 	if path == "." {
-		// only allow a mount on-top of proc if it's source is "proc"
-		st, err := m.srcStatfs()
-		if err != nil {
-			return err
-		}
-		if st.Type == unix.PROC_SUPER_MAGIC {
+		// Only allow bind-mounts on top of /proc, and only if the source is a
+		// procfs mount.
+		if m.IsBind() {
+			fsSt, err := m.srcStatfs()
+			if err != nil {
+				return err
+			}
+			if fsSt.Type == unix.PROC_SUPER_MAGIC {
+				if _, uSt, err := m.srcStat(); err != nil {
+					return err
+				} else if uSt.Ino != procRootIno {
+					// We cannot error out in this case, because we've
+					// supported these kinds of mounts for a long time.
+					// However, we would expect users to bind-mount the root of
+					// a real procfs on top of /proc in the container. We might
+					// want to block this in the future.
+					logrus.Warnf("bind-mount %v (source %v) is of type procfs but is not the root of a procfs (inode %d). Future versions of runc might block this configuration -- please report an issue to <https://github.com/opencontainers/runc> if you see this warning.", dest, m.srcName(), uSt.Ino)
+				}
+				return nil
+			}
+		} else if m.Device == "proc" {
+			// Fresh procfs-type mounts are always safe to mount on top of /proc.
 			return nil
 		}
 		return fmt.Errorf("%q cannot be mounted because it is not of type proc", dest)
diff --git a/libcontainer/rootfs_linux_test.go b/libcontainer/rootfs_linux_test.go
index 796ef7f684d..6ce099d256d 100644
--- a/libcontainer/rootfs_linux_test.go
+++ b/libcontainer/rootfs_linux_test.go
@@ -35,8 +35,7 @@ func TestCheckProcMountOnProc(t *testing.T) {
 	dest := "/rootfs/proc/"
 	err := checkProcMount("/rootfs", dest, m)
 	if err != nil {
-		// TODO: This test failure is fixed in a later commit in this series.
-		t.Logf("procfs type mount on /proc should not return an error: %v", err)
+		t.Fatalf("procfs type mount on /proc should not return an error: %v", err)
 	}
 }
 
@@ -52,7 +51,41 @@ func TestCheckBindMountOnProc(t *testing.T) {
 	dest := "/rootfs/proc/"
 	err := checkProcMount("/rootfs", dest, m)
 	if err != nil {
-		t.Fatalf("bind-mount of procfs on top of /proc should not return an error: %v", err)
+		t.Fatalf("bind-mount of procfs on top of /proc should not return an error (for now): %v", err)
+	}
+}
+
+func TestCheckTrickyMountOnProc(t *testing.T) {
+	// Make a non-bind mount that looks like a bit like a bind-mount.
+	m := mountEntry{
+		Mount: &configs.Mount{
+			Destination: "/proc",
+			Source:      "/proc",
+			Device:      "overlay",
+			Data:        "lowerdir=/tmp/fakeproc,upperdir=/tmp/fakeproc2,workdir=/tmp/work",
+		},
+	}
+	dest := "/rootfs/proc/"
+	err := checkProcMount("/rootfs", dest, m)
+	if err == nil {
+		t.Fatalf("dodgy overlayfs mount on top of /proc should return an error")
+	}
+}
+
+func TestCheckTrickyBindMountOnProc(t *testing.T) {
+	// Make a bind mount that looks like it might be a procfs mount.
+	m := mountEntry{
+		Mount: &configs.Mount{
+			Destination: "/proc",
+			Source:      "/sys",
+			Device:      "proc",
+			Flags:       unix.MS_BIND,
+		},
+	}
+	dest := "/rootfs/proc/"
+	err := checkProcMount("/rootfs", dest, m)
+	if err == nil {
+		t.Fatalf("dodgy bind-mount on top of /proc should return an error")
 	}
 }
 

From 7795ca46684518143af3df8d8eb69457753db932 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 16 Nov 2023 17:27:27 +1100
Subject: [PATCH 6/8] specconv: handle recursive attribute clearing more
 consistently

If a user specifies a configuration like "rro, rrw", we should have
similar behaviour to "ro, rw" where we clear the previous flags so that
the last specified flag takes precendence.

Fixes: 382eba4354d7 ("Support recursive mount attrs ("rro", "rnosuid", "rnodev", ...)")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/specconv/spec_linux.go | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
index 90275043eca..5dc6ec79f32 100644
--- a/libcontainer/specconv/spec_linux.go
+++ b/libcontainer/specconv/spec_linux.go
@@ -1036,8 +1036,10 @@ func parseMountOptions(options []string) *configs.Mount {
 		} else if f, exists := recAttrFlags[o]; exists {
 			if f.clear {
 				recAttrClr |= f.flag
+				recAttrSet &= ^f.flag
 			} else {
 				recAttrSet |= f.flag
+				recAttrClr &= ^f.flag
 				if f.flag&unix.MOUNT_ATTR__ATIME == f.flag {
 					// https://man7.org/linux/man-pages/man2/mount_setattr.2.html
 					// "cannot simply specify the access-time setting in attr_set, but must also include MOUNT_ATTR__ATIME in the attr_clr field."

From 3b57e45cbfae2a4f72ede6816b5a84ecd47028ce Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Thu, 16 Nov 2023 17:28:53 +1100
Subject: [PATCH 7/8] mount: add support for ridmap and idmap

ridmap indicates that the id mapping should be applied recursively (only
really relevant for rbind mount entries), and idmap indicates that it
should not be applied recursively (the default). If no mappings are
specified for the mount, we use the userns configuration of the
container. This matches the behaviour in the currently-unreleased
runtime-spec.

This includes a minor change to the state.json serialisation format, but
because there has been no released version of runc with commit
fbf183c6f8c4 ("Add uid and gid mappings to mounts"), we can safely make
this change without affecting running containers. Doing it this way
makes it much easier to handle m.IsIDMapped() and indicating that a
mapping has been specified.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/configs/mount.go                 |   2 +-
 libcontainer/configs/mount_linux.go           |  34 +-
 libcontainer/configs/validate/validator.go    |  17 +
 .../configs/validate/validator_test.go        | 165 ++++++--
 libcontainer/mount_linux.go                   |  30 +-
 libcontainer/specconv/spec_linux.go           |  41 +-
 tests/integration/idmap.bats                  | 351 +++++++++++++++++-
 7 files changed, 588 insertions(+), 52 deletions(-)

diff --git a/libcontainer/configs/mount.go b/libcontainer/configs/mount.go
index e39440fc08d..bfd356e497f 100644
--- a/libcontainer/configs/mount.go
+++ b/libcontainer/configs/mount.go
@@ -3,5 +3,5 @@ package configs
 const (
 	// EXT_COPYUP is a directive to copy up the contents of a directory when
 	// a tmpfs is mounted over it.
-	EXT_COPYUP = 1 << iota //nolint:golint // ignore "don't use ALL_CAPS" warning
+	EXT_COPYUP = 1 << iota //nolint:golint,revive // ignore "don't use ALL_CAPS" warning
 )
diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go
index 3f489295d97..b69e9ab238e 100644
--- a/libcontainer/configs/mount_linux.go
+++ b/libcontainer/configs/mount_linux.go
@@ -2,6 +2,24 @@ package configs
 
 import "golang.org/x/sys/unix"
 
+type MountIDMapping struct {
+	// Recursive indicates if the mapping needs to be recursive.
+	Recursive bool `json:"recursive"`
+
+	// UserNSPath is a path to a user namespace that indicates the necessary
+	// id-mappings for MOUNT_ATTR_IDMAP. If set to non-"", UIDMappings and
+	// GIDMappings must be set to nil.
+	UserNSPath string `json:"userns_path,omitempty"`
+
+	// UIDMappings is the uid mapping set for this mount, to be used with
+	// MOUNT_ATTR_IDMAP.
+	UIDMappings []IDMap `json:"uid_mappings,omitempty"`
+
+	// GIDMappings is the gid mapping set for this mount, to be used with
+	// MOUNT_ATTR_IDMAP.
+	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
+}
+
 type Mount struct {
 	// Source path for the mount.
 	Source string `json:"source"`
@@ -34,17 +52,9 @@ type Mount struct {
 	// Extensions are additional flags that are specific to runc.
 	Extensions int `json:"extensions"`
 
-	// UIDMappings is used to changing file user owners w/o calling chown.
-	// Note that, the underlying filesystem should support this feature to be
-	// used.
-	// Every mount point could have its own mapping.
-	UIDMappings []IDMap `json:"uid_mappings,omitempty"`
-
-	// GIDMappings is used to changing file group owners w/o calling chown.
-	// Note that, the underlying filesystem should support this feature to be
-	// used.
-	// Every mount point could have its own mapping.
-	GIDMappings []IDMap `json:"gid_mappings,omitempty"`
+	// Mapping is the MOUNT_ATTR_IDMAP configuration for the mount. If non-nil,
+	// the mount is configured to use MOUNT_ATTR_IDMAP-style id mappings.
+	IDMapping *MountIDMapping `json:"id_mapping,omitempty"`
 }
 
 func (m *Mount) IsBind() bool {
@@ -52,5 +62,5 @@ func (m *Mount) IsBind() bool {
 }
 
 func (m *Mount) IsIDMapped() bool {
-	return len(m.UIDMappings) > 0 || len(m.GIDMappings) > 0
+	return m.IDMapping != nil
 }
diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
index 33f9cc3f5fe..4708e1b773f 100644
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@@ -309,6 +309,13 @@ func checkBindOptions(m *configs.Mount) error {
 }
 
 func checkIDMapMounts(config *configs.Config, m *configs.Mount) error {
+	// Make sure MOUNT_ATTR_IDMAP is not set on any of our mounts. This
+	// attribute is handled differently to all other attributes (through
+	// m.IDMapping), so make sure we never store it in the actual config. This
+	// really shouldn't ever happen.
+	if m.RecAttr != nil && (m.RecAttr.Attr_set|m.RecAttr.Attr_clr)&unix.MOUNT_ATTR_IDMAP != 0 {
+		return errors.New("mount configuration cannot contain recAttr for MOUNT_ATTR_IDMAP")
+	}
 	if !m.IsIDMapped() {
 		return nil
 	}
@@ -318,6 +325,16 @@ func checkIDMapMounts(config *configs.Config, m *configs.Mount) error {
 	if config.RootlessEUID {
 		return errors.New("id-mapped mounts are not supported for rootless containers")
 	}
+	if m.IDMapping.UserNSPath == "" {
+		if len(m.IDMapping.UIDMappings) == 0 || len(m.IDMapping.GIDMappings) == 0 {
+			return errors.New("id-mapped mounts must have both uid and gid mappings specified")
+		}
+	} else {
+		if m.IDMapping.UIDMappings != nil || m.IDMapping.GIDMappings != nil {
+			// should never happen
+			return errors.New("[internal error] id-mapped mounts cannot have both userns_path and uid and gid mappings specified")
+		}
+	}
 	return nil
 }
 
diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go
index 9d3f50c7c70..5c12e08a1db 100644
--- a/libcontainer/configs/validate/validator_test.go
+++ b/libcontainer/configs/validate/validator_test.go
@@ -504,17 +504,82 @@ func TestValidateIDMapMounts(t *testing.T) {
 		config *configs.Config
 	}{
 		{
-			name:  "idmap mount without bind opt specified",
+			name:  "idmap non-bind mount",
 			isErr: true,
 			config: &configs.Config{
 				UIDMappings: mapping,
 				GIDMappings: mapping,
+				Mounts: []*configs.Mount{
+					{
+						Source:      "/dev/sda1",
+						Destination: "/abs/path/",
+						Device:      "ext4",
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+							GIDMappings: mapping,
+						},
+					},
+				},
+			},
+		},
+		{
+			name:  "idmap option non-bind mount",
+			isErr: true,
+			config: &configs.Config{
+				Mounts: []*configs.Mount{
+					{
+						Source:      "/dev/sda1",
+						Destination: "/abs/path/",
+						Device:      "ext4",
+						IDMapping:   &configs.MountIDMapping{},
+					},
+				},
+			},
+		},
+		{
+			name:  "ridmap option non-bind mount",
+			isErr: true,
+			config: &configs.Config{
+				Mounts: []*configs.Mount{
+					{
+						Source:      "/dev/sda1",
+						Destination: "/abs/path/",
+						Device:      "ext4",
+						IDMapping: &configs.MountIDMapping{
+							Recursive: true,
+						},
+					},
+				},
+			},
+		},
+		{
+			name:  "idmap mount no uid mapping",
+			isErr: true,
+			config: &configs.Config{
+				Mounts: []*configs.Mount{
+					{
+						Source:      "/abs/path/",
+						Destination: "/abs/path/",
+						Flags:       unix.MS_BIND,
+						IDMapping: &configs.MountIDMapping{
+							GIDMappings: mapping,
+						},
+					},
+				},
+			},
+		},
+		{
+			name:  "idmap mount no gid mapping",
+			isErr: true,
+			config: &configs.Config{
 				Mounts: []*configs.Mount{
 					{
 						Source:      "/abs/path/",
 						Destination: "/abs/path/",
-						UIDMappings: mapping,
-						GIDMappings: mapping,
+						Flags:       unix.MS_BIND,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+						},
 					},
 				},
 			},
@@ -531,8 +596,10 @@ func TestValidateIDMapMounts(t *testing.T) {
 						Source:      "/abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
-						GIDMappings: mapping,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+							GIDMappings: mapping,
+						},
 					},
 				},
 			},
@@ -547,8 +614,10 @@ func TestValidateIDMapMounts(t *testing.T) {
 						Source:      "./rel/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
-						GIDMappings: mapping,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+							GIDMappings: mapping,
+						},
 					},
 				},
 			},
@@ -563,8 +632,10 @@ func TestValidateIDMapMounts(t *testing.T) {
 						Source:      "/abs/path/",
 						Destination: "./rel/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
-						GIDMappings: mapping,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+							GIDMappings: mapping,
+						},
 					},
 				},
 			},
@@ -579,8 +650,10 @@ func TestValidateIDMapMounts(t *testing.T) {
 						Source:      "/another-abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
-						GIDMappings: mapping,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+							GIDMappings: mapping,
+						},
 					},
 				},
 			},
@@ -595,8 +668,10 @@ func TestValidateIDMapMounts(t *testing.T) {
 						Source:      "/another-abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND | unix.MS_RDONLY,
-						UIDMappings: mapping,
-						GIDMappings: mapping,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+							GIDMappings: mapping,
+						},
 					},
 				},
 			},
@@ -609,8 +684,10 @@ func TestValidateIDMapMounts(t *testing.T) {
 						Source:      "/abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
-						GIDMappings: mapping,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+							GIDMappings: mapping,
+						},
 					},
 				},
 			},
@@ -625,14 +702,16 @@ func TestValidateIDMapMounts(t *testing.T) {
 						Source:      "/abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: []configs.IDMap{
-							{
-								ContainerID: 10,
-								HostID:      10,
-								Size:        1,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: []configs.IDMap{
+								{
+									ContainerID: 10,
+									HostID:      10,
+									Size:        1,
+								},
 							},
+							GIDMappings: mapping,
 						},
-						GIDMappings: mapping,
 					},
 				},
 			},
@@ -647,18 +726,50 @@ func TestValidateIDMapMounts(t *testing.T) {
 						Source:      "/abs/path/",
 						Destination: "/abs/path/",
 						Flags:       unix.MS_BIND,
-						UIDMappings: mapping,
-						GIDMappings: []configs.IDMap{
-							{
-								ContainerID: 10,
-								HostID:      10,
-								Size:        1,
+						IDMapping: &configs.MountIDMapping{
+							UIDMappings: mapping,
+							GIDMappings: []configs.IDMap{
+								{
+									ContainerID: 10,
+									HostID:      10,
+									Size:        1,
+								},
 							},
 						},
 					},
 				},
 			},
 		},
+		{
+			name:  "mount with 'idmap' option but no mappings",
+			isErr: true,
+			config: &configs.Config{
+				Mounts: []*configs.Mount{
+					{
+						Source:      "/abs/path/",
+						Destination: "/abs/path/",
+						Flags:       unix.MS_BIND,
+						IDMapping:   &configs.MountIDMapping{},
+					},
+				},
+			},
+		},
+		{
+			name:  "mount with 'ridmap' option but no mappings",
+			isErr: true,
+			config: &configs.Config{
+				Mounts: []*configs.Mount{
+					{
+						Source:      "/abs/path/",
+						Destination: "/abs/path/",
+						Flags:       unix.MS_BIND,
+						IDMapping: &configs.MountIDMapping{
+							Recursive: true,
+						},
+					},
+				},
+			},
+		},
 	}
 
 	for _, tc := range testCases {
diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go
index 73cd04c086d..6b5edbb0c14 100644
--- a/libcontainer/mount_linux.go
+++ b/libcontainer/mount_linux.go
@@ -229,16 +229,32 @@ func mountFd(nsHandles *userns.Handles, m *configs.Mount) (*mountSource, error)
 		sourceType = mountSourceOpenTree
 
 		// Configure the id mapping.
-		usernsFile, err := nsHandles.Get(userns.Mapping{
-			UIDMappings: m.UIDMappings,
-			GIDMappings: m.GIDMappings,
-		})
-		if err != nil {
-			return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
+		var usernsFile *os.File
+		if m.IDMapping.UserNSPath == "" {
+			usernsFile, err = nsHandles.Get(userns.Mapping{
+				UIDMappings: m.IDMapping.UIDMappings,
+				GIDMappings: m.IDMapping.GIDMappings,
+			})
+			if err != nil {
+				return nil, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err)
+			}
+		} else {
+			usernsFile, err = os.Open(m.IDMapping.UserNSPath)
+			if err != nil {
+				return nil, fmt.Errorf("failed to open existing userns for %s id-mapping: %w", m.Source, err)
+			}
 		}
 		defer usernsFile.Close()
 
-		if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{
+		setAttrFlags := uint(unix.AT_EMPTY_PATH)
+		// If the mount has "ridmap" set, we apply the configuration
+		// recursively. This allows you to create "rbind" mounts where only
+		// the top-level mount has an idmapping. I'm not sure why you'd
+		// want that, but still...
+		if m.IDMapping.Recursive {
+			setAttrFlags |= unix.AT_RECURSIVE
+		}
+		if err := unix.MountSetattr(int(mountFile.Fd()), "", setAttrFlags, &unix.MountAttr{
 			Attr_set:  unix.MOUNT_ATTR_IDMAP,
 			Userns_fd: uint64(usernsFile.Fd()),
 		}); err != nil {
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
index 5dc6ec79f32..6f9f58ad1d0 100644
--- a/libcontainer/specconv/spec_linux.go
+++ b/libcontainer/specconv/spec_linux.go
@@ -38,6 +38,7 @@ var (
 		clear bool
 		flag  int
 	}
+	complexFlags map[string]func(*configs.Mount)
 )
 
 func initMaps() {
@@ -126,7 +127,6 @@ func initMaps() {
 			"rnostrictatime": {true, unix.MOUNT_ATTR_STRICTATIME},
 			"rnosymfollow":   {false, unix.MOUNT_ATTR_NOSYMFOLLOW}, // since kernel 5.14
 			"rsymfollow":     {true, unix.MOUNT_ATTR_NOSYMFOLLOW},  // since kernel 5.14
-			// No support for MOUNT_ATTR_IDMAP yet (needs UserNS FD)
 		}
 
 		extensionFlags = map[string]struct {
@@ -135,6 +135,17 @@ func initMaps() {
 		}{
 			"tmpcopyup": {false, configs.EXT_COPYUP},
 		}
+
+		complexFlags = map[string]func(*configs.Mount){
+			"idmap": func(m *configs.Mount) {
+				m.IDMapping = new(configs.MountIDMapping)
+				m.IDMapping.Recursive = false // noop
+			},
+			"ridmap": func(m *configs.Mount) {
+				m.IDMapping = new(configs.MountIDMapping)
+				m.IDMapping.Recursive = true
+			},
+		}
 	})
 }
 
@@ -415,6 +426,19 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
 			if err := setupUserNamespace(spec, config); err != nil {
 				return nil, err
 			}
+			// For idmap and ridmap mounts without explicit mappings, use the
+			// ones from the container's userns. If we are joining another
+			// userns, stash the path.
+			for _, m := range config.Mounts {
+				if m.IDMapping != nil && m.IDMapping.UIDMappings == nil && m.IDMapping.GIDMappings == nil {
+					if path := config.Namespaces.PathOf(configs.NEWUSER); path != "" {
+						m.IDMapping.UserNSPath = path
+					} else {
+						m.IDMapping.UIDMappings = config.UIDMappings
+						m.IDMapping.GIDMappings = config.GIDMappings
+					}
+				}
+			}
 		}
 		config.MaskPaths = spec.Linux.MaskedPaths
 		config.ReadonlyPaths = spec.Linux.ReadonlyPaths
@@ -447,6 +471,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
 				Domain: domain,
 			}
 		}
+
 	}
 
 	// Set the host UID that should own the container's cgroup.
@@ -552,8 +577,14 @@ func createLibcontainerMount(cwd string, m specs.Mount) (*configs.Mount, error)
 		}
 	}
 
-	mnt.UIDMappings = toConfigIDMap(m.UIDMappings)
-	mnt.GIDMappings = toConfigIDMap(m.GIDMappings)
+	if m.UIDMappings != nil || m.GIDMappings != nil {
+		if mnt.IDMapping == nil {
+			// Neither "idmap" nor "ridmap" were specified.
+			mnt.IDMapping = new(configs.MountIDMapping)
+		}
+		mnt.IDMapping.UIDMappings = toConfigIDMap(m.UIDMappings)
+		mnt.IDMapping.GIDMappings = toConfigIDMap(m.GIDMappings)
+	}
 
 	// None of the mount arguments can contain a null byte. Normally such
 	// strings would either cause some other failure or would just be truncated
@@ -1046,12 +1077,14 @@ func parseMountOptions(options []string) *configs.Mount {
 					recAttrClr |= unix.MOUNT_ATTR__ATIME
 				}
 			}
-		} else if f, exists := extensionFlags[o]; exists && f.flag != 0 {
+		} else if f, exists := extensionFlags[o]; exists {
 			if f.clear {
 				m.Extensions &= ^f.flag
 			} else {
 				m.Extensions |= f.flag
 			}
+		} else if fn, exists := complexFlags[o]; exists {
+			fn(&m)
 		} else {
 			data = append(data, o)
 		}
diff --git a/tests/integration/idmap.bats b/tests/integration/idmap.bats
index 89fe4d82356..a816fe96863 100644
--- a/tests/integration/idmap.bats
+++ b/tests/integration/idmap.bats
@@ -5,7 +5,6 @@ load helpers
 function setup() {
 	OVERFLOW_UID="$(cat /proc/sys/kernel/overflowuid)"
 	OVERFLOW_GID="$(cat /proc/sys/kernel/overflowgid)"
-
 	requires root
 	requires_kernel 5.12
 
@@ -37,12 +36,34 @@ function setup() {
 
 	# Add a symlink-containing source.
 	ln -s source-multi1 source-multi1-symlink
+
+	# Add some top-level files in the mount tree.
+	mkdir -p mnt-subtree/multi{1,2}
+	touch mnt-subtree/{foo,bar,baz}.txt
+	chown 100:211 mnt-subtree/foo.txt
+	chown 200:222 mnt-subtree/bar.txt
+	chown 300:233 mnt-subtree/baz.txt
+
+	mounts_file="$PWD/.all-mounts"
+	echo -n >"$mounts_file"
 }
 
 function teardown() {
+	if [ -v mounts_file ]; then
+		xargs -n 1 -a "$mounts_file" -- umount -l
+		rm -f "$mounts_file"
+	fi
 	teardown_bundle
 }
 
+function setup_host_bind_mount() {
+	src="$1"
+	dst="$2"
+
+	mount --bind "$src" "$dst"
+	echo "$dst" >>"$mounts_file"
+}
+
 function setup_idmap_userns() {
 	update_config '.linux.namespaces += [{"type": "user"}]
 		| .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]
@@ -341,3 +362,331 @@ function setup_idmap_basic_mount() {
 	[[ "$output" == *"=/tmp/mount-multi1/bar.txt:2000=2202="* ]]
 	[[ "$output" == *"=/tmp/mount-multi1/baz.txt:3000=3303="* ]]
 }
+
+@test "idmap mount (non-recursive idmap) [userns]" {
+	setup_idmap_userns
+
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 3},
+					{"containerID": 200, "hostID": 102000, "size": 3},
+					{"containerID": 300, "hostID": 103000, "size": 3}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:3000=3303="* ]]
+	# Because we used "idmap", the child mounts were not remapped recursively.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+}
+
+@test "idmap mount (non-recursive idmap) [no userns]" {
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 3},
+					{"containerID": 200, "hostID": 102000, "size": 3},
+					{"containerID": 300, "hostID": 103000, "size": 3}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:101000=101101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:102000=102202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:103000=103303="* ]]
+	# Because we used "idmap", the child mounts were not remapped recursively.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:100=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:101=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:102=233="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:200=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:201=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:202=233="* ]]
+}
+
+@test "idmap mount (idmap flag) [userns]" {
+	setup_idmap_userns
+
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind", "idmap"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 3},
+					{"containerID": 200, "hostID": 102000, "size": 3},
+					{"containerID": 300, "hostID": 103000, "size": 3}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:3000=3303="* ]]
+	# Because we used "idmap", the child mounts were not remapped recursively.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+}
+
+@test "idmap mount (idmap flag) [no userns]" {
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind", "idmap"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 3},
+					{"containerID": 200, "hostID": 102000, "size": 3},
+					{"containerID": 300, "hostID": 103000, "size": 3}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:101000=101101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:102000=102202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:103000=103303="* ]]
+	# Because we used "idmap", the child mounts were not remapped recursively.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:100=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:101=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:102=233="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:200=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:201=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:202=233="* ]]
+}
+
+@test "idmap mount (ridmap flag) [userns]" {
+	setup_idmap_userns
+
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind", "ridmap"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 3},
+					{"containerID": 200, "hostID": 102000, "size": 3},
+					{"containerID": 300, "hostID": 103000, "size": 3}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:2000=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:3000=3303="* ]]
+	# The child mounts have the same mapping applied.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:1000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:1001=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:1002=3303="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:2000=1101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:2001=2202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:2002=3303="* ]]
+}
+
+@test "idmap mount (ridmap flag) [no userns]" {
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind", "ridmap"],
+				"uidMappings": [
+					{"containerID": 100, "hostID": 101000, "size": 3},
+					{"containerID": 200, "hostID": 102000, "size": 3},
+					{"containerID": 300, "hostID": 103000, "size": 3}
+				],
+				"gidMappings": [
+					{"containerID": 210, "hostID": 101100, "size": 10},
+					{"containerID": 220, "hostID": 102200, "size": 10},
+					{"containerID": 230, "hostID": 103300, "size": 10}
+				]
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:101000=101101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:102000=102202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:103000=103303="* ]]
+	# The child mounts have the same mapping applied.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:101000=101101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:101001=102202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:101002=103303="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:102000=101101="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:102001=102202="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:102002=103303="* ]]
+}
+
+@test "idmap mount (idmap flag, implied mapping) [userns]" {
+	setup_idmap_userns
+
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind", "idmap"],
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:100=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:200=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:300=233="* ]]
+	# Because we used "idmap", the child mounts were not remapped recursively.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+}
+
+@test "idmap mount (ridmap flag, implied mapping) [userns]" {
+	setup_idmap_userns
+
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind", "ridmap"],
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:100=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:200=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:300=233="* ]]
+	# The child mounts have the same mapping applied.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:100=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:101=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:102=233="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:200=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:201=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:202=233="* ]]
+}
+
+@test "idmap mount (idmap flag, implied mapping, userns join) [userns]" {
+	# Create a detached container with the id-mapping we want.
+	cp config.json config.json.bak
+	update_config '.linux.namespaces += [{"type": "user"}]
+		| .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]
+		| .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]'
+	update_config '.process.args = ["sleep", "infinity"]'
+
+	runc run -d --console-socket "$CONSOLE_SOCKET" target_userns
+	[ "$status" -eq 0 ]
+
+	# Configure our container to attach to the first container's userns.
+	target_pid="$(__runc state target_userns | jq .pid)"
+	update_config '.linux.namespaces |= map(if .type == "user" then (.path = "/proc/'"$target_pid"'/ns/" + .type) else . end)'
+	update_config 'del(.linux.uidMappings) | del(.linux.gidMappings)'
+
+	setup_host_bind_mount "source-multi1/" "mnt-subtree/multi1"
+	setup_host_bind_mount "source-multi2/" "mnt-subtree/multi2"
+
+	update_config '.mounts += [
+			{
+				"source": "mnt-subtree/",
+				"destination": "/tmp/mount-tree",
+				"options": ["rbind", "idmap"],
+			}
+		]'
+
+	update_config '.process.args = ["bash", "-c", "stat -c =%n:%u=%g= /tmp/mount-tree{,/multi1,/multi2}/{foo,bar,baz}.txt"]'
+	runc run test_debian
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"=/tmp/mount-tree/foo.txt:100=211="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/bar.txt:200=222="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/baz.txt:300=233="* ]]
+	# Because we used "idmap", the child mounts were not remapped recursively.
+	[[ "$output" == *"=/tmp/mount-tree/multi1/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi1/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/foo.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/bar.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+	[[ "$output" == *"=/tmp/mount-tree/multi2/baz.txt:$OVERFLOW_UID=$OVERFLOW_GID="* ]]
+}

From fa93c8b05b323187d0c5e3bdb3622c23c3abcfff Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Fri, 1 Dec 2023 17:08:38 +1100
Subject: [PATCH 8/8] tests: mounts: add some tests to check mount ordering

Our previous implementation of idmapped mounts and bind-mount sources
would open all of the source paths before we did any mounts, meaning
that mounts using sources from inside the container rootfs would not be
correct.

This has been fixed with the new on-demand system, and so add some
regression tests.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 tests/integration/mounts.bats | 119 ++++++++++++++++++++++++++++++++++
 1 file changed, 119 insertions(+)

diff --git a/tests/integration/mounts.bats b/tests/integration/mounts.bats
index a4d997ca765..5dafb655c92 100644
--- a/tests/integration/mounts.bats
+++ b/tests/integration/mounts.bats
@@ -24,6 +24,109 @@ function test_ro_cgroup_mount() {
 	for line in "${lines[@]}"; do [[ "${line}" == *'ro,'* ]]; done
 }
 
+# Parse an "optstring" of the form foo,bar into $is_foo and $is_bar variables.
+# Usage: parse_optstring foo,bar foo bar baz
+function parse_optstring() {
+	optstring="$1"
+	shift
+
+	for flag in "$@"; do
+		is_set=
+		if grep -wq "$flag" <<<"$optstring"; then
+			is_set=1
+		fi
+		eval "is_$flag=$is_set"
+	done
+}
+
+function config_add_bind_mount() {
+	src="$1"
+	dst="$2"
+	parse_optstring "${3:-}" rbind idmap
+
+	bindtype=bind
+	if [ -n "$is_rbind" ]; then
+		bindtype=rbind
+	fi
+
+	mappings=""
+	if [ -n "$is_idmap" ]; then
+		mappings='
+			"uidMappings": [{"containerID": 0, "hostID": 100000, "size": 65536}],
+			"gidMappings": [{"containerID": 0, "hostID": 100000, "size": 65536}],
+		'
+	fi
+
+	update_config '.mounts += [{
+		"source": "'"$src"'",
+		"destination": "'"$dst"'",
+		"type": "bind",
+		'"$mappings"'
+		"options": [ "'"$bindtype"'", "rprivate" ]
+	}]'
+}
+
+# This needs to be placed at the top of the bats file to work around
+# a shellcheck bug. See <https://github.com/koalaman/shellcheck/issues/2873>.
+function test_mount_order() {
+	parse_optstring "${1:-}" userns idmap
+
+	if [ -n "$is_userns" ]; then
+		requires root
+
+		update_config '.linux.namespaces += [{"type": "user"}]
+			| .linux.uidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]
+			| .linux.gidMappings += [{"containerID": 0, "hostID": 100000, "size": 65536}]'
+		remap_rootfs
+	fi
+
+	ctr_src_opts="rbind"
+	if [ -n "$is_idmap" ]; then
+		requires root
+		requires_kernel 5.12
+		requires_idmap_fs .
+
+		ctr_src_opts+=",idmap"
+	fi
+
+	mkdir -p rootfs/{mnt,final}
+	# Create a set of directories we can create a mount tree with.
+	for subdir in a/x b/y c/z; do
+		dir="bind-src/$subdir"
+		mkdir -p "$dir"
+		echo "$subdir" >"$dir/file"
+		# Add a symlink to make sure
+		topdir="$(dirname "$subdir")"
+		ln -s "$topdir" "bind-src/sym-$topdir"
+	done
+	# In userns tests, make sure that the source directory cannot be accessed,
+	# to make sure we're exercising the bind-mount source fd logic.
+	chmod o-rwx bind-src
+
+	rootfs="$(pwd)/rootfs"
+	rm -rf rootfs/mnt
+	mkdir rootfs/mnt
+
+	# Create a bind-mount tree.
+	config_add_bind_mount "$PWD/bind-src/a" "/mnt"
+	config_add_bind_mount "$PWD/bind-src/sym-b" "/mnt/x"
+	config_add_bind_mount "$PWD/bind-src/c" "/mnt/x/y"
+	config_add_bind_mount "$PWD/bind-src/sym-a" "/mnt/x/y/z"
+	# Create a recursive bind-mount that uses part of the current tree in the
+	# container.
+	config_add_bind_mount "$rootfs/mnt/x" "$rootfs/mnt/x/y/z/x" "$ctr_src_opts"
+	config_add_bind_mount "$rootfs/mnt/x/y" "$rootfs/mnt/x/y/z" "$ctr_src_opts"
+	# Finally, bind-mount the whole thing on top of /final.
+	config_add_bind_mount "$rootfs/mnt" "$rootfs/final" "$ctr_src_opts"
+
+	# Check that the entire tree was copied and the mounts were done in the
+	# expected order.
+	update_config '.process.args = ["cat", "/final/x/y/z/z/x/y/z/x/file"]'
+	runc run test_busybox
+	[ "$status" -eq 0 ]
+	[[ "$output" == *"a/x"* ]] # the final "file" was from a/x.
+}
+
 # https://github.com/opencontainers/runc/issues/3991
 @test "runc run [tmpcopyup]" {
 	mkdir -p rootfs/dir1/dir2
@@ -108,3 +211,19 @@ function test_ro_cgroup_mount() {
 	update_config '.linux.namespaces |= if index({"type": "cgroup"}) then . else . + [{"type": "cgroup"}] end'
 	test_ro_cgroup_mount
 }
+
+@test "runc run [mount order, container bind-mount source]" {
+	test_mount_order
+}
+
+@test "runc run [mount order, container bind-mount source] (userns)" {
+	test_mount_order userns
+}
+
+@test "runc run [mount order, container idmap source]" {
+	test_mount_order idmap
+}
+
+@test "runc run [mount order, container idmap source] (userns)" {
+	test_mount_order userns,idmap
+}