opencontainers · AkihiroSuda · Oct 20, 2024 · Oct 20, 2024 · Oct 16, 2024 · rata
diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go
@@ -212,6 +212,23 @@ func IsCloned(exe *os.File) bool {
 // make sure the container process can never resolve the original runc binary.
 // For more details on why this is necessary, see CVE-2019-5736.
 func CloneSelfExe(tmpDir string) (*os.File, error) {
+	// Try to create a temporary overlayfs to produce a readonly version of
+	// /proc/self/exe that cannot be "unwrapped" by the container. In contrast
+	// to CloneBinary, this technique does not require any extra memory usage
+	// and does not have the (fairly noticeable) performance impact of copying
+	// a large binary file into a memfd.
+	//
+	// Based on some basic performance testing, the overlayfs approach has
+	// effectively no performance overhead (it is on par with both
+	// MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
+	// around ~60% overhead during container startup.
+	overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
+	if err == nil {
+		logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests
+		return overlayFile, nil
+	}
+	logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
+
 	selfExe, err := os.Open("/proc/self/exe")
 	if err != nil {
 		return nil, fmt.Errorf("opening current binary: %w", err)

diff --git a/libcontainer/dmz/overlayfs_linux.go b/libcontainer/dmz/overlayfs_linux.go
@@ -0,0 +1,115 @@
+package dmz
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/utils"
+)
+
+func fsopen(fsName string, flags int) (*os.File, error) {
+	// Make sure we always set O_CLOEXEC.
+	flags |= unix.FSOPEN_CLOEXEC
+	fd, err := unix.Fsopen(fsName, flags)
+	if err != nil {
+		return nil, os.NewSyscallError("fsopen "+fsName, err)
+	}
+	return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
+}
+
+func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
+	// Make sure we always set O_CLOEXEC.
+	flags |= unix.FSMOUNT_CLOEXEC
+	fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
+	if err != nil {
+		return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
+	}
+	runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used
+	return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
+}
+
+func escapeOverlayLowerDir(path string) string {
+	// If the lowerdir path contains ":" we need to escape them, and if there
+	// were any escape characters already (\) we need to escape those first.
+	return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`)
+}
+
+// sealedOverlayfs will create an internal overlayfs mount using fsopen() that
+// uses the directory containing the binary as a lowerdir and a temporary tmpfs
+// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY)
+// and so we can create a safe zero-copy sealed version of /proc/self/exe.
+// This only works for privileged users and on kernels with overlayfs and
+// fsopen() enabled.
+//
+// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so
+// it is technically possible to create an overlayfs even for rootless
+// containers. Unfortunately, this would require some ugly manual CGo+fork
+// magic so we can do this later if we feel it's really needed.
+func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) {
+	// Try to do the superblock creation first to bail out early if we can't
+	// use this method.
+	overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC)
+	if err != nil {
+		return nil, err
+	}
+	defer overlayCtx.Close()
+
+	// binPath is going to be /proc/self/exe, so do a readlink to get the real
+	// path. overlayfs needs the real underlying directory for this protection
+	// mode to work properly.
+	if realPath, err := os.Readlink(binPath); err == nil {
+		binPath = realPath
+	}
+	binLowerDirPath, binName := filepath.Split(binPath)
+	// Escape any ":"s or "\"s in the path.
+	binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath)
+
+	// Overlayfs requires two lowerdirs in order to run in "lower-only" mode,
+	// where writes are completely blocked. Ideally we would create a dummy
+	// tmpfs for this, but it turns out that overlayfs doesn't allow for
+	// anonymous mountns paths.
+	// NOTE: I'm working on a patch to fix this but it won't be backported.
+	dummyLowerDirPath := escapeOverlayLowerDir(tmpDir)
+
+	// Configure the lowerdirs. The binary lowerdir needs to be on the top to
+	// ensure that a file called "runc" (binName) in the dummy lowerdir doesn't
+	// mask the binary.
+	lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath
+	if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil {
+		return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err)
+	}
+
+	// Get an actual handle to the overlayfs.
+	if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil {
+		return nil, os.NewSyscallError("fsconfig create overlayfs", err)
+	}
+	overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID)
+	if err != nil {
+		return nil, err
+	}
+	defer overlayFd.Close()
+
+	// Grab a handle to the binary through overlayfs.
+	exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err)
+	}
+	// NOTE: We would like to check that exeFile is the same as /proc/self/exe,
+	// except this is a little difficult. Depending on what filesystems the
+	// layers are on, overlayfs can remap the inode numbers (and it always
+	// creates its own device numbers -- see ovl_map_dev_ino) so we can't do a
+	// basic stat-based check. The only reasonable option would be to hash both
+	// files and compare them, but this would require fully reading both files
+	// which would produce a similar performance overhead to memfd cloning.
+	//
+	// Ultimately, there isn't a real attack to be worried about here. An
+	// attacker would need to be able to modify files in /usr/sbin (or wherever
+	// runc lives), at which point they could just replace the runc binary with
+	// something malicious anyway.
+	return exeFile, nil
+}
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
@@ -346,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error {
 	}
 	return err
 }
+
+// Openat is a Go-friendly openat(2) wrapper.
+func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
+	dirFd := unix.AT_FDCWD
+	if dir != nil {
+		dirFd = int(dir.Fd())
+	}
+	flags |= unix.O_CLOEXEC
+
+	fd, err := unix.Openat(dirFd, path, flags, mode)
+	if err != nil {
+		return nil, &os.PathError{Op: "openat", Path: path, Err: err}
+	}
+	return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil
+}
diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash
@@ -368,6 +368,55 @@ function rootless_cgroup() {
 	[[ "$ROOTLESS_FEATURES" == *"cgroup"* || -v RUNC_USE_SYSTEMD ]]
 }
 
+function in_userns() {
+	# The kernel guarantees the root userns inode number (and thus the value of
+	# the magic-link) is always the same value (PROC_USER_INIT_INO).
+	[[ "$(readlink /proc/self/ns/user)" != "user:[$((0xEFFFFFFD))]" ]]
+}
+
+function can_fsopen() {
+	fstype="$1"
+
+	# At the very least you need 5.1 for fsopen() and the filesystem needs to
+	# be supported by the running kernel.
+	if ! is_kernel_gte 5.1 || ! grep -qFw "$fstype" /proc/filesystems; then
+		return 1
+	fi
+
+	# You need to be root to use fsopen.
+	if [ "$EUID" -ne 0 ]; then
+		return 1
+	fi
+
+	# If we're root in the initial userns, we're done.
+	if ! in_userns; then
+		return 0
+	fi
+
+	# If we are running in a userns, then the filesystem needs to support
+	# FS_USERNS_MOUNT, which is a per-filesystem flag that depends on the
+	# kernel version.
+	case "$fstype" in
+	overlay)
+		# 459c7c565ac3 ("ovl: unprivieged mounts")
+		is_kernel_gte 5.11 || return 2
+		;;
+	fuse)
+		# 4ad769f3c346 ("fuse: Allow fully unprivileged mounts")
+		is_kernel_gte 4.18 || return 2
+		;;
+	ramfs | tmpfs)
+		# b3c6761d9b5c ("userns: Allow the userns root to mount ramfs.")
+		# 2b8576cb09a7 ("userns: Allow the userns root to mount tmpfs.")
+		is_kernel_gte 3.9 || return 2
+		;;
+	*)
+		# If we don't know about the filesystem, return an error.
+		fail "can_fsopen: unknown filesystem $fstype"
+		;;
+	esac
+}
+
 # Check if criu is available and working.
 function have_criu() {
 	command -v criu &>/dev/null || return 1
@@ -396,7 +445,7 @@ function requires() {
 			fi
 			;;
 		root)
-			if [ $EUID -ne 0 ]; then
+			if [ $EUID -ne 0 ] || in_userns; then
 				skip_me=1
 			fi
 			;;

diff --git a/tests/integration/run.bats b/tests/integration/run.bats
@@ -159,6 +159,10 @@ function teardown() {
 	[ "$status" -eq 0 ]
 	[[ "$output" = *"Hello World"* ]]
 	[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
+	# runc will use fsopen("overlay") if it can.
+	if can_fsopen overlay; then
+		[[ "$output" = *"runc-dmz: using overlayfs for sealed /proc/self/exe"* ]]
+	fi
 }
 
 @test "runc run [joining existing container namespaces]" {