From 8cfbccb6d99a1bcfad5b16fb3f0bbca97cb5be4c Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Sun, 20 Oct 2024 19:55:00 +1100
Subject: [PATCH 1/2] tests: integration: add helper to check if we're in a
 userns

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 tests/integration/helpers.bash | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash
index 31e7dee8438..789b83e5b92 100755
--- a/tests/integration/helpers.bash
+++ b/tests/integration/helpers.bash
@@ -368,6 +368,12 @@ function rootless_cgroup() {
 	[[ "$ROOTLESS_FEATURES" == *"cgroup"* || -v RUNC_USE_SYSTEMD ]]
 }
 
+function in_userns() {
+	# The kernel guarantees the root userns inode number (and thus the value of
+	# the magic-link) is always the same value (PROC_USER_INIT_INO).
+	[[ "$(readlink /proc/self/ns/user)" != "user:[$((0xEFFFFFFD))]" ]]
+}
+
 # Check if criu is available and working.
 function have_criu() {
 	command -v criu &>/dev/null || return 1
@@ -396,7 +402,7 @@ function requires() {
 			fi
 			;;
 		root)
-			if [ $EUID -ne 0 ]; then
+			if [ $EUID -ne 0 ] || in_userns; then
 				skip_me=1
 			fi
 			;;

From 515f09f7b1dbd442298132592c7e9689b855dbdf Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 16 Oct 2024 17:13:39 +1100
Subject: [PATCH 2/2] dmz: use overlayfs to write-protect /proc/self/exe if
 possible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit b999376fb237 ("nsenter: cloned_binary: remove bindfd logic
entirely") removed the read-only bind-mount logic from our cloned binary
code because it wasn't really safe because a container with
CAP_SYS_ADMIN could remove the MS_RDONLY bit and get write access to
/proc/self/exe (even with user namespaces this could've been an issue
because it's not clear if the flags are locked).

However, copying a binary does seem to have a minor performance impact.
The only way to have no performance impact would be for the kernel to
block these write attempts, but barring that we could try to reduce the
overhead by coming up with a mount that cannot have it's read-only bits
cleared.

The "simplest" solution is to create a temporary overlayfs using
fsopen(2) which uses the directory where runc exists as a lowerdir,
ensuring that the container cannot access the underlying file -- and we
don't have to do any copies.

While fsopen(2) is not free because mount namespace cloning is usually
expensive (and so it seems like the difference would be marginal), some
basic performance testing seems to indicate there is a ~60% improvement
doing it this way and that it has effectively no overhead even when
compared to just using /proc/self/exe directly:

  % hyperfine --warmup 50 \
  >           "./runc-noclone run -b bundle ctr" \
  >           "./runc-overlayfs run -b bundle ctr" \
  >           "./runc-memfd run -b bundle ctr"

  Benchmark 1: ./runc-noclone run -b bundle ctr
    Time (mean ± σ):      13.7 ms ±   0.9 ms    [User: 6.0 ms, System: 10.9 ms]
    Range (min … max):    11.3 ms …  16.1 ms    184 runs

  Benchmark 2: ./runc-overlayfs run -b bundle ctr
    Time (mean ± σ):      13.9 ms ±   0.9 ms    [User: 6.2 ms, System: 10.8 ms]
    Range (min … max):    11.8 ms …  16.0 ms    180 runs

  Benchmark 3: ./runc-memfd run -b bundle ctr
    Time (mean ± σ):      22.6 ms ±   1.3 ms    [User: 5.7 ms, System: 20.7 ms]
    Range (min … max):    19.9 ms …  26.5 ms    114 runs

  Summary
    ./runc-noclone run -b bundle ctr ran
      1.01 ± 0.09 times faster than ./runc-overlayfs run -b bundle ctr
      1.65 ± 0.15 times faster than ./runc-memfd run -b bundle ctr

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/dmz/cloned_binary_linux.go |  17 ++++
 libcontainer/dmz/overlayfs_linux.go     | 115 ++++++++++++++++++++++++
 libcontainer/utils/utils_unix.go        |  15 ++++
 tests/integration/helpers.bash          |  43 +++++++++
 tests/integration/run.bats              |   4 +
 5 files changed, 194 insertions(+)
 create mode 100644 libcontainer/dmz/overlayfs_linux.go

diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go
index 3d732d3f98a..1c034e4e6e5 100644
--- a/libcontainer/dmz/cloned_binary_linux.go
+++ b/libcontainer/dmz/cloned_binary_linux.go
@@ -212,6 +212,23 @@ func IsCloned(exe *os.File) bool {
 // make sure the container process can never resolve the original runc binary.
 // For more details on why this is necessary, see CVE-2019-5736.
 func CloneSelfExe(tmpDir string) (*os.File, error) {
+	// Try to create a temporary overlayfs to produce a readonly version of
+	// /proc/self/exe that cannot be "unwrapped" by the container. In contrast
+	// to CloneBinary, this technique does not require any extra memory usage
+	// and does not have the (fairly noticeable) performance impact of copying
+	// a large binary file into a memfd.
+	//
+	// Based on some basic performance testing, the overlayfs approach has
+	// effectively no performance overhead (it is on par with both
+	// MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds
+	// around ~60% overhead during container startup.
+	overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir)
+	if err == nil {
+		logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests
+		return overlayFile, nil
+	}
+	logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy")
+
 	selfExe, err := os.Open("/proc/self/exe")
 	if err != nil {
 		return nil, fmt.Errorf("opening current binary: %w", err)
diff --git a/libcontainer/dmz/overlayfs_linux.go b/libcontainer/dmz/overlayfs_linux.go
new file mode 100644
index 00000000000..92cb1944e59
--- /dev/null
+++ b/libcontainer/dmz/overlayfs_linux.go
@@ -0,0 +1,115 @@
+package dmz
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+
+	"golang.org/x/sys/unix"
+
+	"github.com/opencontainers/runc/libcontainer/utils"
+)
+
+func fsopen(fsName string, flags int) (*os.File, error) {
+	// Make sure we always set O_CLOEXEC.
+	flags |= unix.FSOPEN_CLOEXEC
+	fd, err := unix.Fsopen(fsName, flags)
+	if err != nil {
+		return nil, os.NewSyscallError("fsopen "+fsName, err)
+	}
+	return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil
+}
+
+func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) {
+	// Make sure we always set O_CLOEXEC.
+	flags |= unix.FSMOUNT_CLOEXEC
+	fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs)
+	if err != nil {
+		return nil, os.NewSyscallError("fsmount "+ctx.Name(), err)
+	}
+	runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used
+	return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil
+}
+
+func escapeOverlayLowerDir(path string) string {
+	// If the lowerdir path contains ":" we need to escape them, and if there
+	// were any escape characters already (\) we need to escape those first.
+	return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`)
+}
+
+// sealedOverlayfs will create an internal overlayfs mount using fsopen() that
+// uses the directory containing the binary as a lowerdir and a temporary tmpfs
+// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY)
+// and so we can create a safe zero-copy sealed version of /proc/self/exe.
+// This only works for privileged users and on kernels with overlayfs and
+// fsopen() enabled.
+//
+// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so
+// it is technically possible to create an overlayfs even for rootless
+// containers. Unfortunately, this would require some ugly manual CGo+fork
+// magic so we can do this later if we feel it's really needed.
+func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) {
+	// Try to do the superblock creation first to bail out early if we can't
+	// use this method.
+	overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC)
+	if err != nil {
+		return nil, err
+	}
+	defer overlayCtx.Close()
+
+	// binPath is going to be /proc/self/exe, so do a readlink to get the real
+	// path. overlayfs needs the real underlying directory for this protection
+	// mode to work properly.
+	if realPath, err := os.Readlink(binPath); err == nil {
+		binPath = realPath
+	}
+	binLowerDirPath, binName := filepath.Split(binPath)
+	// Escape any ":"s or "\"s in the path.
+	binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath)
+
+	// Overlayfs requires two lowerdirs in order to run in "lower-only" mode,
+	// where writes are completely blocked. Ideally we would create a dummy
+	// tmpfs for this, but it turns out that overlayfs doesn't allow for
+	// anonymous mountns paths.
+	// NOTE: I'm working on a patch to fix this but it won't be backported.
+	dummyLowerDirPath := escapeOverlayLowerDir(tmpDir)
+
+	// Configure the lowerdirs. The binary lowerdir needs to be on the top to
+	// ensure that a file called "runc" (binName) in the dummy lowerdir doesn't
+	// mask the binary.
+	lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath
+	if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil {
+		return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err)
+	}
+
+	// Get an actual handle to the overlayfs.
+	if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil {
+		return nil, os.NewSyscallError("fsconfig create overlayfs", err)
+	}
+	overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID)
+	if err != nil {
+		return nil, err
+	}
+	defer overlayFd.Close()
+
+	// Grab a handle to the binary through overlayfs.
+	exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err)
+	}
+	// NOTE: We would like to check that exeFile is the same as /proc/self/exe,
+	// except this is a little difficult. Depending on what filesystems the
+	// layers are on, overlayfs can remap the inode numbers (and it always
+	// creates its own device numbers -- see ovl_map_dev_ino) so we can't do a
+	// basic stat-based check. The only reasonable option would be to hash both
+	// files and compare them, but this would require fully reading both files
+	// which would produce a similar performance overhead to memfd cloning.
+	//
+	// Ultimately, there isn't a real attack to be worried about here. An
+	// attacker would need to be able to modify files in /usr/sbin (or wherever
+	// runc lives), at which point they could just replace the runc binary with
+	// something malicious anyway.
+	return exeFile, nil
+}
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
index cc84597a7ce..c8ad559d931 100644
--- a/libcontainer/utils/utils_unix.go
+++ b/libcontainer/utils/utils_unix.go
@@ -346,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error {
 	}
 	return err
 }
+
+// Openat is a Go-friendly openat(2) wrapper.
+func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) {
+	dirFd := unix.AT_FDCWD
+	if dir != nil {
+		dirFd = int(dir.Fd())
+	}
+	flags |= unix.O_CLOEXEC
+
+	fd, err := unix.Openat(dirFd, path, flags, mode)
+	if err != nil {
+		return nil, &os.PathError{Op: "openat", Path: path, Err: err}
+	}
+	return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil
+}
diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash
index 789b83e5b92..85f1fb0a558 100755
--- a/tests/integration/helpers.bash
+++ b/tests/integration/helpers.bash
@@ -374,6 +374,49 @@ function in_userns() {
 	[[ "$(readlink /proc/self/ns/user)" != "user:[$((0xEFFFFFFD))]" ]]
 }
 
+function can_fsopen() {
+	fstype="$1"
+
+	# At the very least you need 5.1 for fsopen() and the filesystem needs to
+	# be supported by the running kernel.
+	if ! is_kernel_gte 5.1 || ! grep -qFw "$fstype" /proc/filesystems; then
+		return 1
+	fi
+
+	# You need to be root to use fsopen.
+	if [ "$EUID" -ne 0 ]; then
+		return 1
+	fi
+
+	# If we're root in the initial userns, we're done.
+	if ! in_userns; then
+		return 0
+	fi
+
+	# If we are running in a userns, then the filesystem needs to support
+	# FS_USERNS_MOUNT, which is a per-filesystem flag that depends on the
+	# kernel version.
+	case "$fstype" in
+	overlay)
+		# 459c7c565ac3 ("ovl: unprivieged mounts")
+		is_kernel_gte 5.11 || return 2
+		;;
+	fuse)
+		# 4ad769f3c346 ("fuse: Allow fully unprivileged mounts")
+		is_kernel_gte 4.18 || return 2
+		;;
+	ramfs | tmpfs)
+		# b3c6761d9b5c ("userns: Allow the userns root to mount ramfs.")
+		# 2b8576cb09a7 ("userns: Allow the userns root to mount tmpfs.")
+		is_kernel_gte 3.9 || return 2
+		;;
+	*)
+		# If we don't know about the filesystem, return an error.
+		fail "can_fsopen: unknown filesystem $fstype"
+		;;
+	esac
+}
+
 # Check if criu is available and working.
 function have_criu() {
 	command -v criu &>/dev/null || return 1
diff --git a/tests/integration/run.bats b/tests/integration/run.bats
index b49f3ac4c1a..390a69bf083 100644
--- a/tests/integration/run.bats
+++ b/tests/integration/run.bats
@@ -159,6 +159,10 @@ function teardown() {
 	[ "$status" -eq 0 ]
 	[[ "$output" = *"Hello World"* ]]
 	[[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]]
+	# runc will use fsopen("overlay") if it can.
+	if can_fsopen overlay; then
+		[[ "$output" = *"runc-dmz: using overlayfs for sealed /proc/self/exe"* ]]
+	fi
 }
 
 @test "runc run [joining existing container namespaces]" {