diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go index 3d732d3f98a..1c034e4e6e5 100644 --- a/libcontainer/dmz/cloned_binary_linux.go +++ b/libcontainer/dmz/cloned_binary_linux.go @@ -212,6 +212,23 @@ func IsCloned(exe *os.File) bool { // make sure the container process can never resolve the original runc binary. // For more details on why this is necessary, see CVE-2019-5736. func CloneSelfExe(tmpDir string) (*os.File, error) { + // Try to create a temporary overlayfs to produce a readonly version of + // /proc/self/exe that cannot be "unwrapped" by the container. In contrast + // to CloneBinary, this technique does not require any extra memory usage + // and does not have the (fairly noticeable) performance impact of copying + // a large binary file into a memfd. + // + // Based on some basic performance testing, the overlayfs approach has + // effectively no performance overhead (it is on par with both + // MS_BIND+MS_RDONLY and no binary cloning at all) while memfd copying adds + // around ~60% overhead during container startup. + overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir) + if err == nil { + logrus.Debug("runc-dmz: using overlayfs for sealed /proc/self/exe") // used for tests + return overlayFile, nil + } + logrus.WithError(err).Debugf("could not use overlayfs for /proc/self/exe sealing -- falling back to making a temporary copy") + selfExe, err := os.Open("/proc/self/exe") if err != nil { return nil, fmt.Errorf("opening current binary: %w", err) diff --git a/libcontainer/dmz/overlayfs_linux.go b/libcontainer/dmz/overlayfs_linux.go new file mode 100644 index 00000000000..92cb1944e59 --- /dev/null +++ b/libcontainer/dmz/overlayfs_linux.go @@ -0,0 +1,115 @@ +package dmz + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +func fsopen(fsName string, flags int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSOPEN_CLOEXEC + fd, err := unix.Fsopen(fsName, flags) + if err != nil { + return nil, os.NewSyscallError("fsopen "+fsName, err) + } + return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil +} + +func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSMOUNT_CLOEXEC + fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs) + if err != nil { + return nil, os.NewSyscallError("fsmount "+ctx.Name(), err) + } + runtime.KeepAlive(ctx) // make sure fd is kept alive while it's used + return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil +} + +func escapeOverlayLowerDir(path string) string { + // If the lowerdir path contains ":" we need to escape them, and if there + // were any escape characters already (\) we need to escape those first. + return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`) +} + +// sealedOverlayfs will create an internal overlayfs mount using fsopen() that +// uses the directory containing the binary as a lowerdir and a temporary tmpfs +// as an upperdir. There is no way to "unwrap" this (unlike MS_BIND+MS_RDONLY) +// and so we can create a safe zero-copy sealed version of /proc/self/exe. +// This only works for privileged users and on kernels with overlayfs and +// fsopen() enabled. +// +// TODO: Since Linux 5.11, overlayfs can be created inside user namespaces so +// it is technically possible to create an overlayfs even for rootless +// containers. Unfortunately, this would require some ugly manual CGo+fork +// magic so we can do this later if we feel it's really needed. +func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) { + // Try to do the superblock creation first to bail out early if we can't + // use this method. + overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC) + if err != nil { + return nil, err + } + defer overlayCtx.Close() + + // binPath is going to be /proc/self/exe, so do a readlink to get the real + // path. overlayfs needs the real underlying directory for this protection + // mode to work properly. + if realPath, err := os.Readlink(binPath); err == nil { + binPath = realPath + } + binLowerDirPath, binName := filepath.Split(binPath) + // Escape any ":"s or "\"s in the path. + binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath) + + // Overlayfs requires two lowerdirs in order to run in "lower-only" mode, + // where writes are completely blocked. Ideally we would create a dummy + // tmpfs for this, but it turns out that overlayfs doesn't allow for + // anonymous mountns paths. + // NOTE: I'm working on a patch to fix this but it won't be backported. + dummyLowerDirPath := escapeOverlayLowerDir(tmpDir) + + // Configure the lowerdirs. The binary lowerdir needs to be on the top to + // ensure that a file called "runc" (binName) in the dummy lowerdir doesn't + // mask the binary. + lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath + if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil { + return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err) + } + + // Get an actual handle to the overlayfs. + if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil { + return nil, os.NewSyscallError("fsconfig create overlayfs", err) + } + overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID) + if err != nil { + return nil, err + } + defer overlayFd.Close() + + // Grab a handle to the binary through overlayfs. + exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err) + } + // NOTE: We would like to check that exeFile is the same as /proc/self/exe, + // except this is a little difficult. Depending on what filesystems the + // layers are on, overlayfs can remap the inode numbers (and it always + // creates its own device numbers -- see ovl_map_dev_ino) so we can't do a + // basic stat-based check. The only reasonable option would be to hash both + // files and compare them, but this would require fully reading both files + // which would produce a similar performance overhead to memfd cloning. + // + // Ultimately, there isn't a real attack to be worried about here. An + // attacker would need to be able to modify files in /usr/sbin (or wherever + // runc lives), at which point they could just replace the runc binary with + // something malicious anyway. + return exeFile, nil +} diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index cc84597a7ce..c8ad559d931 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -346,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error { } return err } + +// Openat is a Go-friendly openat(2) wrapper. +func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) { + dirFd := unix.AT_FDCWD + if dir != nil { + dirFd = int(dir.Fd()) + } + flags |= unix.O_CLOEXEC + + fd, err := unix.Openat(dirFd, path, flags, mode) + if err != nil { + return nil, &os.PathError{Op: "openat", Path: path, Err: err} + } + return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil +} diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index 789b83e5b92..e62c69ed8eb 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -374,6 +374,44 @@ function in_userns() { [[ "$(readlink /proc/self/ns/user)" != "user:[$((0xEFFFFFFD))]" ]] } +function can_fsopen() { + fstype="$1" + + # At the very least you need 5.1 for fsopen() and the filesystem needs to + # be supported by the running kernel. + if ! is_kernel_gte 5.1 || ! grep -qFw "$fstype" /proc/filesystems; then + return 1 + fi + + # If we're root and not in a userns, then we're done. + if [ "$EUID" -eq 0 ] && ! in_userns; then + return 0 + fi + + # If we are running in a userns, then the filesystem needs to support + # FS_USERNS_MOUNT, which is a per-filesystem flag that depends on the + # kernel version. + case "$fstype" in + overlay) + # 459c7c565ac3 ("ovl: unprivieged mounts") + is_kernel_gte 5.11 || return 2 + ;; + fuse) + # 4ad769f3c346 ("fuse: Allow fully unprivileged mounts") + is_kernel_gte 4.18 || return 2 + ;; + ramfs | tmpfs) + # b3c6761d9b5c ("userns: Allow the userns root to mount ramfs.") + # 2b8576cb09a7 ("userns: Allow the userns root to mount tmpfs.") + is_kernel_gte 3.9 || return 2 + ;; + *) + # If we don't know about the filesystem, return an error. + fail "can_fsopen: unknown filesystem $fstype" + ;; + esac +} + # Check if criu is available and working. function have_criu() { command -v criu &>/dev/null || return 1 diff --git a/tests/integration/run.bats b/tests/integration/run.bats index b49f3ac4c1a..390a69bf083 100644 --- a/tests/integration/run.bats +++ b/tests/integration/run.bats @@ -159,6 +159,10 @@ function teardown() { [ "$status" -eq 0 ] [[ "$output" = *"Hello World"* ]] [[ "$output" = *"runc-dmz: using /proc/self/exe clone"* ]] + # runc will use fsopen("overlay") if it can. + if can_fsopen overlay; then + [[ "$output" = *"runc-dmz: using overlayfs for sealed /proc/self/exe"* ]] + fi } @test "runc run [joining existing container namespaces]" {