From a16141f327554d5fb4146457b475e263461136fa Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 16 Oct 2024 17:13:39 +1100 Subject: [PATCH] dmz: use overlayfs to write-protect /proc/self/exe if possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit b999376fb237 ("nsenter: cloned_binary: remove bindfd logic entirely") removed the read-only bind-mount logic from our cloned binary code because it wasn't really safe because a container with CAP_SYS_ADMIN could remove the MS_RDONLY bit and get write access to /proc/self/exe (even with user namespaces this could've been an issue because it's not clear if the flags are locked). However, copying a binary does seem to have a minor performance impact. The only way to have no performance impact would be for the kernel to block these write attempts, but barring that we could try to reduce the overhead by coming up with a mount that cannot have it's read-only bits cleared. The "simplest" solution is to create a temporary overlayfs using fsopen(2) which uses the directory where runc exists as a lowerdir, ensuring that the container cannot access the underlying file -- and we don't have to do any copies. While fsopen(2) is not free because mount namespace cloning is usually expensive (and so it seems like the difference would be marginal), some basic performance testing seems to indicate there is a ~60% improvement doing it this way: % hyperfine --warmup 50 \ > "./runc-overlayfs run -b bundle ctr" > "./runc-memfd run -b bundle ctr" Benchmark 1: ./runc-overlayfs run -b bundle ctr Time (mean ± σ): 14.3 ms ± 1.3 ms [User: 5.5 ms, System: 11.8 ms] Range (min … max): 11.8 ms … 18.0 ms 174 runs Benchmark 2: ./runc-memfd run -b bundle ctr Time (mean ± σ): 23.0 ms ± 1.5 ms [User: 6.1 ms, System: 20.2 ms] Range (min … max): 19.7 ms … 28.1 ms 117 runs Summary ./runc-overlayfs run -b bundle ctr ran 1.61 ± 0.18 times faster than ./runc-memfd run -b bundle ctr Signed-off-by: Aleksa Sarai --- libcontainer/dmz/cloned_binary_linux.go | 7 ++ libcontainer/dmz/overlayfs_linux.go | 133 ++++++++++++++++++++++++ libcontainer/utils/utils_unix.go | 15 +++ 3 files changed, 155 insertions(+) create mode 100644 libcontainer/dmz/overlayfs_linux.go diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go index 3d732d3f98a..0446d802e94 100644 --- a/libcontainer/dmz/cloned_binary_linux.go +++ b/libcontainer/dmz/cloned_binary_linux.go @@ -212,6 +212,13 @@ func IsCloned(exe *os.File) bool { // make sure the container process can never resolve the original runc binary. // For more details on why this is necessary, see CVE-2019-5736. func CloneSelfExe(tmpDir string) (*os.File, error) { + overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir) + if err == nil { + logrus.Debugf("using overlayfs for /proc/self/exe sealing") + return overlayFile, nil + } + logrus.Debugf("could not use overlayfs for /proc/self/exe sealing (%v) -- falling back to standard memfd copy", err) + selfExe, err := os.Open("/proc/self/exe") if err != nil { return nil, fmt.Errorf("opening current binary: %w", err) diff --git a/libcontainer/dmz/overlayfs_linux.go b/libcontainer/dmz/overlayfs_linux.go new file mode 100644 index 00000000000..addda2a18db --- /dev/null +++ b/libcontainer/dmz/overlayfs_linux.go @@ -0,0 +1,133 @@ +package dmz + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +func fsopen(fsName string, flags int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSOPEN_CLOEXEC + fd, err := unix.Fsopen(fsName, flags) + if err != nil { + return nil, os.NewSyscallError("fsopen "+fsName, err) + } + return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil +} + +func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSMOUNT_CLOEXEC + fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs) + if err != nil { + return nil, os.NewSyscallError("fsmount "+ctx.Name(), err) + } + return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil +} + +func escapeOverlayLowerDir(path string) string { + // If the lowerdir path contains ":" we need to escape them, and if there + // were any escape characters already (\) we need to escape those first. + return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`) +} + +func fstatat(dir *os.File, path string, flags int) (unix.Stat_t, error) { + dirFd := unix.AT_FDCWD + if dir != nil { + dirFd = int(dir.Fd()) + } + flags |= unix.AT_EMPTY_PATH + + var stat unix.Stat_t + err := unix.Fstatat(dirFd, path, &stat, flags) + if err != nil { + err = &os.PathError{Op: "fstatat", Path: path, Err: err} + } + runtime.KeepAlive(dir) + return stat, err +} + +// sealedOverlayfs will create an internal overlayfs mount using fsopen() that +// uses the directory containing the binary as a lowerdir and a temporary tmpfs +// as an upperdir. There is no way to "unwrap" this (unlike MS_RDONLY) and so +// we can create a safe zero-copy sealed version of /proc/self/exe. +func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) { + // binPath is going to be /proc/self/exe, so do a readlink to get the real + // path. overlayfs needs the real underlying directory for this protection + // mode to work properly. + if realPath, err := os.Readlink(binPath); err == nil { + binPath = realPath + } + binLowerDirPath, binName := filepath.Split(binPath) + // Escape any ":"s or "\"s in the path. + binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath) + + // Overlayfs requires two lowerdirs in order to run in "lower-only" mode, + // where writes are completely blocked. Ideally we would create a dummy + // tmpfs for this, but it turns out that overlayfs doesn't allow for + // anonymous mountns paths. + // NOTE: I'm working on a patch to fix this but it won't be backported. + dummyLowerDirPath := escapeOverlayLowerDir(tmpDir) + + overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC) + if err != nil { + return nil, err + } + defer overlayCtx.Close() + + // Configure the lowerdirs. The binary lowerdir needs to be on the top to + // ensure that a file called "runc" (binName) in the dummy lowerdir doesn't + // mask the binary. + lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath + if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil { + return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err) + } + + // Get an actual handle to the overlayfs. + if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil { + return nil, os.NewSyscallError("fsconfig create overlayfs", err) + } + overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID) + if err != nil { + return nil, err + } + defer overlayFd.Close() + + // Grab a handle to the binary through overlayfs. + exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err) + } + defer func() { + if Err != nil { + _ = exeFile.Close() + } + }() + + // Check that the file is what we expect. Ideally we might check the hash + // of the file against /proc/self/exe, but that would require us to copy + // the binary (negating the benefit of using overlayfs over memfd-copying). + // So instead we just check that the inode number is the same. In theory + // this could result in us allowing an incorrect binary in some scenarios + // but this would require an attacker to modify /usr/sbin in the host + // filesystem -- at which point you've already lost. + procSelfStat, err := fstatat(nil, "/proc/self/exe", 0) + if err != nil { + return nil, err + } + exeFileStat, err := fstatat(exeFile, "", 0) + if err != nil { + return nil, err + } + if procSelfStat.Ino != exeFileStat.Ino { + return nil, fmt.Errorf("overlayfs cloned binary has different inode number %d != %d", procSelfStat.Ino, exeFileStat.Ino) + } + return exeFile, nil +} diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index cc84597a7ce..c8ad559d931 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -346,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error { } return err } + +// Openat is a Go-friendly openat(2) wrapper. +func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) { + dirFd := unix.AT_FDCWD + if dir != nil { + dirFd = int(dir.Fd()) + } + flags |= unix.O_CLOEXEC + + fd, err := unix.Openat(dirFd, path, flags, mode) + if err != nil { + return nil, &os.PathError{Op: "openat", Path: path, Err: err} + } + return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil +}