diff --git a/libcontainer/dmz/cloned_binary_linux.go b/libcontainer/dmz/cloned_binary_linux.go index 3d732d3f98a..0446d802e94 100644 --- a/libcontainer/dmz/cloned_binary_linux.go +++ b/libcontainer/dmz/cloned_binary_linux.go @@ -212,6 +212,13 @@ func IsCloned(exe *os.File) bool { // make sure the container process can never resolve the original runc binary. // For more details on why this is necessary, see CVE-2019-5736. func CloneSelfExe(tmpDir string) (*os.File, error) { + overlayFile, err := sealedOverlayfs("/proc/self/exe", tmpDir) + if err == nil { + logrus.Debugf("using overlayfs for /proc/self/exe sealing") + return overlayFile, nil + } + logrus.Debugf("could not use overlayfs for /proc/self/exe sealing (%v) -- falling back to standard memfd copy", err) + selfExe, err := os.Open("/proc/self/exe") if err != nil { return nil, fmt.Errorf("opening current binary: %w", err) diff --git a/libcontainer/dmz/overlayfs_linux.go b/libcontainer/dmz/overlayfs_linux.go new file mode 100644 index 00000000000..addda2a18db --- /dev/null +++ b/libcontainer/dmz/overlayfs_linux.go @@ -0,0 +1,133 @@ +package dmz + +import ( + "fmt" + "os" + "path/filepath" + "runtime" + "strings" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/utils" +) + +func fsopen(fsName string, flags int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSOPEN_CLOEXEC + fd, err := unix.Fsopen(fsName, flags) + if err != nil { + return nil, os.NewSyscallError("fsopen "+fsName, err) + } + return os.NewFile(uintptr(fd), "fscontext:"+fsName), nil +} + +func fsmount(ctx *os.File, flags, mountAttrs int) (*os.File, error) { + // Make sure we always set O_CLOEXEC. + flags |= unix.FSMOUNT_CLOEXEC + fd, err := unix.Fsmount(int(ctx.Fd()), flags, mountAttrs) + if err != nil { + return nil, os.NewSyscallError("fsmount "+ctx.Name(), err) + } + return os.NewFile(uintptr(fd), "fsmount:"+ctx.Name()), nil +} + +func escapeOverlayLowerDir(path string) string { + // If the lowerdir path contains ":" we need to escape them, and if there + // were any escape characters already (\) we need to escape those first. + return strings.ReplaceAll(strings.ReplaceAll(path, `\`, `\\`), `:`, `\:`) +} + +func fstatat(dir *os.File, path string, flags int) (unix.Stat_t, error) { + dirFd := unix.AT_FDCWD + if dir != nil { + dirFd = int(dir.Fd()) + } + flags |= unix.AT_EMPTY_PATH + + var stat unix.Stat_t + err := unix.Fstatat(dirFd, path, &stat, flags) + if err != nil { + err = &os.PathError{Op: "fstatat", Path: path, Err: err} + } + runtime.KeepAlive(dir) + return stat, err +} + +// sealedOverlayfs will create an internal overlayfs mount using fsopen() that +// uses the directory containing the binary as a lowerdir and a temporary tmpfs +// as an upperdir. There is no way to "unwrap" this (unlike MS_RDONLY) and so +// we can create a safe zero-copy sealed version of /proc/self/exe. +func sealedOverlayfs(binPath, tmpDir string) (_ *os.File, Err error) { + // binPath is going to be /proc/self/exe, so do a readlink to get the real + // path. overlayfs needs the real underlying directory for this protection + // mode to work properly. + if realPath, err := os.Readlink(binPath); err == nil { + binPath = realPath + } + binLowerDirPath, binName := filepath.Split(binPath) + // Escape any ":"s or "\"s in the path. + binLowerDirPath = escapeOverlayLowerDir(binLowerDirPath) + + // Overlayfs requires two lowerdirs in order to run in "lower-only" mode, + // where writes are completely blocked. Ideally we would create a dummy + // tmpfs for this, but it turns out that overlayfs doesn't allow for + // anonymous mountns paths. + // NOTE: I'm working on a patch to fix this but it won't be backported. + dummyLowerDirPath := escapeOverlayLowerDir(tmpDir) + + overlayCtx, err := fsopen("overlay", unix.FSOPEN_CLOEXEC) + if err != nil { + return nil, err + } + defer overlayCtx.Close() + + // Configure the lowerdirs. The binary lowerdir needs to be on the top to + // ensure that a file called "runc" (binName) in the dummy lowerdir doesn't + // mask the binary. + lowerDirStr := binLowerDirPath + ":" + dummyLowerDirPath + if err := unix.FsconfigSetString(int(overlayCtx.Fd()), "lowerdir", lowerDirStr); err != nil { + return nil, fmt.Errorf("fsconfig set overlayfs lowerdir=%s: %w", lowerDirStr, err) + } + + // Get an actual handle to the overlayfs. + if err := unix.FsconfigCreate(int(overlayCtx.Fd())); err != nil { + return nil, os.NewSyscallError("fsconfig create overlayfs", err) + } + overlayFd, err := fsmount(overlayCtx, unix.FSMOUNT_CLOEXEC, unix.MS_RDONLY|unix.MS_NODEV|unix.MS_NOSUID) + if err != nil { + return nil, err + } + defer overlayFd.Close() + + // Grab a handle to the binary through overlayfs. + exeFile, err := utils.Openat(overlayFd, binName, unix.O_PATH|unix.O_NOFOLLOW|unix.O_CLOEXEC, 0) + if err != nil { + return nil, fmt.Errorf("open %s from overlayfs (lowerdir=%s): %w", binName, lowerDirStr, err) + } + defer func() { + if Err != nil { + _ = exeFile.Close() + } + }() + + // Check that the file is what we expect. Ideally we might check the hash + // of the file against /proc/self/exe, but that would require us to copy + // the binary (negating the benefit of using overlayfs over memfd-copying). + // So instead we just check that the inode number is the same. In theory + // this could result in us allowing an incorrect binary in some scenarios + // but this would require an attacker to modify /usr/sbin in the host + // filesystem -- at which point you've already lost. + procSelfStat, err := fstatat(nil, "/proc/self/exe", 0) + if err != nil { + return nil, err + } + exeFileStat, err := fstatat(exeFile, "", 0) + if err != nil { + return nil, err + } + if procSelfStat.Ino != exeFileStat.Ino { + return nil, fmt.Errorf("overlayfs cloned binary has different inode number %d != %d", procSelfStat.Ino, exeFileStat.Ino) + } + return exeFile, nil +} diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go index cc84597a7ce..c8ad559d931 100644 --- a/libcontainer/utils/utils_unix.go +++ b/libcontainer/utils/utils_unix.go @@ -346,3 +346,18 @@ func MkdirAllInRoot(root, unsafePath string, mode uint32) error { } return err } + +// Openat is a Go-friendly openat(2) wrapper. +func Openat(dir *os.File, path string, flags int, mode uint32) (*os.File, error) { + dirFd := unix.AT_FDCWD + if dir != nil { + dirFd = int(dir.Fd()) + } + flags |= unix.O_CLOEXEC + + fd, err := unix.Openat(dirFd, path, flags, mode) + if err != nil { + return nil, &os.PathError{Op: "openat", Path: path, Err: err} + } + return os.NewFile(uintptr(fd), dir.Name()+"/"+path), nil +}