From 7a3c81b27953d8475ca512c3e5b26dc6cafb289b Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 8 Aug 2023 15:37:29 +1000 Subject: [PATCH] squash! libcontainer: remove all mount logic from nsexec Signed-off-by: Aleksa Sarai --- libcontainer/container_linux.go | 151 ---------------------- libcontainer/criu_linux.go | 4 +- libcontainer/mount_linux.go | 216 ++++++++++++++++++++++---------- libcontainer/process_linux.go | 143 ++++++++++++++++++++- libcontainer/rootfs_linux.go | 114 ++++++++++------- libcontainer/sync.go | 7 ++ 6 files changed, 363 insertions(+), 272 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 932a415cb8a..11ea28afcad 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -15,7 +15,6 @@ import ( "sync" "time" - "github.com/moby/sys/mountinfo" "github.com/opencontainers/runtime-spec/specs-go" "github.com/sirupsen/logrus" "github.com/vishvananda/netlink/nl" @@ -26,7 +25,6 @@ import ( "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" - "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" ) @@ -647,152 +645,6 @@ func (c *Container) newParentProcess(p *Process) (parentProcess, error) { return c.newSetnsProcess(p, cmd, messageSockPair, logFilePair) } -// remapMountSources tries to remap any applicable mount sources in the -// container configuration to use open_tree(2)-style mountfds. This allows -// containers to have bind-mounts from source directories the container process -// cannot resolve (#2576) as well as apply MOUNT_ATTR_IDMAP to mounts (which -// requires privileges the container process might not have). -// -// The core idea is that we create a mountfd with the right configuration, and -// then replace the source with a reference to the file descriptor. Ideally -// this would be /proc/self/fd/, but we cannot use -// new mount API file descriptors as bind-mount sources (they run afoul of the -// check_mnt() checks that stop us from doing bind-mounts from an fd in a -// different namespace). So instead, we set the source to \x00 -// which is an invalid path under Linux. -func (c *Container) remapMountSources(cmd *exec.Cmd) (Err error) { - nsHandles := new(userns.Handles) - defer nsHandles.Release() - for i, m := range c.config.Mounts { - var mountFile *os.File - if m.IsBind() { - flags := uint(unix.OPEN_TREE_CLONE | unix.O_CLOEXEC) - if m.Flags&unix.MS_REC == unix.MS_REC { - flags |= unix.AT_RECURSIVE - } - mountFd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags) - if err != nil { - // For non-id-mapped mounts, this functionality is optional. We - // can just fallback to letting the rootfs code use the - // original source as a mountpoint. - if !m.IsIDMapped() { - logrus.Debugf("remap mount sources: skipping remap of %s due to failure: open_tree(OPEN_TREE_CLONE): %v", m.Source, err) - continue - } - return &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err} - } - mountFile = os.NewFile(uintptr(mountFd), m.Source+" (open_tree)") - // Only close the file if the remapping failed -- otherwise we keep - // the file open to be passed to "runc init". - defer func() { - if Err != nil { - _ = mountFile.Close() - } - }() - } - - if m.IsIDMapped() { - if mountFile == nil { - return fmt.Errorf("remap mount sources: invalid mount source %s: id-mapping of non-bind-mounts is not supported", m.Source) - } - usernsFile, err := nsHandles.Get(userns.Mapping{ - UIDMappings: m.UIDMappings, - GIDMappings: m.GIDMappings, - }) - if err != nil { - return fmt.Errorf("remap mount sources: failed to create userns for %s id-mapping: %w", m.Source, err) - } - defer usernsFile.Close() - if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{ - Attr_set: unix.MOUNT_ATTR_IDMAP, - Userns_fd: uint64(usernsFile.Fd()), - }); err != nil { - return fmt.Errorf("remap mount sources: failed to set IDMAP_SOURCE_ATTR on %s: %w", m.Source, err) - } - } - - if mountFile != nil { - // We need to emulate the propagation behaviour when bind-mounting - // from a root filesystem with config.RootPropagation applied. This - // is needed because existing runc users expect bind-mounts to act - // this way (the "classic" method), regardless of the mount API - // used to create the bind-mount. - // - // NOTE: This explicitly does not handle configurations where there - // is a bind-mount source of a path from inside the container - // rootfs. We could do this in rootfs_linux.go but this would - // require doing criu-style shennanigans to try to figure out how - // to recreate the state in /proc/self/mountinfo. For now, it's - // much simpler to implement this based purely on RootPropagation. - - // Same logic as prepareRoot(). - propFlags := unix.MS_SLAVE | unix.MS_REC - if c.config.RootPropagation != 0 { - propFlags = c.config.RootPropagation - } - - // The one thing to consider is whether the RootPropagation is - // MS_REC and whether the mount source is the same as /. If it is - // MS_REC then we apply the propagation flags (nix MS_REC) - // recursively. Otherwise we apply the propagation flags - // non-recursively if m.Source is part of the / mount. We do - // nothing if neither is the case. - var setattrFlags uint - if propFlags&unix.MS_REC == unix.MS_REC { - // If the RootPropagation is recursive, then any bind-mount - // from the host should inherit the propagation setting of /. - logrus.Debugf("remap mount sources: applying recursive RootPropagation of 0x%x to %q bind-mount propagation", propFlags, m.Source) - setattrFlags = unix.AT_RECURSIVE - } else { - // If the RootPropagation is not recursive, only bind-mounts - // from paths within the / mount should inherit the propagation - // setting /. - info, err := mountinfo.GetMounts(mountinfo.ParentsFilter(m.Source)) - if err != nil { - return fmt.Errorf("remap mount sources: failed to parse /proc/self/mountinfo: %w", err) - } - // If there is only a single entry with a mountpoint path of /, - // the source was a child of /. Otherwise, do not set the - // propagation setting of bind-mounts. - if len(info) == 1 && info[0].Mountpoint == "/" { - logrus.Debugf("remap mount sources: applying non-recursive RootPropagation of 0x%x to child of / %q bind-mount propagation", propFlags, m.Source) - } else { - logrus.Debugf("remap mount sources: using default mount propagation for %q bind-mount", m.Source) - propFlags = 0 - } - } - if propFlags != 0 { - if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH|setattrFlags, &unix.MountAttr{ - Propagation: uint64(propFlags &^ unix.MS_REC), - }); err != nil { - return fmt.Errorf("remap mount sources: failed to set mount propagation of %q bind-mount to 0x%x: %w", m.Source, propFlags, err) - } - } - - // We have to leak these file descriptors to "runc init", which - // will mean that Go will set them as ~O_CLOEXEC during ForkExec, - // meaning that by default these would be leaked to the container - // process. - // - // While this is a bit scary, we have several processes to make - // sure this never leaks to the actual container (the SET_DUMPABLE - // bit, setting everything to O_CLOEXEC at the end of init, etc) -- - // and in addition, OPEN_TREE_CLONE file descriptors are completely - // safe to leak to the container because they are in an anonymous - // mount namespace so they cannot be used to escape the container - // a-la CVE-2016-9962. - cmd.ExtraFiles = append(cmd.ExtraFiles, mountFile) - fd := stdioFdCount + len(cmd.ExtraFiles) - 1 - logrus.Debugf("remapping mount source %s to fd %d", m.Source, fd) - c.config.Mounts[i].SourceFd = &fd - // Prepend \x00 to m.Source to make sure that attempts to operate - // on it in rootfs_linux.go fail. - c.config.Mounts[i].Source = "\x00" + m.Source - } - } - return nil -} - func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) nsMaps := make(map[configs.NamespaceType]string) @@ -801,9 +653,6 @@ func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, l nsMaps[ns.Type] = ns.Path } } - if err := c.remapMountSources(cmd); err != nil { - return nil, err - } data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps) if err != nil { return nil, err diff --git a/libcontainer/criu_linux.go b/libcontainer/criu_linux.go index ec6228399d8..7b209aa07af 100644 --- a/libcontainer/criu_linux.go +++ b/libcontainer/criu_linux.go @@ -618,8 +618,8 @@ func (c *Container) prepareCriuRestoreMounts(mounts []*configs.Mount) error { // because during initial container creation mounts are // set up in the order they are configured. if m.Device == "bind" { - if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, nil, m.Destination, dstFD, "", unix.MS_BIND|unix.MS_REC, "") + if err := utils.WithProcfd(c.config.Rootfs, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, nil, m.Destination, dstFd, "", unix.MS_BIND|unix.MS_REC, "") }); err != nil { return err } diff --git a/libcontainer/mount_linux.go b/libcontainer/mount_linux.go index 8d25f7911f8..ce71f63ad68 100644 --- a/libcontainer/mount_linux.go +++ b/libcontainer/mount_linux.go @@ -2,24 +2,46 @@ package libcontainer import ( "errors" + "fmt" "io/fs" "os" "strconv" "github.com/sirupsen/logrus" "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/userns" ) +// mountSourceType indicates what type of file descriptor is being returned. It +// is used to tell rootfs_linux.go whether or not to use move_mount(2) to +// install the mount. +type mountSourceType string + +const ( + // An open_tree(2)-style file descriptor that needs to be installed using + // move_mount(2) to install. + mountSourceOpenTree mountSourceType = "open_tree" + // A plain file descriptor that can be mounted through /proc/self/fd. + mountSourcePlain mountSourceType = "plain-open" +) + +type mountSource struct { + Type mountSourceType `json:"source_type"` + file *os.File `json:"-"` +} + // mountError holds an error from a failed mount or unmount operation. type mountError struct { - op string - source string - srcFD *int - target string - dstFD string - flags uintptr - data string - err error + op string + source string + srcFile *mountSource + target string + dstFd string + flags uintptr + data string + err error } // Error provides a string error representation. @@ -28,13 +50,14 @@ func (e *mountError) Error() string { if e.source != "" { out += "src=" + e.source + ", " - if e.srcFD != nil { - out += "srcFD=" + strconv.Itoa(*e.srcFD) + ", " + if e.srcFile != nil { + out += "srcType=" + string(e.srcFile.Type) + ", " + out += "srcFd=" + strconv.Itoa(int(e.srcFile.file.Fd())) + ", " } } out += "dst=" + e.target - if e.dstFD != "" { - out += ", dstFD=" + e.dstFD + if e.dstFd != "" { + out += ", dstFd=" + e.dstFd } if e.flags != uintptr(0) { @@ -57,73 +80,58 @@ func (e *mountError) Unwrap() error { // mount is a simple unix.Mount wrapper, returning an error with more context // in case it failed. func mount(source, target, fstype string, flags uintptr, data string) error { - return mountViaFDs(source, nil, target, "", fstype, flags, data) + return mountViaFds(source, nil, target, "", fstype, flags, data) } -// mountViaFDs is a unix.Mount wrapper which uses srcFD instead of source, -// and dstFD instead of target, unless those are empty. +// mountViaFds is a unix.Mount wrapper which uses srcFile instead of source, +// and dstFd instead of target, unless those are empty. // -// If srcFD is non-nil and flags contains MS_BIND, mountViaFDs will attempt to -// do a move_mount(2) as srcFD is assumed to be a "new mount API" file -// descriptor that cannot always be mounted using the old mount API (such as -// with bind-mounts). +// If srcFile is non-nil and flags does not contain MS_REMOUNT, mountViaFds +// will mount it according to the mountSourceType of the file descriptor. // -// The dstFD argument, if non-empty, is expected to be in the form of a path to +// The dstFd argument, if non-empty, is expected to be in the form of a path to // an opened file descriptor on procfs (i.e. "/proc/self/fd/NN"). // -// If case an FD is used instead of a source or a target path, the -// corresponding path is only used to add context to an error in case -// the mount operation has failed. -func mountViaFDs(source string, srcFD *int, target, dstFD, fstype string, flags uintptr, data string) error { +// If a file descriptor is used instead of a source or a target path, the +// corresponding path is only used to add context to an error in case the mount +// operation has failed. +func mountViaFds(source string, srcFile *mountSource, target, dstFd, fstype string, flags uintptr, data string) error { + // MS_REMOUNT and srcFile don't make sense together. + if srcFile != nil && flags&unix.MS_REMOUNT != 0 { + logrus.Debugf("mount source passed along with MS_REMOUNT -- ignoring srcFile") + srcFile = nil + } + dst := target - if dstFD != "" { - dst = dstFD + if dstFd != "" { + dst = dstFd } src := source - if srcFD != nil { - src = "/proc/self/fd/" + strconv.Itoa(*srcFD) - } - - // Do a move_mount(2) if the source is a mountfd. For MS_REMOUNT, - // move_mount(2) doesn't make sense (though we should never get into that - // case). - if srcFD != nil && flags&unix.MS_REMOUNT == 0 { - if err := unix.MoveMount(*srcFD, "", unix.AT_FDCWD, dstFD, unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS); err == nil { - // all done - return nil - } else if errors.Is(err, unix.EINVAL) || errors.Is(err, unix.ENOSYS) { - // fall back to old-school mount -- even if it's almost certainly - // not going to work... - logrus.Debugf("move_mount of srcfd-based mount failed, falling back to classic mount") - } else { - // Prettify the source argument so the user knows what - // /proc/self/fd/... refers to here. - srcLink, err := os.Readlink(src) - if err != nil { - srcLink = "" - } - return &mountError{ - op: "move_mount", - source: srcLink, - srcFD: srcFD, - target: target, - dstFD: dstFD, - flags: flags, - data: data, - err: err, - } - } + if srcFile != nil { + src = "/proc/self/fd/" + strconv.Itoa(int(srcFile.file.Fd())) + } + + var op string + var err error + if srcFile != nil && srcFile.Type == mountSourceOpenTree { + op = "move_mount" + err = unix.MoveMount(int(srcFile.file.Fd()), "", + unix.AT_FDCWD, dstFd, + unix.MOVE_MOUNT_F_EMPTY_PATH|unix.MOVE_MOUNT_T_SYMLINKS) + } else { + op = "mount" + err = unix.Mount(src, dst, fstype, flags, data) } - if err := unix.Mount(src, dst, fstype, flags, data); err != nil { + if err != nil { return &mountError{ - op: "mount", - source: source, - srcFD: srcFD, - target: target, - dstFD: dstFD, - flags: flags, - data: data, - err: err, + op: op, + source: source, + srcFile: srcFile, + target: target, + dstFd: dstFd, + flags: flags, + data: data, + err: err, } } return nil @@ -159,3 +167,75 @@ func syscallMode(i fs.FileMode) (o uint32) { // No mapping for Go's ModeTemporary (plan9 only). return } + +// mountFd creates an open_tree(2)-like mount fd from the provided +// configuration. This function must be called from within the container's +// mount namespace. +func mountFd(nsHandles *userns.Handles, m *configs.Mount) (mountSource, error) { + if !m.IsBind() { + return mountSource{}, errors.New("new mount api: only bind-mounts are supported") + } + if nsHandles == nil { + nsHandles = new(userns.Handles) + defer nsHandles.Release() + } + + var mountFile *os.File + var sourceType mountSourceType + + if m.IsBind() { + // We only need to use OPEN_TREE_CLONE in the case where we need to use + // mount_setattr(2). We are currently in the container namespace and + // there is no risk of an opened directory being used to escape the + // container. OPEN_TREE_CLONE is more expensive than open(2) because it + // requires doing mounts inside a new anonymous mount namespace. + if m.IsIDMapped() { + flags := uint(unix.OPEN_TREE_CLONE | unix.O_CLOEXEC) + if m.Flags&unix.MS_REC == unix.MS_REC { + flags |= unix.AT_RECURSIVE + } + mountFd, err := unix.OpenTree(unix.AT_FDCWD, m.Source, flags) + if err != nil { + return mountSource{}, &os.PathError{Op: "open_tree(OPEN_TREE_CLONE)", Path: m.Source, Err: err} + } + mountFile = os.NewFile(uintptr(mountFd), m.Source) + sourceType = mountSourceOpenTree + } else { + var err error + mountFile, err = os.OpenFile(m.Source, unix.O_PATH|unix.O_CLOEXEC, 0) + if err != nil { + return mountSource{}, err + } + sourceType = mountSourcePlain + } + } + + if m.IsIDMapped() { + if mountFile == nil { + return mountSource{}, fmt.Errorf("invalid mount source %q: id-mapping of non-bind-mounts is not supported", m.Source) + } + if sourceType != mountSourceOpenTree { + // should never happen + return mountSource{}, fmt.Errorf("invalid mount source %q: id-mapped target mistakenly opened without OPEN_TREE_CLONE", m.Source) + } + + usernsFile, err := nsHandles.Get(userns.Mapping{ + UIDMappings: m.UIDMappings, + GIDMappings: m.GIDMappings, + }) + if err != nil { + return mountSource{}, fmt.Errorf("failed to create userns for %s id-mapping: %w", m.Source, err) + } + defer usernsFile.Close() + if err := unix.MountSetattr(int(mountFile.Fd()), "", unix.AT_EMPTY_PATH, &unix.MountAttr{ + Attr_set: unix.MOUNT_ATTR_IDMAP, + Userns_fd: uint64(usernsFile.Fd()), + }); err != nil { + return mountSource{}, fmt.Errorf("failed to set IDMAP_SOURCE_ATTR on %s: %w", m.Source, err) + } + } + return mountSource{ + Type: sourceType, + file: mountFile, + }, nil +} diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 1f5e37bd166..fb5cdc432b6 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "context" "encoding/json" "errors" "fmt" @@ -13,16 +14,18 @@ import ( "strconv" "time" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/logs" "github.com/opencontainers/runc/libcontainer/system" + "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) type parentProcess interface { @@ -168,8 +171,11 @@ func (p *setnsProcess) start() (retErr error) { // This shouldn't happen. panic("unexpected procReady in setns") case procHooks: - // This shouldn't happen. + // this shouldn't happen. panic("unexpected procHooks in setns") + case procMountPlease: + // this shouldn't happen. + panic("unexpected procMountPlease in setns") case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { return errors.New("seccomp listenerPath is not set") @@ -361,6 +367,95 @@ func (p *initProcess) waitForChildExit(childPid int) error { return nil } +type mountSourceRequestFn func(*configs.Mount) (mountSource, error) + +// goCreateMountSources spawns a goroutine which creates open_tree(2)-style +// mountfds based on the requested configs.Mount configuration. The returned +// requestFn and cancelFn are used to interact with the goroutine. +func (p *initProcess) goCreateMountSources(ctx context.Context) (mountSourceRequestFn, context.CancelFunc, error) { + type response struct { + src mountSource + err error + } + + errCh := make(chan error, 1) + requestCh := make(chan *configs.Mount) + responseCh := make(chan response) + + ctx, cancelFn := context.WithCancel(ctx) + go func() { + // We lock this thread because we need to setns(2) here. There is no + // UnlockOSThread() here, to ensure that the Go runtime will kill this + // thread once this goroutine returns (ensuring no other goroutines run + // in this context). + runtime.LockOSThread() + + // Detach from the shared fs of the rest of the Go process. + if err := unix.Unshare(unix.CLONE_FS); err != nil { + err = os.NewSyscallError("unshare(CLONE_FS)", err) + errCh <- fmt.Errorf("mount source thread: %w", err) + } + + // Attach to the container's mount namespace. + nsFd, err := os.Open(fmt.Sprintf("/proc/%d/ns/mnt", p.pid())) + if err != nil { + errCh <- fmt.Errorf("mount source thread: open container mntns: %w", err) + return + } + defer nsFd.Close() + if err := unix.Setns(int(nsFd.Fd()), unix.CLONE_NEWNS); err != nil { + err = os.NewSyscallError("setns", err) + errCh <- fmt.Errorf("mount source thread: join container mntns: %w", err) + return + } + + // No errors during setup! + errCh <- nil + logrus.Debugf("mount source thread: successfully running in container mntns") + + nsHandles := new(userns.Handles) + defer nsHandles.Release() + for { + select { + case m := <-requestCh: + src, err := mountFd(nsHandles, m) + logrus.Debugf("mount source thread: handling request for %q: %v %v", m.Source, src, err) + responseCh <- response{ + src: src, + err: err, + } + case <-ctx.Done(): + close(requestCh) + close(responseCh) + return + } + } + }() + + // Check for setup errors. + err := <-errCh + close(errCh) + if err != nil { + cancelFn() + return nil, nil, err + } + + requestFn := func(m *configs.Mount) (mountSource, error) { + select { + case requestCh <- m: + select { + case resp := <-responseCh: + return resp.src, resp.err + case <-ctx.Done(): + return mountSource{}, errors.New("receive mount source failed: channel closed") + } + case <-ctx.Done(): + return mountSource{}, errors.New("send mount source request failed: channel closed") + } + } + return requestFn, cancelFn, nil +} + func (p *initProcess) start() (retErr error) { defer p.messageSockPair.parent.Close() //nolint: errcheck err := p.cmd.Start() @@ -451,6 +546,17 @@ func (p *initProcess) start() (retErr error) { return fmt.Errorf("error waiting for our first child to exit: %w", err) } + // Spin up a goroutine to handle remapping mount requests by runc init. There is no point doing this for rootless containers becuase + var mountRequest mountSourceRequestFn + if !p.container.config.RootlessEUID { + request, cancel, err := p.goCreateMountSources(context.Background()) + if err != nil { + return fmt.Errorf("error spawning mount remapping thread: %w", err) + } + defer cancel() + mountRequest = request + } + if err := p.createNetworkInterfaces(); err != nil { return fmt.Errorf("error creating network interfaces: %w", err) } @@ -467,6 +573,35 @@ func (p *initProcess) start() (retErr error) { ierr := parseSync(p.messageSockPair.parent, func(sync *syncT) error { switch sync.Type { + case procMountPlease: + if mountRequest == nil { + return fmt.Errorf("cannot fulfil mount requests as a rootless user") + } + var m *configs.Mount + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + if err := json.Unmarshal(*sync.Arg, &m); err != nil { + return fmt.Errorf("sync %q passed invalid mount arg: %w", sync.Type, err) + } + mnt, err := mountRequest(m) + if err != nil { + return fmt.Errorf("failed to fulfil mount request: %w", err) + } + defer mnt.file.Close() + + arg, err := json.Marshal(mnt) + if err != nil { + return fmt.Errorf("sync %q failed to marshal mountSource: %w", sync.Type, err) + } + argMsg := json.RawMessage(arg) + if err := doWriteSync(p.messageSockPair.parent, syncT{ + Type: procMountFd, + Arg: &argMsg, + File: mnt.file, + }); err != nil { + return err + } case procSeccomp: if p.config.Config.Seccomp.ListenerPath == "" { return errors.New("seccomp listenerPath is not set") diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index e08956e11e8..fb7e917832c 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -1,6 +1,7 @@ package libcontainer import ( + "encoding/json" "errors" "fmt" "os" @@ -13,16 +14,17 @@ import ( securejoin "github.com/cyphar/filepath-securejoin" "github.com/moby/sys/mountinfo" "github.com/mrunalp/fileutils" + "github.com/opencontainers/runtime-spec/specs-go" + "github.com/opencontainers/selinux/go-selinux/label" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs2" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runc/libcontainer/userns" "github.com/opencontainers/runc/libcontainer/utils" - "github.com/opencontainers/runtime-spec/specs-go" - "github.com/opencontainers/selinux/go-selinux/label" - "github.com/sirupsen/logrus" - "golang.org/x/sys/unix" ) const defaultMountFlags = unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV @@ -40,12 +42,12 @@ type mountConfig struct { // mountEntry contains mount data specific to a mount point. type mountEntry struct { *configs.Mount - srcFD *int + srcFile *mountSource } func (m *mountEntry) src() string { - if m.srcFD != nil { - return "/proc/self/fd/" + strconv.Itoa(*m.srcFD) + if m.srcFile != nil { + return "/proc/self/fd/" + strconv.Itoa(int(m.srcFile.file.Fd())) } return m.Source } @@ -79,29 +81,48 @@ func prepareRootfs(pipe *os.File, iConfig *initConfig) (err error) { } for _, m := range config.Mounts { entry := mountEntry{Mount: m} - if m.SourceFd != nil { - // m.SourceFd indicates that we have an open_tree()-like mountfd - // which was passed to us from the parent runc process, which we - // have to operate on using the new mount API. - // TODO: Remove mountEntry and srcFD. - entry.srcFD = m.SourceFd - // As an extra precaution, mark the descriptor as O_CLOEXEC. In - // theory this could be raced against and leaked in other ways, but - // we have other protections. We ignore errors. - _, _ = unix.FcntlInt(uintptr(*m.SourceFd), unix.F_SETFD, unix.O_CLOEXEC) + // Figure out whether we need to request runc to give us an + // open_tree(2)-style mountfd. For idmapped mounts, this is always + // necessary. For bind-mounts, this is only necessary if we cannot + // resolve the parent mount (this is only hit if you are running in a + // userns). + wantSourceFile := m.IsIDMapped() + if m.IsBind() { + if _, err := os.Stat(m.Source); err != nil { + wantSourceFile = true + } } - if err := mountToRootfs(mountConfig, entry); err != nil { - // Clean up error formatting for \x00 prefix entries to tell the - // user what the original mount source was. - srcName := fmt.Sprintf("%q", m.Source) - if entry.srcFD != nil { - if link, err := os.Readlink(fmt.Sprintf("/proc/self/fd/%d", *entry.srcFD)); err != nil { - srcName = fmt.Sprintf("[fd=%d]", *entry.srcFD) - } else { - srcName = fmt.Sprintf("%q[fd=%d]", link, *entry.srcFD) - } + if wantSourceFile { + // Request a source file from the host. + if err := writeSyncArg(pipe, procMountPlease, m); err != nil { + return fmt.Errorf("failed to request mountfd for %q: %w", m.Source, err) + } + sync, err := readSyncFull(pipe, procMountFd) + if err != nil { + return fmt.Errorf("mountfd request for %q failed: %w", m.Source, err) + } + if sync.File == nil { + return fmt.Errorf("mountfd request for %q: response missing attached fd", m.Source) } - return fmt.Errorf("error mounting %s to rootfs at %q: %w", srcName, m.Destination, err) + defer sync.File.Close() + // Sanity-check to make sure we didn't get the wrong fd back. This + // should never happen. + if sync.File.Name() != m.Source { + return fmt.Errorf("returned mountfd for %q doesn't match requested mount configuration: mountfd path is %q", m.Source, sync.File.Name()) + } + // Unmarshal the procMountFd argument (the file is sync.File). + var src mountSource + if sync.Arg == nil { + return fmt.Errorf("sync %q is missing an argument", sync.Type) + } + if err := json.Unmarshal(*sync.Arg, &src); err != nil { + return fmt.Errorf("invalid mount fd response argument %q: %w", string(*sync.Arg), err) + } + src.file = sync.File + entry.srcFile = &src + } + if err := mountToRootfs(mountConfig, entry); err != nil { + return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) } } @@ -282,7 +303,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { if err := os.MkdirAll(subsystemPath, 0o755); err != nil { return err } - if err := utils.WithProcfd(c.root, b.Destination, func(dstFD string) error { + if err := utils.WithProcfd(c.root, b.Destination, func(dstFd string) error { flags := defaultMountFlags if m.Flags&unix.MS_RDONLY != 0 { flags = flags | unix.MS_RDONLY @@ -295,7 +316,7 @@ func mountCgroupV1(m *configs.Mount, c *mountConfig) error { data = cgroups.CgroupNamePrefix + data source = "systemd" } - return mountViaFDs(source, nil, b.Destination, dstFD, "cgroup", uintptr(flags), data) + return mountViaFds(source, nil, b.Destination, dstFd, "cgroup", uintptr(flags), data) }); err != nil { return err } @@ -326,8 +347,8 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { if err := os.MkdirAll(dest, 0o755); err != nil { return err } - err = utils.WithProcfd(c.root, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, nil, m.Destination, dstFD, "cgroup2", uintptr(m.Flags), m.Data) + err = utils.WithProcfd(c.root, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, nil, m.Destination, dstFd, "cgroup2", uintptr(m.Flags), m.Data) }) if err == nil || !(errors.Is(err, unix.EPERM) || errors.Is(err, unix.EBUSY)) { return err @@ -348,7 +369,6 @@ func mountCgroupV2(m *configs.Mount, c *mountConfig) error { bindM.Source = c.cgroup2Path } // mountToRootfs() handles remounting for MS_RDONLY. - // No need to set mountEntry.srcFD here, because mountToRootfs() calls utils.WithProcfd() by itself in mountPropagate(). err = mountToRootfs(c, mountEntry{Mount: bindM}) if c.rootlessCgroups && errors.Is(err, unix.ENOENT) { // ENOENT (for `src = c.cgroup2Path`) happens when rootless runc is being executed @@ -393,15 +413,15 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) { } }() - return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) (Err error) { + return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) (Err error) { // Copy the container data to the host tmpdir. We append "/" to force // CopyDirectory to resolve the symlink rather than trying to copy the // symlink itself. - if err := fileutils.CopyDirectory(dstFD+"/", tmpDir); err != nil { - return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFD, tmpDir, err) + if err := fileutils.CopyDirectory(dstFd+"/", tmpDir); err != nil { + return fmt.Errorf("tmpcopyup: failed to copy %s to %s (%s): %w", m.Destination, dstFd, tmpDir, err) } // Now move the mount into the container. - if err := mountViaFDs(tmpDir, nil, m.Destination, dstFD, "", unix.MS_MOVE, ""); err != nil { + if err := mountViaFds(tmpDir, nil, m.Destination, dstFd, "", unix.MS_MOVE, ""); err != nil { return fmt.Errorf("tmpcopyup: failed to move mount: %w", err) } return nil @@ -478,7 +498,7 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { if err := prepareBindMount(m, rootfs); err != nil { return err } - // open_tree()-related shenanigans are all handled in mountViaFDs. + // open_tree()-related shenanigans are all handled in mountViaFds. if err := mountPropagate(m, rootfs, mountLabel); err != nil { return err } @@ -704,8 +724,8 @@ func bindMountDeviceNode(rootfs, dest string, node *devices.Device) error { if f != nil { _ = f.Close() } - return utils.WithProcfd(rootfs, dest, func(dstFD string) error { - return mountViaFDs(node.Path, nil, dest, dstFD, "bind", unix.MS_BIND, "") + return utils.WithProcfd(rootfs, dest, func(dstFd string) error { + return mountViaFds(node.Path, nil, dest, dstFd, "bind", unix.MS_BIND, "") }) } @@ -1076,9 +1096,9 @@ func writeSystemProperty(key, value string) error { } func remount(m mountEntry, rootfs string, noMountFallback bool) error { - return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { + return utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { flags := uintptr(m.Flags | unix.MS_REMOUNT) - err := mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "") + err := mountViaFds("", nil, m.Destination, dstFd, m.Device, flags, "") if err == nil { return nil } @@ -1102,7 +1122,7 @@ func remount(m mountEntry, rootfs string, noMountFallback bool) error { } // ... and retry the mount with flags found above. flags |= uintptr(int(s.Flags) & checkflags) - return mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "") + return mountViaFds("", nil, m.Destination, dstFd, m.Device, flags, "") }) } @@ -1125,17 +1145,17 @@ func mountPropagate(m mountEntry, rootfs string, mountLabel string) error { // mutating underneath us, we verify that we are actually going to mount // inside the container with WithProcfd() -- mounting through a procfd // mounts on the target. - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { - return mountViaFDs(m.Source, m.srcFD, m.Destination, dstFD, m.Device, uintptr(flags), data) + if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { + return mountViaFds(m.Source, m.srcFile, m.Destination, dstFd, m.Device, uintptr(flags), data) }); err != nil { return err } // We have to apply mount propagation flags in a separate WithProcfd() call // because the previous call invalidates the passed procfd -- the mount // target needs to be re-opened. - if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { + if err := utils.WithProcfd(rootfs, m.Destination, func(dstFd string) error { for _, pflag := range m.PropagationFlags { - if err := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(pflag), ""); err != nil { + if err := mountViaFds("", nil, m.Destination, dstFd, "", uintptr(pflag), ""); err != nil { return err } } diff --git a/libcontainer/sync.go b/libcontainer/sync.go index 25507e58ad9..2cb659ff94e 100644 --- a/libcontainer/sync.go +++ b/libcontainer/sync.go @@ -18,6 +18,11 @@ type syncType string // // [ child ] <-> [ parent ] // +// procMountPlease --> [open_tree(2) and configure mount] +// Arg: configs.Mount +// <-- procMountFd +// file: mountfd +// // procSeccomp --> [forward fd to listenerPath] // file: seccomp fd // --- no return synchronisation @@ -36,6 +41,8 @@ const ( procRun syncType = "procRun" procHooks syncType = "procHooks" procResume syncType = "procResume" + procMountPlease syncType = "procMountPlease" + procMountFd syncType = "procMountFd" procSeccomp syncType = "procSeccomp" procSeccompDone syncType = "procSeccompDone" )