From 4f05c4cf34639302f72874c09c7df8fbdd62a22e Mon Sep 17 00:00:00 2001 From: Fraser Tweedale Date: Wed, 16 Jun 2021 15:30:42 +1000 Subject: [PATCH] chown cgroup to process uid in container namespace Delegating cgroups to the container enables more complex workloads, including systemd-based workloads. The OCI runtime-spec was recently updated to explicitly admit such delegation, through specification of cgroup ownership semantics: https://github.com/opencontainers/runtime-spec/pull/1123 Pursuant to the updated OCI runtime-spec, change the ownership of the container's cgroup directory and particular files therein, when using cgroups v2 and when the cgroupfs is to be mounted read/write. As a result of this change, systemd workloads can run in isolated user namespaces on OpenShift when the sandbox's cgroupfs is mounted read/write. It might be possible to implement this feature in other cgroup managers, but that work is deferred. Signed-off-by: Fraser Tweedale --- libcontainer/cgroups/systemd/v2.go | 32 +++++++++++ libcontainer/configs/cgroup_linux.go | 7 +++ libcontainer/specconv/spec_linux.go | 43 ++++++++++++++ tests/integration/cgroups.bats | 83 ++++++++++++++++++++++++++++ 4 files changed, 165 insertions(+) diff --git a/libcontainer/cgroups/systemd/v2.go b/libcontainer/cgroups/systemd/v2.go index 49c0bfab623..d0d3b8acc0c 100644 --- a/libcontainer/cgroups/systemd/v2.go +++ b/libcontainer/cgroups/systemd/v2.go @@ -1,8 +1,10 @@ package systemd import ( + "bufio" "fmt" "math" + "os" "path/filepath" "strconv" "strings" @@ -288,6 +290,36 @@ func (m *unifiedManager) Apply(pid int) error { if err := fs2.CreateCgroupPath(m.path, m.cgroups); err != nil { return err } + + if c.OwnerUID != nil { + // The kernel exposes a list of files that should chowned to + // the delegate uid in /sys/kernel/cgroup/delegate. If the + // file is not present (Linux < 4.15), use the initial + // values mentioned in in cgroups(7). + filesToChown := []string{"."} // the directory itself must be chowned + cgroupDelegateFile := "/sys/kernel/cgroup/delegate" + f, err := os.Open(cgroupDelegateFile) + if err == nil { + defer f.Close() + scanner := bufio.NewScanner(f) + for scanner.Scan() { + filesToChown = append(filesToChown, scanner.Text()) + } + if err := scanner.Err(); err != nil { + return fmt.Errorf("error reading %s: %w", cgroupDelegateFile, err) + } + } else { + filesToChown = append(filesToChown, "cgroup.procs", "cgroup.subtree_control", "cgroup.threads") + } + + for _, v := range filesToChown { + err := os.Chown(m.path+"/"+v, *c.OwnerUID, -1) + if err != nil { + return err + } + } + } + return nil } diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index 25424bdcbfa..2d4a8987109 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -41,6 +41,13 @@ type Cgroup struct { // Rootless tells if rootless cgroups should be used. Rootless bool + + // The host UID that should own the cgroup, or nil to accept + // the default ownership. This should only be set when the + // cgroupfs is to be mounted read/write. + // Not all cgroup manager implementations support changing + // the ownership. + OwnerUID *int `json:"owner_uid,omitempty"` } type Resources struct { diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index cf54cf3dd5a..befda01e803 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -366,6 +366,49 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } } } + + // Set the host UID that should own the container's cgroup. + // This must be performed after setupUserNamespace, so that + // config.HostRootUID() returns the correct result. + // + // Only set it if the container will have its own cgroup + // namespace and the cgroupfs will be mounted read/write. + // + hasCgroupNS := config.Namespaces.Contains(configs.NEWCGROUP) && config.Namespaces.PathOf(configs.NEWCGROUP) == "" + hasRwCgroupfs := false + if hasCgroupNS { + for _, m := range config.Mounts { + if m.Source == "cgroup" && m.Destination == "/sys/fs/cgroup" && (m.Flags&unix.MS_RDONLY) == 0 { + hasRwCgroupfs = true + break + } + } + } + processUid := 0 + if spec.Process != nil { + // Chown the cgroup to the UID running the process, + // which is not necessarily UID 0 in the container + // namespace (e.g., an unprivileged UID in the host + // user namespace). + processUid = int(spec.Process.User.UID) + } + if hasCgroupNS && hasRwCgroupfs { + ownerUid, err := config.HostUID(processUid) + // There are two error cases; we can ignore both. + // + // 1. uidMappings is unset. Either there is no user + // namespace (fine), or it is an error (which is + // checked elsewhere). + // + // 2. The user is unmapped in the user namespace. This is an + // unusual configuration and might be an error. But it too + // will be checked elsewhere, so we can ignore it here. + // + if err == nil { + config.Cgroups.OwnerUID = &ownerUid + } + } + if spec.Process != nil { config.OomScoreAdj = spec.Process.OOMScoreAdj config.NoNewPrivileges = spec.Process.NoNewPrivileges diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index ff7cf6d0b70..17f0748a8ad 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -119,6 +119,89 @@ function setup() { # } +@test "runc exec (cgroup v2, ro cgroupfs, new cgroupns) does not chown cgroup" { + requires root cgroups_v2 systemd + + set_cgroups_path + + # configure a user namespace + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + ' + + # chown test temp dir to allow host user to read it + chown 100000 "$ROOT" + + # chown rootfs to allow host user to mkdir mount points + chown 100000 "$ROOT"/bundle/rootfs + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user +} + +@test "runc exec (cgroup v2, rw cgroupfs, inh cgroupns) does not chown cgroup" { + requires root cgroups_v2 systemd + + set_cgroups_path + set_cgroup_mount_writable + + # configure a user namespace + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + ' + + # inherit cgroup namespace (remove cgroup from namespaces list) + update_config '.linux.namespaces |= map(select(.type != "cgroup"))' + + # chown test temp dir to allow host user to read it + chown 100000 "$ROOT" + + # chown rootfs to allow host user to mkdir mount points + chown 100000 "$ROOT"/bundle/rootfs + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "nobody" ] # /sys/fs/cgroup owned by unmapped user +} + +@test "runc exec (cgroup v2, rw cgroupfs, new cgroupns) does chown cgroup" { + requires root cgroups_v2 systemd + + set_cgroups_path + set_cgroup_mount_writable + + # configure a user namespace + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65536}] + ' + + # make cgroupfs mount read/write + update_config '.mounts |= map(select(.source == "cgroup").options |= map(select(. != "ro")) + ["rw"])' + + # chown test temp dir to allow host user to read it + chown 100000 "$ROOT" + + # chown rootfs to allow host user to mkdir mount points + chown 100000 "$ROOT"/bundle/rootfs + + runc run -d --console-socket "$CONSOLE_SOCKET" test_cgroup_chown + [ "$status" -eq 0 ] + + runc exec test_cgroup_chown sh -c "stat -c %U /sys/fs/cgroup" + [ "$status" -eq 0 ] + [ "$output" = "root" ] # /sys/fs/cgroup owned by root (of user namespace) +} + @test "runc run (cgroup v1 + unified resources should fail)" { requires root cgroups_v1