diff --git a/CHANGELOG.md b/CHANGELOG.md index adcf084d0b6..ffdc9ae2fb5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -31,6 +31,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 This aligns cgroupv2 root usage more closely with cgroupv1 reporting. Additionally, report root swap usage as sum of swap and memory usage, aligned with v1 and existing non-root v2 reporting. (#3933) + * Add `swapOnlyUsage` in `MemoryStats`. This field reports swap-only usage. + For cgroupv1, `Usage` and `Failcnt` are set by subtracting memory usage + from memory+swap usage. For cgroupv2, `Usage`, `Limit`, and `MaxUsage` + are set. (#4010) ### Fixed diff --git a/contrib/completions/bash/runc b/contrib/completions/bash/runc index 782234e86d4..353c8ffdbbd 100644 --- a/contrib/completions/bash/runc +++ b/contrib/completions/bash/runc @@ -461,7 +461,6 @@ _runc_run() { --no-subreaper --no-pivot --no-new-keyring - --no-mount-fallback " local options_with_args=" @@ -568,7 +567,6 @@ _runc_create() { --help --no-pivot --no-new-keyring - --no-mount-fallback " local options_with_args=" @@ -629,7 +627,6 @@ _runc_restore() { --no-pivot --auto-dedup --lazy-pages - --no-mount-fallback " local options_with_args=" diff --git a/create.go b/create.go index 3788a532fce..97854b846cb 100644 --- a/create.go +++ b/create.go @@ -51,10 +51,6 @@ command(s) that get executed on start, edit the args parameter of the spec. 
See Name: "preserve-fds", Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", }, - cli.BoolFlag{ - Name: "no-mount-fallback", - Usage: "Do not fallback when the specific configuration is not applicable (e.g., do not try to remount a bind mount again after the first attempt failed on source filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set)", - }, }, Action: func(context *cli.Context) error { if err := checkArgs(context, 1, exactArgs); err != nil { diff --git a/docs/spec-conformance.md b/docs/spec-conformance.md index a278d76a740..c3ed8084b5e 100644 --- a/docs/spec-conformance.md +++ b/docs/spec-conformance.md @@ -7,7 +7,6 @@ The following features are not implemented yet: Spec version | Feature | PR -------------|------------------------------------------|---------------------------------------------------------- -v1.0.2 | `.linux.personality` | [#3126](https://github.com/opencontainers/runc/pull/3126) v1.1.0 | `SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV` | [#3862](https://github.com/opencontainers/runc/pull/3862) v1.1.0 | rsvd hugetlb cgroup | TODO ([#3859](https://github.com/opencontainers/runc/issues/3859)) v1.1.0 | `.process.ioPriority` | [#3783](https://github.com/opencontainers/runc/pull/3783) diff --git a/go.mod b/go.mod index 47f282633f8..ca73baf787e 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,7 @@ go 1.20 require ( github.com/checkpoint-restore/go-criu/v6 v6.3.0 - github.com/cilium/ebpf v0.12.1 + github.com/cilium/ebpf v0.12.2 github.com/containerd/console v1.0.3 github.com/coreos/go-systemd/v22 v22.5.0 github.com/cyphar/filepath-securejoin v0.2.4 diff --git a/go.sum b/go.sum index 4d23ee67114..5a7d5b70f5a 100644 --- a/go.sum +++ b/go.sum @@ -1,8 +1,8 @@ github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ= github.com/checkpoint-restore/go-criu/v6 v6.3.0 h1:mIdrSO2cPNWQY1truPg6uHLXyKHk3Z5Odx4wjKOASzA= 
github.com/checkpoint-restore/go-criu/v6 v6.3.0/go.mod h1:rrRTN/uSwY2X+BPRl/gkulo9gsKOSAeVp9/K2tv7xZI= -github.com/cilium/ebpf v0.12.1 h1:0zxmBZrItv5dgJrSVYHo36yVfJAacE7Sd1xPC3fMl4M= -github.com/cilium/ebpf v0.12.1/go.mod h1:u9H29/Iq+8cy70YqI6p5pfADkFl3vdnV2qXDg5JL0Zo= +github.com/cilium/ebpf v0.12.2 h1:cP3qL4kkl19kr/F+hKqUo9F9pPMVz1oms8C7Qj0AwWk= +github.com/cilium/ebpf v0.12.2/go.mod h1:u9H29/Iq+8cy70YqI6p5pfADkFl3vdnV2qXDg5JL0Zo= github.com/containerd/console v1.0.3 h1:lIr7SlA5PxZyMV30bDW0MGbiOPXwc63yRuCP0ARubLw= github.com/containerd/console v1.0.3/go.mod h1:7LqA/THxQ86k76b8c/EMSiaJ3h1eZkMkXar0TQ1gf3U= github.com/coreos/go-systemd/v22 v22.5.0 h1:RrqgGjYQKalulkV8NGVIfkXQf6YYmOyiJKk8iXXhfZs= diff --git a/libcontainer/cgroups/file.go b/libcontainer/cgroups/file.go index 048a0ca4bef..1a463b4a781 100644 --- a/libcontainer/cgroups/file.go +++ b/libcontainer/cgroups/file.go @@ -49,24 +49,13 @@ func WriteFile(dir, file, data string) error { return err } defer fd.Close() - if err := retryingWriteFile(fd, data); err != nil { + if _, err := fd.WriteString(data); err != nil { // Having data in the error message helps in debugging. return fmt.Errorf("failed to write %q: %w", data, err) } return nil } -func retryingWriteFile(fd *os.File, data string) error { - for { - _, err := fd.Write([]byte(data)) - if errors.Is(err, unix.EINTR) { - logrus.Infof("interrupted while writing %s to %s", data, fd.Name()) - continue - } - return err - } -} - const ( cgroupfsDir = "/sys/fs/cgroup" cgroupfsPrefix = cgroupfsDir + "/" diff --git a/libcontainer/cgroups/fs/fs.go b/libcontainer/cgroups/fs/fs.go index e2c425d0cd0..d2decb127ca 100644 --- a/libcontainer/cgroups/fs/fs.go +++ b/libcontainer/cgroups/fs/fs.go @@ -191,7 +191,7 @@ func (m *Manager) Set(r *configs.Resources) error { if path == "" { // We never created a path for this cgroup, so we cannot set // limits for it (though we have already tried at this point). 
- return fmt.Errorf("cannot set %s limit: container could not join or create cgroup, and the error is %w", sys.Name(), err) + return fmt.Errorf("cannot set %s limit: container could not join or create cgroup", sys.Name()) } return err } diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index a0e78074980..783566d68f0 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -170,6 +170,10 @@ func (s *MemoryGroup) GetStats(path string, stats *cgroups.Stats) error { return err } stats.MemoryStats.SwapUsage = swapUsage + stats.MemoryStats.SwapOnlyUsage = cgroups.MemoryData{ + Usage: swapUsage.Usage - memoryUsage.Usage, + Failcnt: swapUsage.Failcnt - memoryUsage.Failcnt, + } kernelUsage, err := getMemoryData(path, "kmem") if err != nil { return err diff --git a/libcontainer/cgroups/fs/memory_test.go b/libcontainer/cgroups/fs/memory_test.go index d305a62a393..95e9d3cbaa4 100644 --- a/libcontainer/cgroups/fs/memory_test.go +++ b/libcontainer/cgroups/fs/memory_test.go @@ -249,12 +249,13 @@ func TestMemoryStats(t *testing.T) { t.Fatal(err) } expectedStats := cgroups.MemoryStats{ - Cache: 512, - Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, - SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, - KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, - Stats: map[string]uint64{"cache": 512, "rss": 1024}, - UseHierarchy: true, + Cache: 512, + Usage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + SwapOnlyUsage: cgroups.MemoryData{Usage: 0, MaxUsage: 0, Failcnt: 0, Limit: 0}, + KernelUsage: cgroups.MemoryData{Usage: 2048, MaxUsage: 4096, Failcnt: 100, Limit: 8192}, + Stats: map[string]uint64{"cache": 512, "rss": 1024}, + UseHierarchy: true, PageUsageByNUMA: cgroups.PageUsageByNUMA{ 
PageUsageByNUMAInner: cgroups.PageUsageByNUMAInner{ Total: cgroups.PageStats{Total: 44611, Nodes: map[uint8]uint64{0: 32631, 1: 7501, 2: 1982, 3: 2497}}, diff --git a/libcontainer/cgroups/fs2/fs2.go b/libcontainer/cgroups/fs2/fs2.go index 47b67afc2a1..0760be74b97 100644 --- a/libcontainer/cgroups/fs2/fs2.go +++ b/libcontainer/cgroups/fs2/fs2.go @@ -133,6 +133,10 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { if err := fscommon.RdmaGetStats(m.dirPath, st); err != nil && !os.IsNotExist(err) { errs = append(errs, err) } + // misc (since kernel 5.13) + if err := statMisc(m.dirPath, st); err != nil && !os.IsNotExist(err) { + errs = append(errs, err) + } if len(errs) > 0 && !m.config.Rootless { return st, fmt.Errorf("error while statting cgroup v2: %+v", errs) } diff --git a/libcontainer/cgroups/fs2/memory.go b/libcontainer/cgroups/fs2/memory.go index 85e96b1ce98..29656597423 100644 --- a/libcontainer/cgroups/fs2/memory.go +++ b/libcontainer/cgroups/fs2/memory.go @@ -105,7 +105,7 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { memoryUsage, err := getMemoryDataV2(dirPath, "") if err != nil { if errors.Is(err, unix.ENOENT) && dirPath == UnifiedMountpoint { - // The root cgroup does not have memory.{current,max} + // The root cgroup does not have memory.{current,max,peak} // so emulate those using data from /proc/meminfo and // /sys/fs/cgroup/memory.stat return rootStatsFromMeminfo(stats) @@ -113,10 +113,12 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { return err } stats.MemoryStats.Usage = memoryUsage - swapUsage, err := getMemoryDataV2(dirPath, "swap") + swapOnlyUsage, err := getMemoryDataV2(dirPath, "swap") if err != nil { return err } + stats.MemoryStats.SwapOnlyUsage = swapOnlyUsage + swapUsage := swapOnlyUsage // As cgroup v1 reports SwapUsage values as mem+swap combined, // while in cgroup v2 swap values do not include memory, // report combined mem+swap for v1 compatibility. 
@@ -124,6 +126,9 @@ func statMemory(dirPath string, stats *cgroups.Stats) error { if swapUsage.Limit != math.MaxUint64 { swapUsage.Limit += memoryUsage.Limit } + // The `MaxUsage` of mem+swap cannot simply combine mem with + // swap. So set it to 0 for v1 compatibility. + swapUsage.MaxUsage = 0 stats.MemoryStats.SwapUsage = swapUsage return nil @@ -138,6 +143,7 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { } usage := moduleName + ".current" limit := moduleName + ".max" + maxUsage := moduleName + ".peak" value, err := fscommon.GetCgroupParamUint(path, usage) if err != nil { @@ -157,6 +163,14 @@ func getMemoryDataV2(path, name string) (cgroups.MemoryData, error) { } memoryData.Limit = value + // `memory.peak` since kernel 5.19 + // `memory.swap.peak` since kernel 6.5 + value, err = fscommon.GetCgroupParamUint(path, maxUsage) + if err != nil && !os.IsNotExist(err) { + return cgroups.MemoryData{}, err + } + memoryData.MaxUsage = value + return memoryData, nil } diff --git a/libcontainer/cgroups/fs2/memory_test.go b/libcontainer/cgroups/fs2/memory_test.go index 2e2713c29ae..89c999d0cee 100644 --- a/libcontainer/cgroups/fs2/memory_test.go +++ b/libcontainer/cgroups/fs2/memory_test.go @@ -94,6 +94,10 @@ func TestStatMemoryPodCgroup(t *testing.T) { t.Fatal(err) } + if err := os.WriteFile(filepath.Join(fakeCgroupDir, "memory.peak"), []byte("987654321"), 0o644); err != nil { + t.Fatal(err) + } + gotStats := cgroups.NewStats() // use a fake root path to trigger the pod cgroup lookup. 
@@ -107,6 +111,18 @@ func TestStatMemoryPodCgroup(t *testing.T) { if gotStats.MemoryStats.Usage.Usage != expectedUsageBytes { t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Usage, expectedUsageBytes) } + + // result should be "memory.max" + var expectedLimitBytes uint64 = 999999999 + if gotStats.MemoryStats.Usage.Limit != expectedLimitBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.Limit, expectedLimitBytes) + } + + // result should be "memory.peak" + var expectedMaxUsageBytes uint64 = 987654321 + if gotStats.MemoryStats.Usage.MaxUsage != expectedMaxUsageBytes { + t.Errorf("parsed cgroupv2 memory.stat doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MemoryStats.Usage.MaxUsage, expectedMaxUsageBytes) + } } func TestRootStatsFromMeminfo(t *testing.T) { diff --git a/libcontainer/cgroups/fs2/misc.go b/libcontainer/cgroups/fs2/misc.go new file mode 100644 index 00000000000..f0b292aa015 --- /dev/null +++ b/libcontainer/cgroups/fs2/misc.go @@ -0,0 +1,52 @@ +package fs2 + +import ( + "bufio" + "os" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" +) + +func statMisc(dirPath string, stats *cgroups.Stats) error { + for _, file := range []string{"current", "events"} { + fd, err := cgroups.OpenFile(dirPath, "misc."+file, os.O_RDONLY) + if err != nil { + return err + } + + s := bufio.NewScanner(fd) + for s.Scan() { + key, value, err := fscommon.ParseKeyValue(s.Text()) + if err != nil { + fd.Close() + return err + } + + key = strings.TrimSuffix(key, ".max") + + if _, ok := stats.MiscStats[key]; !ok { + stats.MiscStats[key] = cgroups.MiscStats{} + } + + tmp := stats.MiscStats[key] + + switch file { + case "current": + tmp.Usage = value + case "events": + tmp.Events = value + } + + stats.MiscStats[key] = tmp + } + 
fd.Close() + + if err := s.Err(); err != nil { + return err + } + } + + return nil +} diff --git a/libcontainer/cgroups/fs2/misc_test.go b/libcontainer/cgroups/fs2/misc_test.go new file mode 100644 index 00000000000..a3f82395196 --- /dev/null +++ b/libcontainer/cgroups/fs2/misc_test.go @@ -0,0 +1,103 @@ +package fs2 + +import ( + "os" + "path/filepath" + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +const exampleMiscCurrentData = `res_a 123 +res_b 456 +res_c 42` + +const exampleMiscEventsData = `res_a.max 1 +res_b.max 2 +res_c.max 3` + +func TestStatMiscPodCgroupEmpty(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + // create empty misc.current and misc.events files to test the common case + // where no misc resource keys are available + for _, file := range []string{"misc.current", "misc.events"} { + if _, err := os.Create(filepath.Join(fakeCgroupDir, file)); err != nil { + t.Fatal(err) + } + } + + gotStats := cgroups.NewStats() + + err := statMisc(fakeCgroupDir, gotStats) + if err != nil { + t.Errorf("expected no error when statting empty misc.current/misc.events for cgroupv2, but got %#v", err) + } + + if len(gotStats.MiscStats) != 0 { + t.Errorf("parsed cgroupv2 misc.* returns unexpected resources: got %#v but expected nothing", gotStats.MiscStats) + } +} + +func TestStatMiscPodCgroupNotFound(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + // only write misc.current to ensure pod cgroup usage + // still reads misc.events. + statPath := filepath.Join(fakeCgroupDir, "misc.current") + if err := os.WriteFile(statPath, []byte(exampleMiscCurrentData), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to mismatch the file we wrote. + // this triggers the non-root path which should fail to find misc.events. 
+ err := statMisc(fakeCgroupDir, gotStats) + if err == nil { + t.Errorf("expected error when statting misc.current for cgroupv2 root, but was nil") + } + + if !strings.Contains(err.Error(), "misc.events: no such file or directory") { + t.Errorf("expected error to contain 'misc.events: no such file or directory', but was %s", err.Error()) + } +} + +func TestStatMiscPodCgroup(t *testing.T) { + // We're using a fake cgroupfs. + cgroups.TestMode = true + fakeCgroupDir := t.TempDir() + + currentPath := filepath.Join(fakeCgroupDir, "misc.current") + if err := os.WriteFile(currentPath, []byte(exampleMiscCurrentData), 0o644); err != nil { + t.Fatal(err) + } + + eventsPath := filepath.Join(fakeCgroupDir, "misc.events") + if err := os.WriteFile(eventsPath, []byte(exampleMiscEventsData), 0o644); err != nil { + t.Fatal(err) + } + + gotStats := cgroups.NewStats() + + // use a fake root path to trigger the pod cgroup lookup. + err := statMisc(fakeCgroupDir, gotStats) + if err != nil { + t.Errorf("expected no error when statting misc for cgroupv2 root, but got %#+v", err) + } + + // make sure all res_* from exampleMisc*Data are returned + if len(gotStats.MiscStats) != 3 { + t.Errorf("parsed cgroupv2 misc doesn't return all expected resources: \ngot %#v\nexpected %#v\n", len(gotStats.MiscStats), 3) + } + + var expectedUsageBytes uint64 = 42 + if gotStats.MiscStats["res_c"].Usage != expectedUsageBytes { + t.Errorf("parsed cgroupv2 misc.current for res_c doesn't match expected result: \ngot %#v\nexpected %#v\n", gotStats.MiscStats["res_c"].Usage, expectedUsageBytes) + } +} diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go index 8ff1fbb52bb..b475567d821 100644 --- a/libcontainer/cgroups/stats.go +++ b/libcontainer/cgroups/stats.go @@ -91,6 +91,8 @@ type MemoryStats struct { Usage MemoryData `json:"usage,omitempty"` // usage of memory + swap SwapUsage MemoryData `json:"swap_usage,omitempty"` + // usage of swap only + SwapOnlyUsage MemoryData 
`json:"swap_only_usage,omitempty"` // usage of kernel memory KernelUsage MemoryData `json:"kernel_usage,omitempty"` // usage of kernel TCP memory @@ -170,6 +172,13 @@ type RdmaStats struct { RdmaCurrent []RdmaEntry `json:"rdma_current,omitempty"` } +type MiscStats struct { + // current resource usage for a key in misc + Usage uint64 `json:"usage,omitempty"` + // number of times the resource usage was about to go over the max boundary + Events uint64 `json:"events,omitempty"` +} + type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` @@ -179,10 +188,13 @@ type Stats struct { // the map is in the format "size of hugepage: stats of the hugepage" HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` RdmaStats RdmaStats `json:"rdma_stats,omitempty"` + // the map is in the format "misc resource name: stats of the key" + MiscStats map[string]MiscStats `json:"misc_stats,omitempty"` } func NewStats() *Stats { memoryStats := MemoryStats{Stats: make(map[string]uint64)} hugetlbStats := make(map[string]HugetlbStats) - return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats} + miscStats := make(map[string]MiscStats) + return &Stats{MemoryStats: memoryStats, HugetlbStats: hugetlbStats, MiscStats: miscStats} } diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 1ece49c3732..722e1dcad61 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -214,15 +214,14 @@ type Config struct { // When RootlessCgroups is set, cgroups errors are ignored. RootlessCgroups bool `json:"rootless_cgroups,omitempty"` - // Do not try to remount a bind mount again after the first attempt failed on source - // filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set - NoMountFallback bool `json:"no_mount_fallback,omitempty"` - // TimeOffsets specifies the offset for supporting time namespaces. 
TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"` // Scheduler represents the scheduling attributes for a process. Scheduler *Scheduler `json:"scheduler,omitempty"` + + // Personality contains configuration for the Linux personality syscall. + Personality *LinuxPersonality `json:"personality,omitempty"` } // Scheduler is based on the Linux sched_setattr(2) syscall. diff --git a/libcontainer/configs/config_linux.go b/libcontainer/configs/config_linux.go index 4e58bb39630..1900fc978f4 100644 --- a/libcontainer/configs/config_linux.go +++ b/libcontainer/configs/config_linux.go @@ -9,6 +9,19 @@ var ( errNoGroupMap = errors.New("User namespaces enabled, but no group mapping found.") ) +// Please check https://man7.org/linux/man-pages/man2/personality.2.html for const details. +// https://raw.githubusercontent.com/torvalds/linux/master/include/uapi/linux/personality.h +const ( + PerLinux = 0x0000 + PerLinux32 = 0x0008 +) + +type LinuxPersonality struct { + // Domain for the personality + // can only contain values "LINUX" and "LINUX32" + Domain int `json:"domain"` +} + // HostUID gets the translated uid for the process on host which could be // different when user namespaces are enabled. func (c Config) HostUID(containerId int) (int, error) { diff --git a/libcontainer/configs/mount_linux.go b/libcontainer/configs/mount_linux.go index 6d4106de0c6..3f489295d97 100644 --- a/libcontainer/configs/mount_linux.go +++ b/libcontainer/configs/mount_linux.go @@ -15,6 +15,10 @@ type Mount struct { // Mount flags. Flags int `json:"flags"` + // Mount flags that were explicitly cleared in the configuration (meaning + // the user explicitly requested that these flags *not* be set). 
+ ClearedFlags int `json:"cleared_flags"` + // Propagation Flags PropagationFlags []int `json:"propagation_flags"` diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index b9affb91c4b..624e0199e9a 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -654,6 +654,10 @@ func setupScheduler(config *configs.Config) error { return nil } +func setupPersonality(config *configs.Config) error { + return system.SetLinuxPersonality(config.Personality.Domain) +} + // signalAllProcesses freezes then iterates over all the processes inside the // manager's cgroups sending the signal s to them. func signalAllProcesses(m cgroups.Manager, s unix.Signal) error { diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index eafd6c82d05..a2e41ea5638 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -34,7 +34,6 @@ type mountConfig struct { cgroup2Path string rootlessCgroups bool cgroupns bool - noMountFallback bool } // mountEntry contains mount data specific to a mount point. @@ -83,7 +82,6 @@ func prepareRootfs(pipe *syncSocket, iConfig *initConfig, mountFds mountFds) (er cgroup2Path: iConfig.Cgroup2Path, rootlessCgroups: iConfig.RootlessCgroups, cgroupns: config.Namespaces.Contains(configs.NEWCGROUP), - noMountFallback: config.NoMountFallback, } for i, m := range config.Mounts { entry := mountEntry{Mount: m} @@ -409,6 +407,51 @@ func doTmpfsCopyUp(m mountEntry, rootfs, mountLabel string) (Err error) { }) } +const ( + // The atime "enum" flags (which are mutually exclusive). + mntAtimeEnumFlags = unix.MS_NOATIME | unix.MS_RELATIME | unix.MS_STRICTATIME + // All atime-related flags. + mntAtimeFlags = mntAtimeEnumFlags | unix.MS_NODIRATIME + // Flags which can be locked when inheriting mounts in a different userns. + // In the kernel, these are the mounts that are locked using MNT_LOCK_*. 
+ mntLockFlags = unix.MS_RDONLY | unix.MS_NODEV | unix.MS_NOEXEC | + unix.MS_NOSUID | mntAtimeFlags +) + +func statfsToMountFlags(st unix.Statfs_t) int { + // From <linux/statfs.h>. + const ST_NOSYMFOLLOW = 0x2000 //nolint:revive + + var flags int + for _, f := range []struct { + st, ms int + }{ + // See calculate_f_flags() in fs/statfs.c. + {unix.ST_RDONLY, unix.MS_RDONLY}, + {unix.ST_NOSUID, unix.MS_NOSUID}, + {unix.ST_NODEV, unix.MS_NODEV}, + {unix.ST_NOEXEC, unix.MS_NOEXEC}, + {unix.ST_MANDLOCK, unix.MS_MANDLOCK}, + {unix.ST_SYNCHRONOUS, unix.MS_SYNCHRONOUS}, + {unix.ST_NOATIME, unix.MS_NOATIME}, + {unix.ST_NODIRATIME, unix.MS_NODIRATIME}, + {unix.ST_RELATIME, unix.MS_RELATIME}, + {ST_NOSYMFOLLOW, unix.MS_NOSYMFOLLOW}, + // There is no ST_STRICTATIME -- see below. + } { + if int(st.Flags)&f.st == f.st { + flags |= f.ms + } + } + // MS_STRICTATIME is a "fake" MS_* flag. It isn't stored in mnt->mnt_flags, + // and so it doesn't show up in statfs(2). If none of the other flags in + // atime enum are present, the mount is MS_STRICTATIME. + if flags&mntAtimeEnumFlags == 0 { + flags |= unix.MS_STRICTATIME + } + return flags +} + func mountToRootfs(c *mountConfig, m mountEntry) error { rootfs := c.root @@ -509,11 +552,97 @@ func mountToRootfs(c *mountConfig, m mountEntry) error { return err } } - // bind mount won't change mount options, we need remount to make mount options effective. - // first check that we have non-default options required before attempting a remount - if m.Flags&^(unix.MS_REC|unix.MS_REMOUNT|unix.MS_BIND) != 0 { - // only remount if unique mount options are set - if err := remount(m, rootfs, c.noMountFallback); err != nil { + + // The initial MS_BIND won't change the mount options, we need to do a + // separate MS_BIND|MS_REMOUNT to apply the mount options. We skip + // doing this if the user has not specified any mount flags at all + // (including cleared flags) -- in which case we just keep the original + // mount flags.
+ // + // Note that the fact we check whether any clearing flags are set is in + // contrast to mount(8)'s current behaviour, but is what users probably + // expect. See <https://github.com/util-linux/util-linux/issues/2433>. + if m.Flags & ^(unix.MS_BIND|unix.MS_REC|unix.MS_REMOUNT) != 0 || m.ClearedFlags != 0 { + if err := utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { + flags := m.Flags | unix.MS_BIND | unix.MS_REMOUNT + // The runtime-spec says we SHOULD map to the relevant mount(8) + // behaviour. However, it's not clear whether we want the + // "mount --bind -o ..." or "mount --bind -o remount,..." + // behaviour here -- both of which are somewhat broken[1]. + // + // So, if the user has passed "remount" as a mount option, we + // implement the "mount --bind -o remount" behaviour, otherwise + // we implement the spiritual intent of the "mount --bind -o" + // behaviour, which should match what users expect. Maybe + // mount(8) will eventually implement this behaviour too.. + // + // [1]: https://github.com/util-linux/util-linux/issues/2433 + + // Initially, we emulate "mount --bind -o ..." where we set + // only the requested flags (clearing any existing flags). The + // only difference from mount(8) is that we do this + // unconditionally, regardless of whether any set-me mount + // options have been requested. + // + // TODO: We are not doing any special handling of the atime + // flags here, which means that the mount will inherit the old + // atime flags if the user didn't explicitly request a + // different set of flags. This also has the mount(8) bug where + // "nodiratime,norelatime" will result in a + // "nodiratime,relatime" mount. + mountErr := mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "") + if mountErr == nil { + return nil + } + + // If the mount failed, the mount may contain locked mount + // flags.
In that case, we emulate "mount --bind -o + // remount,...", where we take the existing mount flags of the + // mount and apply the request flags (including clearing flags) + // on top. The main divergence we have from mount(8) here is + // that we handle atimes correctly to make sure we error out if + // we cannot fulfil the requested mount flags. + + var st unix.Statfs_t + if err := unix.Statfs(m.src(), &st); err != nil { + return &os.PathError{Op: "statfs", Path: m.src(), Err: err} + } + srcFlags := statfsToMountFlags(st) + // If the user explicitly request one of the locked flags *not* + // be set, we need to return an error to avoid producing mounts + // that don't match the user's request. + if srcFlags&m.ClearedFlags&mntLockFlags != 0 { + return mountErr + } + + // If an MS_*ATIME flag was requested, it must match the + // existing one. This handles two separate kernel bugs, and + // matches the logic of can_change_locked_flags() but without + // these bugs: + // + // * (2.6.30+) Since commit 613cbe3d4870 ("Don't set relatime + // when noatime is specified"), MS_RELATIME is ignored when + // MS_NOATIME is set. This means that us inheriting MS_NOATIME + // from a mount while requesting MS_RELATIME would *silently* + // produce an MS_NOATIME mount. + // + // * (2.6.30+) Since its introduction in commit d0adde574b84 + // ("Add a strictatime mount option"), MS_STRICTATIME has + // caused any passed MS_RELATIME and MS_NOATIME flags to be + // ignored which results in us *silently* producing + // MS_STRICTATIME mounts even if the user requested MS_RELATIME + // or MS_NOATIME. + if m.Flags&mntAtimeFlags != 0 && m.Flags&mntAtimeFlags != srcFlags&mntAtimeFlags { + return mountErr + } + + // Retry the mount with the existing lockable mount flags + // applied. 
+ flags |= srcFlags & mntLockFlags + mountErr = mountViaFDs("", nil, m.Destination, dstFD, "", uintptr(flags), "") + logrus.Debugf("remount retry: srcFlags=0x%x flagsSet=0x%x flagsClr=0x%x: %v", srcFlags, m.Flags, m.ClearedFlags, mountErr) + return mountErr + }); err != nil { return err } } @@ -1103,37 +1232,6 @@ func writeSystemProperty(key, value string) error { return os.WriteFile(path.Join("/proc/sys", keyPath), []byte(value), 0o644) } -func remount(m mountEntry, rootfs string, noMountFallback bool) error { - return utils.WithProcfd(rootfs, m.Destination, func(dstFD string) error { - flags := uintptr(m.Flags | unix.MS_REMOUNT) - err := mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "") - if err == nil { - return nil - } - // Check if the source has flags set according to noMountFallback - src := m.src() - var s unix.Statfs_t - if err := unix.Statfs(src, &s); err != nil { - return &os.PathError{Op: "statfs", Path: src, Err: err} - } - var checkflags int - if noMountFallback { - // Check for ro only - checkflags = unix.MS_RDONLY - } else { - // Check for ro, nodev, noexec, nosuid, noatime, relatime, strictatime, - // nodiratime - checkflags = unix.MS_RDONLY | unix.MS_NODEV | unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NOATIME | unix.MS_RELATIME | unix.MS_STRICTATIME | unix.MS_NODIRATIME - } - if int(s.Flags)&checkflags == 0 { - return err - } - // ... and retry the mount with flags found above. - flags |= uintptr(int(s.Flags) & checkflags) - return mountViaFDs("", nil, m.Destination, dstFD, m.Device, flags, "") - }) -} - // Do the mount operation followed by additional mounts required to take care // of propagation flags. This will always be scoped inside the container rootfs. 
func mountPropagate(m mountEntry, rootfs string, mountLabel string) error { diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go index f3edb7100c8..171bc3b5908 100644 --- a/libcontainer/setns_init_linux.go +++ b/libcontainer/setns_init_linux.go @@ -93,6 +93,11 @@ func (l *linuxSetnsInit) Init() error { if err := apparmor.ApplyProfile(l.config.AppArmorProfile); err != nil { return err } + if l.config.Config.Personality != nil { + if err := setupPersonality(l.config.Config); err != nil { + return err + } + } // Check for the arg early to make sure it exists. name, err := exec.LookPath(l.config.Args[0]) if err != nil { diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index c5553832776..c1632ac12ad 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -313,7 +313,6 @@ type CreateOpts struct { Spec *specs.Spec RootlessEUID bool RootlessCgroups bool - NoMountFallback bool } // getwd is a wrapper similar to os.Getwd, except it always gets @@ -359,7 +358,6 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { NoNewKeyring: opts.NoNewKeyring, RootlessEUID: opts.RootlessEUID, RootlessCgroups: opts.RootlessCgroups, - NoMountFallback: opts.NoMountFallback, } for _, m := range spec.Mounts { @@ -436,6 +434,18 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { MemBwSchema: spec.Linux.IntelRdt.MemBwSchema, } } + if spec.Linux.Personality != nil { + if len(spec.Linux.Personality.Flags) > 0 { + logrus.Warnf("ignoring unsupported personality flags: %+v because personality flag has not supported at this time", spec.Linux.Personality.Flags) + } + domain, err := getLinuxPersonalityFromStr(string(spec.Linux.Personality.Domain)) + if err != nil { + return nil, err + } + config.Personality = &configs.LinuxPersonality{ + Domain: domain, + } + } } // Set the host UID that should own the container's cgroup. 
@@ -573,6 +583,16 @@ func checkPropertyName(s string) error { return nil } +// getLinuxPersonalityFromStr converts the string domain received from spec to equivalent integer. +func getLinuxPersonalityFromStr(domain string) (int, error) { + if domain == string(specs.PerLinux32) { + return configs.PerLinux32, nil + } else if domain == string(specs.PerLinux) { + return configs.PerLinux, nil + } + return -1, fmt.Errorf("invalid personality domain %s", domain) +} + // Some systemd properties are documented as having "Sec" suffix // (e.g. TimeoutStopSec) but are expected to have "USec" suffix // here, so let's provide conversion to improve compatibility. @@ -977,10 +997,15 @@ func parseMountOptions(options []string) *configs.Mount { // or the flag is not supported on the platform, // then it is a data value for a specific fs type. if f, exists := mountFlags[o]; exists && f.flag != 0 { + // FIXME: The *atime flags are special (they are more of an enum + // with quite hairy semantics) and thus arguably setting some of + // them should clear unrelated flags. if f.clear { m.Flags &= ^f.flag + m.ClearedFlags |= f.flag } else { m.Flags |= f.flag + m.ClearedFlags &= ^f.flag } } else if f, exists := mountPropagationMapping[o]; exists && f != 0 { m.PropagationFlags = append(m.PropagationFlags, f) diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 4fab50c0581..d0e01545ef9 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -233,6 +233,14 @@ func (l *linuxStandardInit) Init() error { return err } } + + // Set personality if specified. + if l.config.Config.Personality != nil { + if err := setupPersonality(l.config.Config); err != nil { + return err + } + } + // Close the pipe to signal that we have completed our init. 
logrus.Debugf("init: closing the pipe to signal completion") _ = l.pipe.Close() diff --git a/libcontainer/system/linux.go b/libcontainer/system/linux.go index 318b6edfe81..0b3b4da33d7 100644 --- a/libcontainer/system/linux.go +++ b/libcontainer/system/linux.go @@ -214,3 +214,13 @@ func Copy(dst io.Writer, src io.Reader) (copied int64, err error) { fallback: return io.Copy(dst, src) } + +// SetLinuxPersonality sets the Linux execution personality. For more information see the personality syscall documentation. +// checkout getLinuxPersonalityFromStr() from libcontainer/specconv/spec_linux.go for type conversion. +func SetLinuxPersonality(personality int) error { + _, _, errno := unix.Syscall(unix.SYS_PERSONALITY, uintptr(personality), 0, 0) + if errno != 0 { + return &os.SyscallError{Syscall: "set_personality", Err: errno} + } + return nil +} diff --git a/restore.go b/restore.go index de5b48d54c2..d65afcfc788 100644 --- a/restore.go +++ b/restore.go @@ -98,10 +98,6 @@ using the runc checkpoint command.`, Value: "", Usage: "Specify an LSM mount context to be used during restore.", }, - cli.BoolFlag{ - Name: "no-mount-fallback", - Usage: "Do not fallback when the specific configuration is not applicable (e.g., do not try to remount a bind mount again after the first attempt failed on source filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set)", - }, }, Action: func(context *cli.Context) error { if err := checkArgs(context, 1, exactArgs); err != nil { diff --git a/run.go b/run.go index 8b4f4d1fb23..82781669d10 100644 --- a/run.go +++ b/run.go @@ -64,10 +64,6 @@ command(s) that get executed on start, edit the args parameter of the spec. 
See Name: "preserve-fds", Usage: "Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total)", }, - cli.BoolFlag{ - Name: "no-mount-fallback", - Usage: "Do not fallback when the specific configuration is not applicable (e.g., do not try to remount a bind mount again after the first attempt failed on source filesystems that have nodev, noexec, nosuid, noatime, relatime, strictatime, nodiratime set)", - }, }, Action: func(context *cli.Context) error { if err := checkArgs(context, 1, exactArgs); err != nil { diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index 8daf7420d0f..790108ba0b4 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -243,7 +243,7 @@ convert_hugetlb_size() { [ "$status" -eq 0 ] lim="max" - [ -v CGROUP_V1 ] && lim=".limit_in_bytes" + [ -v CGROUP_V1 ] && lim="limit_in_bytes" optional=("") # Add rsvd, if available. diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index 7e6399a47b8..811f817ab44 100755 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -260,16 +260,11 @@ function get_cgroup_value() { # Helper to check a if value in a cgroup file matches the expected one. function check_cgroup_value() { local current - local cgroup - cgroup="$(get_cgroup_path "$1")" - if [ ! -f "$cgroup/$1" ]; then - skip "$cgroup/$1 does not exist" - fi current="$(get_cgroup_value "$1")" local expected=$2 echo "current $current !? $expected" - [ "$current" = "$expected" ] || [ "$current" = "$((expected / 1000))" ] + [ "$current" = "$expected" ] } # Helper to check a value in systemd. 
@@ -318,6 +313,7 @@ function check_cpu_quota() { function check_cpu_burst() { local burst=$1 if [ -v CGROUP_V2 ]; then + burst=$((burst / 1000)) check_cgroup_value "cpu.max.burst" "$burst" else check_cgroup_value "cpu.cfs_burst_us" "$burst" @@ -438,6 +434,20 @@ function requires() { skip_me=1 fi ;; + cgroups_cpu_burst) + local p f + init_cgroup_paths + if [ -v CGROUP_V1 ]; then + p="$CGROUP_CPU_BASE_PATH" + f="cpu.cfs_burst_us" + elif [ -v CGROUP_V2 ]; then + p="$CGROUP_BASE_PATH" + f="cpu.max.burst" + fi + if [ -z "$(find "$p" -name "$f" -print -quit)" ]; then + skip_me=1 + fi + ;; cgroupns) if [ ! -e "/proc/self/ns/cgroup" ]; then skip_me=1 diff --git a/tests/integration/mounts_sshfs.bats b/tests/integration/mounts_sshfs.bats index 540403e4051..0bef01784f3 100644 --- a/tests/integration/mounts_sshfs.bats +++ b/tests/integration/mounts_sshfs.bats @@ -4,107 +4,438 @@ load helpers function setup() { setup_busybox - update_config '.process.args = ["/bin/echo", "Hello World"]' } function teardown() { - # Some distros do not have fusermount installed - # as a dependency of fuse-sshfs, and good ol' umount works. - fusermount -u "$DIR" || umount "$DIR" + if [ -v DIR ]; then + # Some distros do not have fusermount installed + # as a dependency of fuse-sshfs, and good ol' umount works. 
+ fusermount -u "$DIR" || umount "$DIR" + unset DIR + fi teardown_bundle } +function sshfs_has_flag() { + if [ -v DIR ]; then + awk '$2 == "'"$DIR"'" { print $4 }' &2 + awk '$2 == "'"$DIR"'"' &2 } -@test "runc run [rw bind mount of a ro fuse sshfs mount]" { - setup_sshfs "ro" - update_config ' .mounts += [{ - type: "bind", - source: "'"$DIR"'", - destination: "/mnt", - options: ["rw", "rprivate", "nosuid", "nodev", "rbind"] - }]' +function setup_sshfs_bind_flags() { + host_flags="$1" # ro,nodev,nosuid + bind_flags="$2" # ro,nosuid,bind - runc run --no-mount-fallback test_busybox - [ "$status" -eq 0 ] -} + setup_sshfs "$host_flags" -@test "runc run [dev,exec,suid,atime bind mount of a nodev,nosuid,noexec,noatime fuse sshfs mount]" { - setup_sshfs "nodev,nosuid,noexec,noatime" - # The "sync" option is used to trigger a remount with the below options. - # It serves no further purpose. Otherwise only a bind mount without - # applying the below options will be done. - update_config ' .mounts += [{ - type: "bind", - source: "'"$DIR"'", - destination: "/mnt", - options: ["dev", "suid", "exec", "atime", "rprivate", "rbind", "sync"] - }]' + cat >"rootfs/find-tmp.awk" <<-'EOF' + #!/bin/awk -f + $2 == "/mnt" { print $4 } + EOF + chmod +x "rootfs/find-tmp.awk" - runc run test_busybox - [ "$status" -eq 0 ] + update_config '.process.args = ["sh", "-c", "/find-tmp.awk . +@test "runc run [mount(8)-unlike behaviour: --bind with clearing flag]" { + requires root + + pass_sshfs_bind_flags "ro,noexec,nosymfollow,nodiratime" "bind,dev" + # Unspecified flags must be cleared as well. + run ! grep -wq ro <<<"$mnt_flags" + run -0 grep -wq rw <<<"$mnt_flags" + run ! grep -wq noexec <<<"$mnt_flags" + run ! grep -wq nosymfollow <<<"$mnt_flags" + # FIXME FIXME: As with mount(8), trying to clear an atime flag the "naive" + # way will be ignored! + run -0 grep -wq nodiratime <<<"$mnt_flags" + + # Now try with a user namespace. 
+ update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + + pass_sshfs_bind_flags "ro,noexec,nosymfollow,nodiratime" "bind,dev" + # Lockable flags must be kept, because we didn't request them explicitly. + run -0 grep -wq ro <<<"$mnt_flags" + run ! grep -wq rw <<<"$mnt_flags" + run -0 grep -wq noexec <<<"$mnt_flags" + run -0 grep -wq nodiratime <<<"$mnt_flags" + # nosymfollow is not lockable, so it must be cleared. + run ! grep -wq nosymfollow <<<"$mnt_flags" +} + +@test "runc run [implied-rw bind mount of a ro fuse sshfs mount]" { + requires root + + pass_sshfs_bind_flags "ro" "bind,nosuid,nodev,rprivate" + # Unspecified flags must be cleared (rw default). + run ! grep -wq ro <<<"$mnt_flags" + run -0 grep -wq rw <<<"$mnt_flags" + # The new flags must be applied. + run -0 grep -wq nosuid <<<"$mnt_flags" + run -0 grep -wq nodev <<<"$mnt_flags" + + # Now try with a user namespace. The results should be the same as above. + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + + pass_sshfs_bind_flags "ro" "bind,nosuid,nodev,rprivate" + # "ro" must still be set (inherited). + run -0 grep -wq ro <<<"$mnt_flags" + # The new flags must be applied. + run -0 grep -wq nosuid <<<"$mnt_flags" + run -0 grep -wq nodev <<<"$mnt_flags" +} + +@test "runc run [explicit-rw bind mount of a ro fuse sshfs mount]" { + requires root + + # Try to overwrite MS_RDONLY. As we are running in a userns-less container, + # we can overwrite MNT_LOCKED flags. + pass_sshfs_bind_flags "ro" "bind,rw,nosuid,nodev,rprivate" + # "ro" must be cleared and replaced with "rw". + run ! grep -wq ro <<<"$mnt_flags" + run -0 grep -wq rw <<<"$mnt_flags" + # The new flags must be applied. 
+ run -0 grep -wq nosuid <<<"$mnt_flags" + run -0 grep -wq nodev <<<"$mnt_flags" + + # Now try with a user namespace. + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + + # This must fail because we explicitly requested a mount with a MNT_LOCKED + # mount option cleared (when the source mount has those mounts enabled), + # namely MS_RDONLY. + fail_sshfs_bind_flags "ro" "bind,rw,nosuid,nodev,rprivate" +} + +@test "runc run [dev,exec,suid,atime bind mount of a nodev,nosuid,noexec,noatime fuse sshfs mount]" { + requires root + + # When running without userns, overwriting host flags should work. + pass_sshfs_bind_flags "nosuid,nodev,noexec,noatime" "bind,dev,suid,exec,atime" + # Unspecified flags must be cleared (rw default). + run ! grep -wq ro <<<"$mnt_flags" + run -0 grep -wq rw <<<"$mnt_flags" + # Check that the flags were actually cleared by the mount. + run ! grep -wq nosuid <<<"$mnt_flags" + run ! grep -wq nodev <<<"$mnt_flags" + run ! grep -wq noexec <<<"$mnt_flags" + # FIXME FIXME: As with mount(8), trying to clear an atime flag the "naive" + # way will be ignored! + run -0 grep -wq noatime <<<"$mnt_flags" + + # Now try with a user namespace. + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + + # This must fail because we explicitly requested a mount with MNT_LOCKED + # mount options cleared (when the source mount has those mounts enabled). + fail_sshfs_bind_flags "nodev,nosuid,nosuid,noatime" "bind,dev,suid,exec,atime" +} + +# Test to ensure we don't regress bind-mounting /etc/resolv.conf with +# containerd . 
+@test "runc run [ro bind mount of a nodev,nosuid,noexec fuse sshfs mount]" { + requires root + + # Setting flags that are not locked should work. + pass_sshfs_bind_flags "rw,nodev,nosuid,nodev,noexec,noatime" "bind,ro" + # The flagset should be the union of the two. + run -0 grep -wq ro <<<"$mnt_flags" + # Unspecified flags must be cleared. + run ! grep -wq nosuid <<<"$mnt_flags" + run ! grep -wq nodev <<<"$mnt_flags" + run ! grep -wq noexec <<<"$mnt_flags" + + # Now try with a user namespace. + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + + # Setting flags that are not locked should work. + pass_sshfs_bind_flags "rw,nodev,nosuid,nodev,noexec,noatime" "bind,ro" + # The flagset should be the union of the two. + run -0 grep -wq ro <<<"$mnt_flags" + # (Unspecified MNT_LOCKED flags are inherited.) + run -0 grep -wq nosuid <<<"$mnt_flags" + run -0 grep -wq nodev <<<"$mnt_flags" + run -0 grep -wq noexec <<<"$mnt_flags" +} + +@test "runc run [ro,symfollow bind mount of a rw,nodev,nosymfollow fuse sshfs mount]" { + requires root + + pass_sshfs_bind_flags "rw,nodev,nosymfollow" "bind,ro,symfollow" + # Must switch to ro. + run -0 grep -wq ro <<<"$mnt_flags" + run ! grep -wq rw <<<"$mnt_flags" + # Unspecified flags must be cleared. + run ! grep -wq nodev <<<"$mnt_flags" + # nosymfollow must also be cleared. + run ! grep -wq nosymfollow <<<"$mnt_flags" + + # Now try with a user namespace. + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + + # Unsetting flags that are not lockable should work. + pass_sshfs_bind_flags "rw,nodev,nosymfollow" "bind,ro,symfollow" + # The flagset should be the union of the two. 
+ run -0 grep -wq ro <<<"$mnt_flags" + run -0 grep -wq nodev <<<"$mnt_flags" + # nosymfollow is not lockable, so it must be cleared. + run ! grep -wq nosymfollow <<<"$mnt_flags" + + # Implied unsetting of non-lockable flags should also work. + pass_sshfs_bind_flags "rw,nodev,nosymfollow" "bind,rw" + # The flagset should be the union of the two. + run -0 grep -wq rw <<<"$mnt_flags" + run -0 grep -wq nodev <<<"$mnt_flags" + # nosymfollow is not lockable, so it must be cleared. + run ! grep -wq nosymfollow <<<"$mnt_flags" +} + +@test "runc run [ro,noexec bind mount of a nosuid,noatime fuse sshfs mount]" { + requires root + + # Setting flags that are not locked should work. + pass_sshfs_bind_flags "nodev,nosuid,noatime" "bind,ro,exec" + # The flagset must match the requested set. + run -0 grep -wq ro <<<"$mnt_flags" + run ! grep -wq noexec <<<"$mnt_flags" + # Unspecified flags must be cleared. + run ! grep -wq nosuid <<<"$mnt_flags" + run ! grep -wq nodev <<<"$mnt_flags" + # FIXME: As with mount(8), runc keeps the old atime setting by default. + run -0 grep -wq noatime <<<"$mnt_flags" + + # Now try with a user namespace. + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + + # Setting flags that are not locked should work. + pass_sshfs_bind_flags "nodev,nosuid,noatime" "bind,ro,exec" + # The flagset should be the union of the two. + run -0 grep -wq ro <<<"$mnt_flags" + run ! grep -wq noexec <<<"$mnt_flags" + # (Unspecified MNT_LOCKED flags are inherited.) + run -0 grep -wq nosuid <<<"$mnt_flags" + run -0 grep -wq nodev <<<"$mnt_flags" + run -0 grep -wq noatime <<<"$mnt_flags" +} + +@test "runc run [bind mount {no,rel,strict}atime semantics]" { + requires root + + function is_strictatime() { + # There is no "strictatime" in /proc/self/mounts. + run ! grep -wq noatime <<<"${1:-$mnt_flags}" + run ! 
grep -wq relatime <<<"${1:-$mnt_flags}" + run ! grep -wq nodiratime <<<"${1:-$mnt_flags}" + } + + # FIXME: As with mount(8), runc keeps the old atime setting by default. + pass_sshfs_bind_flags "noatime" "bind" + run -0 grep -wq noatime <<<"$mnt_flags" + run ! grep -wq relatime <<<"$mnt_flags" + + # FIXME: As with mount(8), runc keeps the old atime setting by default. + pass_sshfs_bind_flags "noatime" "bind,norelatime" + run -0 grep -wq noatime <<<"$mnt_flags" + run ! grep -wq relatime <<<"$mnt_flags" + + # FIXME FIXME: As with mount(8), trying to clear an atime flag the "naive" + # way will be ignored! + pass_sshfs_bind_flags "noatime" "bind,atime" + run -0 grep -wq noatime <<<"$mnt_flags" + run ! grep -wq relatime <<<"$mnt_flags" + + # ... but explicitly setting a different flag works. + pass_sshfs_bind_flags "noatime" "bind,relatime" + run ! grep -wq noatime <<<"$mnt_flags" + run -0 grep -wq relatime <<<"$mnt_flags" + + # Setting a flag that mount(8) would combine should result in only the + # requested flag being set. + pass_sshfs_bind_flags "noatime" "bind,nodiratime" + run ! grep -wq noatime <<<"$mnt_flags" + run -0 grep -wq nodiratime <<<"$mnt_flags" + # MS_DIRATIME implies MS_RELATIME by default. + run -0 grep -wq relatime <<<"$mnt_flags" + + # Clearing flags that mount(8) would not clear works. + pass_sshfs_bind_flags "nodiratime" "bind,strictatime" + is_strictatime "$mnt_flags" + + # nodiratime is a little weird -- it implies relatime unless you set + # another option (noatime or strictatime). But, runc also has norelatime -- + # so nodiratime,norelatime should _probably_ result in the same thing as + # nodiratime,strictatime. + pass_sshfs_bind_flags "noatime" "bind,nodiratime,strictatime" + run ! grep -wq noatime <<<"$mnt_flags" + run -0 grep -wq nodiratime <<<"$mnt_flags" + run ! grep -wq relatime <<<"$mnt_flags" + # FIXME FIXME: relatime should not be set in this case. + pass_sshfs_bind_flags "noatime" "bind,nodiratime,norelatime" + run ! 
grep -wq noatime <<<"$mnt_flags" + run -0 grep -wq nodiratime <<<"$mnt_flags" + run -0 grep -wq relatime <<<"$mnt_flags" + + # Now try with a user namespace. + update_config ' .linux.namespaces += [{"type": "user"}] + | .linux.uidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] + | .linux.gidMappings += [{"hostID": 100000, "containerID": 0, "size": 65534}] ' + + # Requesting a mount without specifying any preference for atime works, and + # inherits the original flags. + + pass_sshfs_bind_flags "strictatime" "bind" + is_strictatime "$mnt_flags" + + pass_sshfs_bind_flags "relatime" "bind" + run -0 grep -wq relatime <<<"$mnt_flags" + + pass_sshfs_bind_flags "nodiratime" "bind" + run -0 grep -wq nodiratime <<<"$mnt_flags" + # MS_DIRATIME implies MS_RELATIME by default. + run -0 grep -wq relatime <<<"$mnt_flags" + + pass_sshfs_bind_flags "noatime,nodiratime" "bind" + run -0 grep -wq noatime <<<"$mnt_flags" + run -0 grep -wq nodiratime <<<"$mnt_flags" + + # An unrelated clear flag has no effect. + pass_sshfs_bind_flags "noatime,nodiratime" "bind,norelatime" + run -0 grep -wq noatime <<<"$mnt_flags" + run -0 grep -wq nodiratime <<<"$mnt_flags" + + # Attempting to change most *atime flags will fail with user namespaces + # because *atime flags are all MNT_LOCKED. + fail_sshfs_bind_flags "nodiratime" "bind,strictatime" + fail_sshfs_bind_flags "relatime" "bind,strictatime" + fail_sshfs_bind_flags "noatime" "bind,strictatime" + fail_sshfs_bind_flags "nodiratime" "bind,noatime" + fail_sshfs_bind_flags "relatime" "bind,noatime" + fail_sshfs_bind_flags "relatime" "bind,nodiratime" + # Make sure strictatime sources are correctly handled by runc (the kernel + # ignores some other mount flags when passing MS_STRICTATIME). See + # remount() in rootfs_linux.go for details. 
+ fail_sshfs_bind_flags "strictatime" "bind,relatime" + fail_sshfs_bind_flags "strictatime" "bind,noatime" + fail_sshfs_bind_flags "strictatime" "bind,nodiratime" + # Make sure that runc correctly handles the MS_NOATIME|MS_RELATIME kernel + # bug. See remount() in rootfs_linux.go for more details. + fail_sshfs_bind_flags "noatime" "bind,relatime" + + # Attempting to bind-mount a mount with a request to clear the atime + # setting that would normally inherited must not work. + # FIXME FIXME: All of these cases should fail. + pass_sshfs_bind_flags "strictatime" "bind,nostrictatime" + is_strictatime "$mnt_flags" + pass_sshfs_bind_flags "nodiratime" "bind,diratime" + run -0 grep -wq nodiratime <<<"$mnt_flags" + pass_sshfs_bind_flags "nodiratime" "bind,norelatime" # MS_DIRATIME implies MS_RELATIME + run -0 grep -wq nodiratime <<<"$mnt_flags" + pass_sshfs_bind_flags "relatime" "bind,norelatime" + run -0 grep -wq relatime <<<"$mnt_flags" + pass_sshfs_bind_flags "noatime" "bind,atime" + run -0 grep -wq noatime <<<"$mnt_flags" + pass_sshfs_bind_flags "noatime,nodiratime" "bind,atime" + run -0 grep -wq noatime <<<"$mnt_flags" + run -0 grep -wq nodiratime <<<"$mnt_flags" } diff --git a/tests/integration/personality.bats b/tests/integration/personality.bats new file mode 100644 index 00000000000..37aa8e86fda --- /dev/null +++ b/tests/integration/personality.bats @@ -0,0 +1,64 @@ +#!/usr/bin/env bats + +load helpers + +function setup() { + requires arch_x86_64 + setup_busybox +} + +function teardown() { + teardown_bundle +} + +@test "runc run personality for i686" { + update_config ' + .process.args = ["/bin/sh", "-c", "uname -a"] + | .linux.personality = { + "domain": "LINUX32", + "flags": [] + }' + + runc run test_busybox + [ "$status" -eq 0 ] + [[ "$output" == *"i686"* ]] +} + +@test "runc run personality with exec for i686" { + update_config ' + .linux.personality = { + "domain": "LINUX32", + }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" 
-eq 0 ] + runc exec test_busybox /bin/sh -c "uname -a" + [ "$status" -eq 0 ] + [[ "$output" == *"i686"* ]] +} + +@test "runc run personality for x86_64" { + update_config ' + .process.args = ["/bin/sh", "-c", "uname -a"] + | .linux.personality = { + "domain": "LINUX", + "flags": [] + }' + + runc run test_busybox + [ "$status" -eq 0 ] + [[ "$output" == *"x86_64"* ]] +} + +@test "runc run personality with exec for x86_64" { + update_config ' + .linux.personality = { + "domain": "LINUX", + }' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_busybox + [ "$status" -eq 0 ] + runc exec test_busybox /bin/sh -c "uname -a" + [ "$status" -eq 0 ] + [[ "$output" == *"x86_64"* ]] +} diff --git a/tests/integration/update.bats b/tests/integration/update.bats index 616fe809b9c..5a3dc7f0563 100644 --- a/tests/integration/update.bats +++ b/tests/integration/update.bats @@ -288,12 +288,6 @@ EOF runc update test_update --cpu-share 200 [ "$status" -eq 0 ] check_cpu_shares 200 - runc update test_update --cpu-period 900000 --cpu-burst 500000 - [ "$status" -eq 0 ] - check_cpu_burst 500000 - runc update test_update --cpu-period 900000 --cpu-burst 0 - [ "$status" -eq 0 ] - check_cpu_burst 0 # Revert to the test initial value via json on stding runc update -r - test_update <= PinType(len(_PinType_index)-1) { + if i >= PinType(len(_PinType_index)-1) { return "PinType(" + strconv.FormatInt(int64(i), 10) + ")" } return _PinType_name[_PinType_index[i]:_PinType_index[i+1]] diff --git a/vendor/modules.txt b/vendor/modules.txt index 5f597f7a567..2d660aa8a7b 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -2,7 +2,7 @@ ## explicit; go 1.16 github.com/checkpoint-restore/go-criu/v6 github.com/checkpoint-restore/go-criu/v6/rpc -# github.com/cilium/ebpf v0.12.1 +# github.com/cilium/ebpf v0.12.2 ## explicit; go 1.20 github.com/cilium/ebpf github.com/cilium/ebpf/asm