From d639b0b02fcdefc6ef618b78a5b93c98d6026a3a Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 31 Oct 2018 20:07:27 -0700 Subject: [PATCH] libcontainer: ability to compile without kmem Commit fe898e7862f94 (PR #1350) enables kernel memory accounting for all cgroups created by libcontainer -- even if kmem limit is not configured. Kernel memory accounting is known to be broken in some kernels, specifically the ones from RHEL7 (including RHEL 7.5). Those kernels do not support kernel memory reclaim, and are prone to oopses. Unconditionally enabling kmem acct on such kernels lead to bugs, such as * https://github.com/opencontainers/runc/issues/1725 * https://github.com/kubernetes/kubernetes/issues/61937 * https://github.com/moby/moby/issues/29638 This commit gives a way to compile runc without kernel memory setting support. To do so, use something like make BUILDTAGS="seccomp nokmem" Signed-off-by: Kir Kolyshkin (cherry picked from commit 6a2c15596845f6ff5182e2022f38a65e5dfa88eb) Signed-off-by: Sebastiaan van Stijn --- Makefile | 2 +- libcontainer/cgroups/fs/kmem.go | 54 ++++++++++++++++++++++++ libcontainer/cgroups/fs/kmem_disabled.go | 11 +++++ libcontainer/cgroups/fs/memory.go | 45 +------------------- 4 files changed, 68 insertions(+), 44 deletions(-) create mode 100644 libcontainer/cgroups/fs/kmem.go create mode 100644 libcontainer/cgroups/fs/kmem_disabled.go diff --git a/Makefile b/Makefile index d6d337dab5c..499b87eea4c 100644 --- a/Makefile +++ b/Makefile @@ -9,7 +9,7 @@ GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g") RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN)) PROJECT := github.com/opencontainers/runc -BUILDTAGS := seccomp +BUILDTAGS ?= seccomp COMMIT_NO := $(shell git rev-parse HEAD 2> /dev/null || true) COMMIT := $(if $(shell git status --porcelain --untracked-files=no),"${COMMIT_NO}-dirty","${COMMIT_NO}") diff --git a/libcontainer/cgroups/fs/kmem.go b/libcontainer/cgroups/fs/kmem.go new file mode 100644 index 00000000000..4d10ace2a2a --- /dev/null +++ b/libcontainer/cgroups/fs/kmem.go @@ -0,0 +1,54 @@ +// +build linux,!nokmem + +package fs + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "syscall" // for Errno type only + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +const cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" + +func EnableKernelMemoryAccounting(path string) error { + // Check if kernel memory is enabled + // We have to limit the kernel memory here as it won't be accounted at all + // until a limit is set on the cgroup and limit cannot be set once the + // cgroup has children, or if there are already tasks in the cgroup. + for _, i := range []int64{1, -1} { + if err := setKernelMemory(path, i); err != nil { + return err + } + } + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + if path == "" { + return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) + } + if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { + // kernel memory is not enabled on the system so we should do nothing + return nil + } + if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { + // Check if the error number returned by the syscall is "EBUSY" + // The EBUSY signal is returned on attempts to write to the + // memory.kmem.limit_in_bytes file if the cgroup has children or + // once tasks have been attached to the cgroup + if pathErr, ok := err.(*os.PathError); ok { + if errNo, ok := pathErr.Err.(syscall.Errno); ok { + if errNo == syscall.EBUSY { + return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) + } + } + } + return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) + } + return nil +} diff --git a/libcontainer/cgroups/fs/kmem_disabled.go b/libcontainer/cgroups/fs/kmem_disabled.go new file mode 100644 index 00000000000..12253b6bbbf --- /dev/null +++ b/libcontainer/cgroups/fs/kmem_disabled.go @@ -0,0 +1,11 @@ +// +build linux,nokmem + +package fs + +func EnableKernelMemoryAccounting(path string) error { + return nil +} + +func setKernelMemory(path string, kernelMemoryLimit int64) error { + return nil +} diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index c993839c2b0..b852c13147c 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -5,21 +5,18 @@ package fs import ( "bufio" "fmt" - "io/ioutil" "os" "path/filepath" "strconv" "strings" - "syscall" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" ) const ( - cgroupKernelMemoryLimit = "memory.kmem.limit_in_bytes" - cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" - cgroupMemoryLimit = "memory.limit_in_bytes" + cgroupMemorySwapLimit = "memory.memsw.limit_in_bytes" + cgroupMemoryLimit = "memory.limit_in_bytes" ) type MemoryGroup struct { @@ -65,44 +62,6 @@ func (s *MemoryGroup) Apply(d *cgroupData) (err error) { return nil } -func EnableKernelMemoryAccounting(path string) error { - // Check if kernel memory is enabled - // We have to limit the kernel memory here as it won't be accounted at all - // until a limit is set on the cgroup and limit cannot be set once the - // cgroup has children, or if there are already tasks in the cgroup. - for _, i := range []int64{1, -1} { - if err := setKernelMemory(path, i); err != nil { - return err - } - } - return nil -} - -func setKernelMemory(path string, kernelMemoryLimit int64) error { - if path == "" { - return fmt.Errorf("no such directory for %s", cgroupKernelMemoryLimit) - } - if !cgroups.PathExists(filepath.Join(path, cgroupKernelMemoryLimit)) { - // kernel memory is not enabled on the system so we should do nothing - return nil - } - if err := ioutil.WriteFile(filepath.Join(path, cgroupKernelMemoryLimit), []byte(strconv.FormatInt(kernelMemoryLimit, 10)), 0700); err != nil { - // Check if the error number returned by the syscall is "EBUSY" - // The EBUSY signal is returned on attempts to write to the - // memory.kmem.limit_in_bytes file if the cgroup has children or - // once tasks have been attached to the cgroup - if pathErr, ok := err.(*os.PathError); ok { - if errNo, ok := pathErr.Err.(syscall.Errno); ok { - if errNo == syscall.EBUSY { - return fmt.Errorf("failed to set %s, because either tasks have already joined this cgroup or it has children", cgroupKernelMemoryLimit) - } - } - } - return fmt.Errorf("failed to write %v to %v: %v", kernelMemoryLimit, cgroupKernelMemoryLimit, err) - } - return nil -} - func setMemoryAndSwap(path string, cgroup *configs.Cgroup) error { // If the memory update is set to -1 we should also // set swap to -1, it means unlimited memory.