Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nsexec: cloned binary rework #3987

Merged
merged 8 commits into from
Sep 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 16 additions & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,18 @@ jobs:
rootless: ["rootless", ""]
race: ["-race", ""]
criu: ["", "criu-dev"]
dmz: ["", "runc_nodmz"]
cyphar marked this conversation as resolved.
Show resolved Hide resolved
cyphar marked this conversation as resolved.
Show resolved Hide resolved
exclude:
- criu: criu-dev
rootless: rootless
- criu: criu-dev
go-version: 1.20.x
- criu: criu-dev
race: -race
- dmz: runc_nodmz
criu: criu-dev
- dmz: runc_nodmz
os: ubuntu-20.04
runs-on: ${{ matrix.os }}

steps:
Expand Down Expand Up @@ -71,6 +76,8 @@ jobs:
go-version: ${{ matrix.go-version }}

- name: build
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" make EXTRA_FLAGS="${{ matrix.race }}" all

- name: install bats
Expand All @@ -80,6 +87,8 @@ jobs:

- name: unit test
if: matrix.rootless != 'rootless'
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" -- make TESTFLAGS="${{ matrix.race }}" localunittest

- name: add rootless user
Expand Down Expand Up @@ -113,8 +122,12 @@ jobs:
# However, we do not have 32-bit ARM CI, so we use i386 for testing 32bit stuff.
# We are not interested in providing official support for i386.
cross-i386:
runs-on: ubuntu-22.04
timeout-minutes: 15
strategy:
fail-fast: false
matrix:
dmz: ["", "runc_nodmz"]
runs-on: ubuntu-22.04

steps:

Expand All @@ -136,4 +149,6 @@ jobs:
go-version: 1.x # Latest stable

- name: unit test
env:
EXTRA_BUILDTAGS: ${{ matrix.dmz }}
run: sudo -E PATH="$PATH" -- make GOARCH=386 localunittest
9 changes: 5 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
vendor/pkg
/runc
/runc-*
contrib/cmd/recvtty/recvtty
contrib/cmd/sd-helper/sd-helper
contrib/cmd/seccompagent/seccompagent
contrib/cmd/fs-idmap/fs-idmap
/contrib/cmd/recvtty/recvtty
/contrib/cmd/sd-helper/sd-helper
/contrib/cmd/seccompagent/seccompagent
/contrib/cmd/fs-idmap/fs-idmap
/contrib/cmd/memfd-bind/memfd-bind
man/man8
release
Vagrantfile
Expand Down
1 change: 1 addition & 0 deletions .golangci-extra.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
run:
build-tags:
- seccomp
- runc_nodmz

linters:
disable-all: true
Expand Down
1 change: 1 addition & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
run:
build-tags:
- seccomp
- runc_nodmz

linters:
enable:
Expand Down
20 changes: 12 additions & 8 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,15 @@ ARG CRIU_REPO=https://download.opensuse.org/repositories/devel:/tools:/criu/Debi
RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
wget -nv $CRIU_REPO/Release.key -O- | gpg --dearmor > "$KEYFILE" \
&& echo "deb [signed-by=$KEYFILE] $CRIU_REPO/ /" > /etc/apt/sources.list.d/criu.list \
&& dpkg --add-architecture i386 \
&& apt-get update \
&& apt-get install -y --no-install-recommends \
build-essential \
criu \
gcc-aarch64-linux-gnu libc-dev-arm64-cross \
gcc-arm-linux-gnueabi libc-dev-armel-cross \
gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
gcc-s390x-linux-gnu libc-dev-s390x-cross \
gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
gcc \
gcc-multilib \
curl \
gawk \
gcc \
gperf \
iptables \
jq \
Expand All @@ -32,6 +28,14 @@ RUN KEYFILE=/usr/share/keyrings/criu-repo-keyring.gpg; \
sudo \
uidmap \
iproute2 \
&& apt-get install -y --no-install-recommends \
libc-dev:i386 libgcc-s1:i386 \
gcc-aarch64-linux-gnu libc-dev-arm64-cross \
gcc-arm-linux-gnueabi libc-dev-armel-cross \
gcc-arm-linux-gnueabihf libc-dev-armhf-cross \
gcc-powerpc64le-linux-gnu libc-dev-ppc64el-cross \
gcc-s390x-linux-gnu libc-dev-s390x-cross \
gcc-riscv64-linux-gnu libc-dev-riscv64-cross \
&& apt-get clean \
&& rm -rf /var/cache/apt /var/lib/apt/lists/* /etc/apt/sources.list.d/*.list

Expand All @@ -54,7 +58,7 @@ RUN cd /tmp \
ARG LIBSECCOMP_VERSION
COPY script/seccomp.sh script/lib.sh /tmp/script/
RUN mkdir -p /opt/libseccomp \
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp arm64 armel armhf ppc64le riscv64 s390x
&& /tmp/script/seccomp.sh "$LIBSECCOMP_VERSION" /opt/libseccomp 386 amd64 arm64 armel armhf ppc64le riscv64 s390x
ENV LIBSECCOMP_VERSION=$LIBSECCOMP_VERSION
ENV LD_LIBRARY_PATH=/opt/libseccomp/lib
ENV PKG_CONFIG_PATH=/opt/libseccomp/lib/pkgconfig
Expand Down
44 changes: 35 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
SHELL = /bin/bash

CONTAINER_ENGINE := docker
GO ?= go

# Get CC values for cross-compilation.
include cc_platform.mk

PREFIX ?= /usr/local
BINDIR := $(PREFIX)/sbin
MANDIR := $(PREFIX)/share/man
Expand All @@ -10,6 +15,7 @@ GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g")
RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN))
PROJECT := github.com/opencontainers/runc
BUILDTAGS ?= seccomp urfave_cli_no_docs
BUILDTAGS += $(EXTRA_BUILDTAGS)

COMMIT ?= $(shell git describe --dirty --long --always)
VERSION := $(shell cat ./VERSION)
Expand Down Expand Up @@ -57,18 +63,25 @@ endif

.DEFAULT: runc

runc:
runc: runc-dmz
$(GO_BUILD) -o runc .
make verify-dmz-arch

all: runc recvtty sd-helper seccompagent fs-idmap
all: runc recvtty sd-helper seccompagent fs-idmap memfd-bind

recvtty sd-helper seccompagent fs-idmap:
recvtty sd-helper seccompagent fs-idmap memfd-bind:
$(GO_BUILD) -o contrib/cmd/$@/$@ ./contrib/cmd/$@

static:
static: runc-dmz
$(GO_BUILD_STATIC) -o runc .
make verify-dmz-arch

.PHONY: runc-dmz
runc-dmz:
rm -f libcontainer/dmz/runc-dmz
$(GO) generate -tags "$(BUILDTAGS)" ./libcontainer/dmz

releaseall: RELEASE_ARGS := "-a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
releaseall: RELEASE_ARGS := "-a 386 -a amd64 -a arm64 -a armel -a armhf -a ppc64le -a riscv64 -a s390x"
releaseall: release

release: runcimage
Expand Down Expand Up @@ -147,12 +160,13 @@ install-man: man
install -D -m 644 man/man8/*.8 $(DESTDIR)$(MANDIR)/man8

clean:
rm -f runc runc-*
rm -f runc runc-* libcontainer/dmz/runc-dmz
cyphar marked this conversation as resolved.
Show resolved Hide resolved
rm -f contrib/cmd/fs-idmap/fs-idmap
rm -f contrib/cmd/recvtty/recvtty
rm -f contrib/cmd/sd-helper/sd-helper
rm -f contrib/cmd/seccompagent/seccompagent
rm -f contrib/cmd/fs-idmap/fs-idmap
rm -rf release
rm -f contrib/cmd/memfd-bind/memfd-bind
sudo rm -rf release
cyphar marked this conversation as resolved.
Show resolved Hide resolved
rm -rf man/man8

cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
Expand Down Expand Up @@ -188,6 +202,18 @@ verify-dependencies: vendor
@test -z "$$(git status --porcelain -- go.mod go.sum vendor/)" \
|| (echo -e "git status:\n $$(git status -- go.mod go.sum vendor/)\nerror: vendor/, go.mod and/or go.sum not up to date. Run \"make vendor\" to update"; exit 1) \
&& echo "all vendor files are up to date."
verify-dmz-arch:
@test -s libcontainer/dmz/runc-dmz || exit 0; \
set -Eeuo pipefail; \
export LC_ALL=C; \
echo "readelf -h runc"; \
readelf -h runc | grep -E "(Machine|Flags):"; \
echo "readelf -h libcontainer/dmz/runc-dmz"; \
readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):"; \
lifubang marked this conversation as resolved.
Show resolved Hide resolved
diff -u \
<(readelf -h runc | grep -E "(Machine|Flags):") \
<(readelf -h libcontainer/dmz/runc-dmz | grep -E "(Machine|Flags):") \
&& echo "runc-dmz architecture matches runc binary."

validate-keyring:
script/keyring_validate.sh
Expand All @@ -197,4 +223,4 @@ validate-keyring:
test localtest unittest localunittest integration localintegration \
rootlessintegration localrootlessintegration shell install install-bash \
install-man clean cfmt shfmt localshfmt shellcheck \
vendor verify-changelog verify-dependencies validate-keyring
vendor verify-changelog verify-dependencies verify-dmz-arch validate-keyring
9 changes: 6 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,15 +65,18 @@ e.g. to disable seccomp:
make BUILDTAGS=""
```

| Build Tag | Feature | Enabled by default | Dependency |
|-----------|------------------------------------|--------------------|------------|
| seccomp | Syscall filtering | yes | libseccomp |
| Build Tag | Feature | Enabled by Default | Dependencies |
|---------------|---------------------------------------|--------------------|---------------------|
| `seccomp` | Syscall filtering using `libseccomp`. | yes | `libseccomp` |
| `!runc_nodmz` | Reduce memory usage for CVE-2019-5736 protection by using a small C binary, [see `memfd-bind` for more details][contrib-memfd-bind]. `runc_nodmz` disables this feature and causes runc to use a different protection mechanism which will further increases memory usage temporarily during container startup. This feature can also be disabled at runtime by setting the `RUNC_DMZ=legacy` environment variable. | yes ||

The following build tags were used earlier, but are now obsoleted:
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
- **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
- **selinux** (since runc v1.0.0-rc93 the feature is always enabled)

[contrib-memfd-bind]: /contrib/memfd-bind/README.md

### Running the test suite

`runc` currently supports running its test suite via Docker.
Expand Down
61 changes: 61 additions & 0 deletions cc_platform.mk
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# NOTE: Make sure you keep this file in sync with scripts/lib.sh.

GO ?= go
GOARCH ?= $(shell $(GO) env GOARCH)

ifneq ($(shell grep -i "ID_LIKE=.*suse" /etc/os-release),)
# openSUSE has a custom PLATFORM
PLATFORM ?= suse-linux
IS_SUSE := 1
else
PLATFORM ?= linux-gnu
endif

ifeq ($(GOARCH),$(shell GOARCH= $(GO) env GOARCH))
# use the native CC and STRIP
HOST :=
else ifeq ($(GOARCH),386)
# Always use the 64-bit compiler to build the 386 binary, which works for
# the more common cross-build method for x86 (namely, the equivalent of
# dpkg --add-architecture).
ifdef IS_SUSE
# There is no x86_64-suse-linux-gcc, so use the native one.
HOST :=
CPU_TYPE := i586
else
HOST := x86_64-$(PLATFORM)-
CPU_TYPE := i686
endif
CFLAGS := -m32 -march=$(CPU_TYPE) $(CFLAGS)
else ifeq ($(GOARCH),amd64)
ifdef IS_SUSE
# There is no x86_64-suse-linux-gcc, so use the native one.
HOST :=
else
HOST := x86_64-$(PLATFORM)-
endif
else ifeq ($(GOARCH),arm64)
HOST := aarch64-$(PLATFORM)-
else ifeq ($(GOARCH),arm)
# HOST already configured by release_build.sh in this case.
else ifeq ($(GOARCH),armel)
HOST := arm-$(PLATFORM)eabi-
else ifeq ($(GOARCH),armhf)
HOST := arm-$(PLATFORM)eabihf-
else ifeq ($(GOARCH),ppc64le)
HOST := powerpc64le-$(PLATFORM)-
else ifeq ($(GOARCH),riscv64)
HOST := riscv64-$(PLATFORM)-
else ifeq ($(GOARCH),s390x)
HOST := s390x-$(PLATFORM)-
else
$(error Unsupported GOARCH $(GOARCH))
endif

ifeq ($(origin CC),$(filter $(origin CC),undefined default))
# Override CC if it's undefined or just the default value set by Make.
CC := $(HOST)gcc
export CC
endif
STRIP ?= $(HOST)strip
export STRIP
67 changes: 67 additions & 0 deletions contrib/cmd/memfd-bind/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
## memfd-bind ##
cyphar marked this conversation as resolved.
Show resolved Hide resolved

`runc` normally has to make a binary copy of itself (or of a smaller helper
binary called `runc-dmz`) when constructing a container process in order to
defend against certain container runtime attacks such as CVE-2019-5736.

This cloned binary only exists until the container process starts (this means
for `runc run` and `runc exec`, it only exists for a few hundred milliseconds
-- for `runc create` it exists until `runc start` is called). However, because
the clone is done using a memfd (or by creating files in directories that are
likely to be a `tmpfs`), this can lead to temporary increases in *host* memory
usage. Unless you are running on a cgroupv1 system with the cgroupv1 memory
controller enabled and the (deprecated) `memory.move_charge_at_immigrate`
enabled, there is no effect on the container's memory.

However, for certain configurations this can still be undesirable. This daemon
allows you to create a sealed memfd copy of the `runc` binary, which will cause
`runc` to skip all binary copying, resulting in no additional memory usage for
each container process (instead there is a single in-memory copy of the
binary). It should be noted that (strictly speaking) this is slightly less
secure if you are concerned about Dirty Cow-like 0-day kernel vulnerabilities,
but for most users the security benefit is identical.

The provided `memfd-bind@.service` file can be used to get systemd to manage
this daemon. You can supply the path like so:

```
% systemctl start memfd-bind@/usr/bin/runc
```

Thus, there are three ways of protecting against CVE-2019-5736, in order of how
much memory usage they can use:

* `memfd-bind` only creates a single in-memory copy of the `runc` binary (about
10MB), regardless of how many containers are running.

* `runc-dmz` is (depending on which libc it was compiled with) between 10kB and
1MB in size, and a copy is created once per process spawned inside a
container by runc (both the pid1 and every `runc exec`). There are
circumstances where using `runc-dmz` will fail in ways that runc cannot
predict ahead of time (such as restrictive LSMs applied to containers), in
which case users can disable it with the `RUNC_DMZ=legacy` setting.
`runc-dmz` also requires an additional `execve` over the other options,
though since the binary is so small the cost is probably not even noticeable.

* The classic method of making a copy of the entire `runc` binary during
container process setup takes up about 10MB per process spawned inside the
container by runc (both pid1 and `runc exec`).

### Caveats ###

There are several downsides with using `memfd-bind` on the `runc` binary:

* The `memfd-bind` process needs to continue to run indefinitely in order for
the memfd reference to stay alive. If the process is forcefully killed, the
bind-mount on top of the `runc` binary will become stale and nobody will be
able to execute it (you can use `memfd-bind --cleanup` to clean up the stale
mount).

* Only root can execute the cloned binary due to permission restrictions on
accessing other process's files. More specifically, only users with ptrace
privileges over the memfd-bind daemon can access the file (but in practice
this is usually only root).

* When updating `runc`, the daemon needs to be stopped before the update (so
the package manager can access the underlying file) and then restarted after
the update.
Loading