Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

nsexec: cloned_binary: remove bindfd logic entirely #3931

Merged
merged 2 commits into from
Aug 5, 2023
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
305 changes: 83 additions & 222 deletions libcontainer/nsenter/cloned_binary.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@
# define MFD_CLOEXEC 0x0001U
# define MFD_ALLOW_SEALING 0x0002U
#endif
#ifndef MFD_EXEC
# define MFD_EXEC 0x0010U
#endif

int memfd_create(const char *name, unsigned int flags)
{
Expand All @@ -116,15 +119,27 @@ int memfd_create(const char *name, unsigned int flags)
# define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
#endif
#ifndef F_SEAL_SEAL
# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
# define F_SEAL_GROW 0x0004 /* prevent file from growing */
# define F_SEAL_WRITE 0x0008 /* prevent writes */
# define F_SEAL_SEAL 0x0001 /* prevent further seals from being set */
# define F_SEAL_SHRINK 0x0002 /* prevent file from shrinking */
# define F_SEAL_GROW 0x0004 /* prevent file from growing */
# define F_SEAL_WRITE 0x0008 /* prevent writes */
#endif
#ifndef F_SEAL_FUTURE_WRITE
# define F_SEAL_FUTURE_WRITE 0x0010 /* prevent future writes while mapped */
#endif
#ifndef F_SEAL_EXEC
# define F_SEAL_EXEC 0x0020 /* prevent chmod modifying exec bits */
#endif

#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
#define RUNC_MEMFD_SEALS \
/*
* There are newer memfd seals (such as F_SEAL_FUTURE_WRITE and F_SEAL_EXEC),
* which we use opportunistically. However, this set is the original set of
* memfd seals, and we require them all to be set to trust our /proc/self/exe
* if it is a memfd.
*/
#define RUNC_MEMFD_MIN_SEALS \
(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)

static void *must_realloc(void *ptr, size_t size)
Expand All @@ -143,25 +158,27 @@ static void *must_realloc(void *ptr, size_t size)
*/
static int is_self_cloned(void)
{
int fd, is_cloned = 0;
int fd, seals = 0, is_cloned = false;
struct stat statbuf = { };
struct statfs fsbuf = { };

fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
if (fd < 0) {
fprintf(stderr, "you have no read access to runc binary file\n");
write_log(ERROR, "cannot open runc binary for reading: open /proc/self/exe: %m");
return -ENOTRECOVERABLE;
}

/*
* Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
* this, because you cannot write to a sealed memfd no matter what (so
* sharing it isn't a bad thing -- and an admin could bind-mount a sealed
* memfd to /usr/bin/runc to allow re-use).
* this, because you cannot write to a sealed memfd no matter what.
*/
is_cloned = (fcntl(fd, F_GET_SEALS) == RUNC_MEMFD_SEALS);
if (is_cloned)
goto out;
seals = fcntl(fd, F_GET_SEALS);
if (seals >= 0) {
write_log(DEBUG, "checking /proc/self/exe memfd seals: 0x%x", seals);
is_cloned = (seals & RUNC_MEMFD_MIN_SEALS) == RUNC_MEMFD_MIN_SEALS;
if (is_cloned)
goto out;
}

/*
* All other forms require CLONED_BINARY_ENV, since they are potentially
Expand Down Expand Up @@ -298,6 +315,35 @@ enum {
# endif
#endif

static inline bool is_memfd_unsupported_error(int err)
{
/*
* - ENOSYS is obviously an "unsupported" error.
*
* - EINVAL could be hit if MFD_EXEC is not supported (pre-6.3 kernel),
* but it can also be hit if vm.memfd_noexec=2 (in kernels without
* [1] applied) and the flags does not contain MFD_EXEC. However,
* there was a bug in the original 6.3 implementation of
* vm.memfd_noexec=2, which meant that MFD_EXEC would work even in
* the "strict" mode. Because we try MFD_EXEC first, we won't get
* EINVAL in the vm.memfd_noexec=2 case (which means we don't need to
* figure out whether to log the message about memfd_create).
*
* - EACCES is returned in kernels that contain [1] in the
* vm.memfd_noexec=2 case.
*
* At time of writing, [1] is not in Linus's tree and it't not clear if
* it will be backported to stable, so what exact versions apply here
* is unclear. But the bug is present in 6.3-6.5 at the very least.
*
* [1]: https://lore.kernel.org/all/20230705063315.3680666-2-jeffxu@google.com/
*/
cyphar marked this conversation as resolved.
Show resolved Hide resolved
if (err == EACCES)
write_log(INFO,
"memfd_create(MFD_EXEC) failed, possibly due to vm.memfd_noexec=2 -- falling back to less secure O_TMPFILE");
return err == ENOSYS || err == EINVAL || err == EACCES;
}

static int make_execfd(int *fdtype)
{
int fd = -1;
Expand All @@ -315,10 +361,20 @@ static int make_execfd(int *fdtype)
* assumptions about STATEDIR.
*/
*fdtype = EFD_MEMFD;
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
/*
* On newer kernels we should set MFD_EXEC to indicate we need +x
* permissions. Otherwise an admin with vm.memfd_noexec=1 would subtly
* break runc. vm.memfd_noexec=2 is a little bit more complicated, see the
* comment in is_memfd_unsupported_error() -- the upshot is that doing it
* this way works, but only because of two overlapping bugs in the sysctl
* implementation.
*/
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_EXEC | MFD_CLOEXEC | MFD_ALLOW_SEALING);
if (fd < 0 && is_memfd_unsupported_error(errno))
fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
if (fd >= 0)
return fd;
if (errno != ENOSYS && errno != EINVAL)
if (!is_memfd_unsupported_error(errno))
goto error;

#ifdef O_TMPFILE
Copy link
Member

@lifubang lifubang Aug 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

😭 Maybe there is a bug for many years? In some old kernels, we will use O_TMPFILE or mkostemp, but at this time, the runc state dir has not been created yet. So we will got an error in these old kernels:
FATA[0000] nsexec[18089]: could not ensure we are a cloned binary: Permission denied

So, we should deal with it.

  1. use /tmp directly;
    or
  2. continue to use getenv("_LIBCONTAINER_STATEDIR"), but we should create first, and delete it when got an error, I think it's very complex.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, I think we should split this function to 3 functions: make_memfd, make_o_tmpfile_fd, make_ostemp_fd, it's convenient to write unit test cases.

Copy link
Member Author

@cyphar cyphar Aug 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, that's pretty embarrassing 😅.

The logic for using the state directory is that we are guaranteed it is writable -- if we try to use some other directory it might be mounted as ro or the user might not have write permission. I guess it's okay to use /tmp though.

It's very odd that this wasn't caught before -- I tested the O_TMPFILE logic pretty extensively when I first implemented this code (on some SLES versions we don't have memfds). Some aspect of the statedir code must've changed after we added bindfd (which masked the breakage). Permission denied seems like a strange error for a missing directory -- I would expect ENOENT instead...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, I think we should split this function to 3 functions: make_memfd, make_o_tmpfile_fd, make_ostemp_fd, it's convenient to write unit test cases.

All of this code is going to be deleted and moved to Go in #3953. I'd prefer to keep this patch as simple as possible, and we can talk about how to break up the implementation in that PR.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd prefer to keep this patch as simple as possible, and we can talk about how to break up the implementation in that PR.

I quite agree with you.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This bug is not relate to this PR, I think this PR could be merged, and open a new PR to fix this error or waiting you fix it in Go code?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll fix the O_TMPFILE bug here. Let me take a look...

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@lifubang I can't reproduce the issue you mentioned -- if I delete the memfd code, ensure_cloned_binary() still works. If I delete the O_TMPFILE code too, it also still works.

Since you're getting -EACCES it seems likely this is an LSM-related issue. Can you open a separate bug report about that, since it seems unlikely this is a common problem. I think we can merge this as-is. WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK, let's merge this and open an new issue.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Opened here: #3965

Expand Down Expand Up @@ -373,8 +429,18 @@ static int make_execfd(int *fdtype)
static int seal_execfd(int *fd, int fdtype)
{
switch (fdtype) {
case EFD_MEMFD:
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
case EFD_MEMFD:{
/*
* Try to seal with newer seals, but we ignore errors because older
* kernels don't support some of them. For container security only
* RUNC_MEMFD_MIN_SEALS are strictly required, but the rest are
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seals seem added in Linux 3.17, but we still need to support kernel 3.10 at least until the EOL of CentOS 7.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Slightly off-topic, but we have to have an official documentation to clarify the minimum supported kernel version)

Copy link
Member Author

@cyphar cyphar Aug 4, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

memfd_create() was added in 3.17, the seals were part of the initial implementation. We handle older kernel versions with O_TMPFILE (3.11 and later) or mkostemp(3) (glibc) for ancient kernels.

This PR doesn't change any of this behaviour, it just removes bindfd and passes MFD_EXEC as well as applying the two newer seals but ignores errors when adding them because they're not necessary. IOW the behaviour on older kernels is unchanged.

* nice-to-haves. We apply RUNC_MEMFD_MIN_SEALS at the end because it
* contains F_SEAL_SEAL.
*/
int __attribute__((unused)) _err1 = fcntl(*fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE); // Linux 5.1
int __attribute__((unused)) _err2 = fcntl(*fd, F_ADD_SEALS, F_SEAL_EXEC); // Linux 6.3
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_MIN_SEALS);
}
case EFD_FILE:{
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
int newfd;
Expand All @@ -400,199 +466,6 @@ static int seal_execfd(int *fd, int fdtype)
return -1;
}

struct bindfd_child_args {
int sockfd;
const char *mount_target;
};

static int bindfd_in_subprocess(void *arg)
{
/*
* In the interests of efficiency (read: minimizing the syscall count)
* and conciseness, no attempt is made to release resources which would
* be cleaned up automatically on process exit, i.e. when this function
* returns. This includes filesystem mounts, as this function is
* executed in a dedicated mount namespace.
*/

/*
* For obvious reasons this won't work in rootless mode because we
* haven't created a userns -- but getting that to work will be a bit
* complicated and it's only worth doing if someone actually needs it.
*/
if (mount("none", "/", NULL, MS_SLAVE | MS_REC, NULL) < 0)
return errno;
/*
* The kernel resolves the magic symlink /proc/self/exe to the real file
* _in the original mount namespace_. Cross-namespace bind mounts are
* not allowed, so we must locate the file inside the current mount
* namespace to be able to bind-mount it. (The mount(8) command resolves
* symlinks, which is why it appears to work at first glance.)
*/
char linkbuf[PATH_MAX + 1] = { 0 };
ssize_t linkpathlen = readlink("/proc/self/exe", linkbuf, sizeof(linkbuf));
if (linkpathlen < 0)
return errno;
if (linkpathlen == sizeof(linkbuf)) {
/*
* The link path is longer than PATH_MAX, and the contents of
* linkbuf might have been truncated. A truncated path could
* happen to be a valid path to a different file, which could
* allow for local privilege escalation if we were to exec it.
* The mount syscall doesn't accept paths longer than PATH_MAX,
* anyway.
*/
return ENAMETOOLONG;
}

int srcfd = open(linkbuf, O_PATH | O_CLOEXEC);
if (srcfd < 0)
return errno;
/*
* linkbuf holds the path to the binary which the parent process was
* launched from. Someone could have moved a different file to that path
* in the interim, in which case srcfd is not the file we want to
* bind-mount. Guard against this situation by verifying srcfd is the
* same file as /proc/self/exe.
*/
struct stat realexe = { 0 };
if (stat("/proc/self/exe", &realexe) < 0)
return errno;
struct stat resolved = { 0 };
if (fstat(srcfd, &resolved) < 0)
return errno;
if (resolved.st_dev != realexe.st_dev || resolved.st_ino != realexe.st_ino)
return ENOENT;
if (snprintf(linkbuf, sizeof(linkbuf), "/proc/self/fd/%d", srcfd) == sizeof(linkbuf))
return ENAMETOOLONG;

const struct bindfd_child_args *args = arg;
if (mount(linkbuf, args->mount_target, "", MS_BIND, "") < 0)
return errno;
if (mount("", args->mount_target, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
return errno;

int fd = open(args->mount_target, O_PATH | O_CLOEXEC);
if (fd < 0)
return errno;

/*
* Make sure the MNT_DETACH works, otherwise we could get remounted
* read-write and that would be quite bad.
*/
if (umount2(args->mount_target, MNT_DETACH) < 0)
return errno;

if (send_fd(args->sockfd, fd) < 0)
return errno;
return 0;
}

static int spawn_bindfd_child(const struct bindfd_child_args *args) __attribute__((noinline));
static int spawn_bindfd_child(const struct bindfd_child_args *args)
{
/*
* Carve out a chunk of our call stack for the child process to use as
* we can be sure it is correctly mapped for use as stack. (Technically
* only the libc clone() wrapper writes to this buffer. The child
* process operates on a copy of the parent's virtual memory space and
* so can safely overflow into the rest of the stack memory region
* without consequence.)
*/
char stack[4 * 1024] __attribute__((aligned(16)));
int tid = clone(bindfd_in_subprocess,
/*
* Assume stack grows down, as HP-PA, the only Linux
* platform where stack grows up, is obsolete.
*/
stack + sizeof(stack),
/*
* Suspend the parent process until the child has exited to
* save an unnecessary context switch as we'd just be
* waiting for the child process to exit anyway.
*/
CLONE_NEWNS | CLONE_VFORK, (void *)args);
if (tid < 0)
return -errno;
return tid;
}

static int try_bindfd(void)
{
int fd, ret = -1;
char template[PATH_MAX] = { 0 };
char *prefix = getenv("_LIBCONTAINER_STATEDIR");

if (!prefix || *prefix != '/')
prefix = "/tmp";
if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
return ret;

/*
* We need somewhere to mount it, mounting anything over /proc/self is a
* BAD idea on the host -- even if we do it temporarily.
*/
fd = mkstemp(template);
if (fd < 0)
return ret;
close(fd);

/*
* Daemons such as systemd and udisks2 watch /proc/self/mountinfo and
* re-parse it on every change, which gets expensive when the mount table
* is large and/or changes frequently. Perform the mount operations in a
* new, private mount namespace so as not to wake up those processes
* every time we nsexec into a container. We clone a child process into
* a new mount namespace to do the dirty work so the side effects of
* unsharing the mount namespace do not leak into the current process.
*/
int sock[2];
if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sock) < 0) {
ret = -errno;
goto cleanup_unlink;
}

struct bindfd_child_args args = {
.sockfd = sock[0],
.mount_target = template,
};
int cpid = spawn_bindfd_child(&args);
close(sock[0]);
if (cpid < 0) {
ret = cpid;
goto cleanup_socketpair;
}

int wstatus = 0;
if (waitpid(cpid, &wstatus, __WCLONE) < 0)
bail("error waiting for bindfd child process to exit");
if (WIFEXITED(wstatus)) {
if (WEXITSTATUS(wstatus)) {
ret = -WEXITSTATUS(wstatus);
goto cleanup_socketpair;
}
} else if (WIFSIGNALED(wstatus)) {
int sig = WTERMSIG(wstatus);
bail("bindfd child process terminated by signal %d (%s)", sig, strsignal(sig));
} else {
/* Should never happen... */
bail("unexpected waitpid() status for bindfd child process: 0x%x", wstatus);
}

ret = receive_fd(sock[1]);

cleanup_socketpair:
close(sock[1]);

cleanup_unlink:
/*
* We don't care about unlink errors, the worst that happens is that
* there's an empty file left around in STATEDIR.
*/
unlink(template);
return ret;
}

static ssize_t fd_to_fd(int outfd, int infd)
{
ssize_t total = 0;
Expand Down Expand Up @@ -627,18 +500,6 @@ static int clone_binary(void)
size_t sent = 0;
int fdtype = EFD_NONE;

/*
* Before we resort to copying, let's try creating an ro-binfd in one shot
* by getting a handle for a read-only bind-mount of the execfd.
*/
execfd = try_bindfd();
cyphar marked this conversation as resolved.
Show resolved Hide resolved
if (execfd >= 0)
return execfd;

/*
* Dammit, that didn't work -- time to copy the binary to a safe place we
* can seal the contents.
*/
execfd = make_execfd(&fdtype);
if (execfd < 0 || fdtype == EFD_NONE)
return -ENOTRECOVERABLE;
Expand Down
Loading