diff --git a/docs/1-installing.md b/docs/1-installing.md index b1c2822..44cb689 100644 --- a/docs/1-installing.md +++ b/docs/1-installing.md @@ -35,7 +35,7 @@ To also set up crun-vm for use with Docker: 1. Install crun-vm's runtime dependencies: ```console - $ dnf install bash coreutils crun genisoimage grep libselinux-devel libvirt-client libvirt-daemon-driver-qemu libvirt-daemon-log openssh-clients qemu-img qemu-system-x86-core shadow-utils util-linux virtiofsd + $ dnf install bash coreutils crun crun-krun genisoimage grep libselinux-devel libvirt-client libvirt-daemon-driver-qemu libvirt-daemon-log openssh-clients qemu-img qemu-system-x86-core sed shadow-utils util-linux virtiofsd ``` 2. Install Rust and Cargo if you do not already have Rust tooling available: diff --git a/embed/bootc/config.json b/embed/bootc/config.json new file mode 100644 index 0000000..a40fc6c --- /dev/null +++ b/embed/bootc/config.json @@ -0,0 +1,88 @@ +{ + "ociVersion": "1.0.0", + "process": { + "terminal": true, + "user": { "uid": 0, "gid": 0 }, + "args": ["/output/entrypoint.sh", ""], + "env": [ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm" + ], + "cwd": "/", + "capabilities": { + "bounding": [], + "effective": [], + "inheritable": [], + "permitted": [], + "ambient": [] + }, + "rlimits": [ + { + "type": "RLIMIT_NOFILE", + "hard": 262144, + "soft": 262144 + } + ], + "noNewPrivileges": true + }, + "root": { + "path": "", + "readonly": false + }, + "hostname": "bootc-install", + "mounts": [ + { + "type": "bind", + "source": "/root/crun-vm/bootc", + "destination": "/output", + "options": ["bind", "rprivate", "rw"] + }, + { + "destination": "/proc", + "type": "proc", + "source": "proc" + }, + { + "destination": "/dev/pts", + "type": "devpts", + "source": "devpts", + "options": [ + "nosuid", + "noexec", + "newinstance", + "ptmxmode=0666", + "mode=0620", + "gid=5" + ] + } + ], + "linux": { + "namespaces": [ + { "type": "pid" }, + { "type": "network" }, + 
{ "type": "ipc" }, + { "type": "uts" }, + { "type": "cgroup" }, + { "type": "mount" } + ], + "maskedPaths": [ + "/proc/acpi", + "/proc/asound", + "/proc/kcore", + "/proc/keys", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + "/proc/scsi" + ], + "readonlyPaths": [ + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger" + ] + } +} diff --git a/embed/bootc/entrypoint.sh b/embed/bootc/entrypoint.sh new file mode 100644 index 0000000..0a5aa21 --- /dev/null +++ b/embed/bootc/entrypoint.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# SPDX-License-Identifier: GPL-2.0-or-later + +set -e + +image_name=$1 + +bootc install to-disk \ + --source-imgref oci-archive:/output/image.oci-archive \ + --target-imgref "$image_name" \ + --skip-fetch-check \ + --generic-image \ + --via-loopback \ + --karg console=tty0 \ + --karg console=ttyS0 \ + --karg selinux=0 \ + /output/image.raw + +touch /output/success diff --git a/embed/bootc/prepare.sh b/embed/bootc/prepare.sh new file mode 100644 index 0000000..3b7dd5b --- /dev/null +++ b/embed/bootc/prepare.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-or-later + +set -o errexit -o pipefail -o nounset + +original_root=$1 +priv_dir=$2 +container_id=$3 + +__step() { + >&2 printf "\033[36m%s\033[0m\n" "$*" +} + +mkfifo "$priv_dir/root/crun-vm/bootc/progress" +exec > "$priv_dir/root/crun-vm/bootc/progress" 2>&1 + +# this blocks here until the named pipe above is opened by entrypoint.sh + +# get info about the container *image* + +__step 'Storing the container image as an OCI archive...' 
+ +image_info=$( + podman container inspect \ + --format '{{.ImageName}}\t{{.Image}}' \ + "$container_id" + ) + +image_name=$( cut -f1 <<< "$image_info" ) +image_id=$( cut -f2 <<< "$image_info" ) + +oci_archive=$priv_dir/root/crun-vm/bootc/image.oci-archive + +# save container *image* as an OCI archive + +podman save --format oci-archive --output "$oci_archive.tmp" "$image_id" +mv "$oci_archive.tmp" "$oci_archive" + +# adjust krun config + +__step 'Generating a VM image from the container image...' + +__sed() { + sed -i "s|$1|$2|" "$priv_dir/root/crun-vm/bootc/config.json" +} + +__sed "" "$image_name" +__sed "" "$original_root" +__sed "" "$priv_dir" + +# run bootc-install under krun + +truncate --size 10G "$priv_dir/root/crun-vm/bootc/image.raw" # TODO: allow adjusting disk size + +krun run \ + --config "$priv_dir/root/crun-vm/bootc/config.json" \ + "crun-vm-$container_id" \ + ]) -> Resu let config_path = bundle_path.join("config.json"); let mut spec = oci_spec::runtime::Spec::load(&config_path)?; - let original_root_path: Utf8PathBuf = spec.root_path()?.canonicalize()?.try_into()?; // ensure absolute - - if let Some(process) = spec.process().as_ref() { - if let Some(capabilities) = process.capabilities().as_ref() { - fn any_is_cap_sys_admin(caps: &Option) -> bool { - caps.as_ref() - .is_some_and(|set| set.contains(&oci_spec::runtime::Capability::SysAdmin)) - } + ensure_unprivileged(&spec)?; - ensure!( - !any_is_cap_sys_admin(capabilities.bounding()) - && !any_is_cap_sys_admin(capabilities.effective()) - && !any_is_cap_sys_admin(capabilities.inheritable()) - && !any_is_cap_sys_admin(capabilities.permitted()) - && !any_is_cap_sys_admin(capabilities.ambient()), - "crun-vm is incompatible with privileged containers" - ); - } - } + let original_root_path: Utf8PathBuf = spec.root_path()?.canonicalize()?.try_into()?; // ensure absolute let runtime_env = RuntimeEnv::current(&spec, &original_root_path)?; let custom_options = CustomOptions::from_spec(&spec, runtime_env)?; + 
let is_bootc_container = is_bootc_container( + &args.container_id, + bundle_path, + &original_root_path, + runtime_env, + )?; + // We include container_id in our paths to ensure no overlap with the user container's contents. let priv_dir_path = original_root_path.join(format!("crun-vm-{}", args.container_id)); fs::create_dir_all(&priv_dir_path)?; @@ -66,7 +59,13 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu set_file_context(&priv_dir_path, context)?; } - set_up_container_root(&mut spec, &priv_dir_path, &custom_options)?; + set_up_container_root( + &mut spec, + &priv_dir_path, + &custom_options, + is_bootc_container, + )?; + let is_first_create = is_first_create(&spec)?; let base_vm_image_info = set_up_vm_image( @@ -75,6 +74,7 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu &priv_dir_path, &custom_options, is_first_create, + is_bootc_container, )?; let mut mounts = Mounts::default(); @@ -100,9 +100,87 @@ pub fn create(args: &liboci_cli::Create, raw_args: &[impl AsRef]) -> Resu crun(raw_args)?; // actually create container + if is_first_create && is_bootc_container { + // We want to ask podman what our image name is, so we can give it to bootc-install, but we + // can't wait synchronously for a response since podman hangs until this create command + // completes. We then want to run bootc-install under krun, which already isolates the + // workload and so can be run outside of our container. We thus launch a process that + // asynchronously performs these steps, and share its progress and output with our + // container's entrypoint through a named pipe. + // + // Note that this process blocks until our container's entrypoint actually starts running, + // thus after the "start" OCI runtime command is called. 
+ + let bootc_dir = priv_dir_path.join("root/crun-vm/bootc"); + fs::create_dir_all(&bootc_dir)?; + + std::process::Command::new(bootc_dir.join("prepare.sh")) + .arg(&original_root_path) + .arg(&priv_dir_path) + .arg(&args.container_id) + .stdin(Stdio::null()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn()?; + } + + Ok(()) +} + +fn ensure_unprivileged(spec: &oci_spec::runtime::Spec) -> Result<()> { + if let Some(process) = spec.process().as_ref() { + if let Some(capabilities) = process.capabilities().as_ref() { + fn any_is_cap_sys_admin(caps: &Option) -> bool { + caps.as_ref() + .is_some_and(|set| set.contains(&oci_spec::runtime::Capability::SysAdmin)) + } + + ensure!( + !any_is_cap_sys_admin(capabilities.bounding()) + && !any_is_cap_sys_admin(capabilities.effective()) + && !any_is_cap_sys_admin(capabilities.inheritable()) + && !any_is_cap_sys_admin(capabilities.permitted()) + && !any_is_cap_sys_admin(capabilities.ambient()), + "crun-vm is incompatible with privileged containers" + ); + } + } + Ok(()) } +fn is_bootc_container( + container_id: &str, + bundle_path: &Utf8Path, + original_root_path: &Utf8Path, + env: RuntimeEnv, +) -> Result { + lazy_static! 
{ + static ref PATTERN: Regex = Regex::new(r"/overlay-containers/([^/]+)/userdata$").unwrap(); + } + + let bootc_config_dir = original_root_path.join("usr/lib/bootc/install"); + + let is_bootc_container = + bootc_config_dir.is_dir() && bootc_config_dir.read_dir()?.next().is_some(); + + if is_bootc_container { + // check as much as we can that we're running under podman + + let is_podman_bundle_path = match PATTERN.captures(bundle_path.as_str()) { + Some(captures) => &captures[1] == container_id, + None => false, + }; + + ensure!( + env == RuntimeEnv::Other && is_podman_bundle_path, + "bootc containers are only supported with Podman" + ); + } + + Ok(is_bootc_container) +} + fn is_first_create(spec: &oci_spec::runtime::Spec) -> Result { let path = spec.root_path()?.join("crun-vm/create-ran"); @@ -124,6 +202,7 @@ fn set_up_container_root( spec: &mut oci_spec::runtime::Spec, priv_dir_path: &Utf8Path, custom_options: &CustomOptions, + is_bootc_container: bool, ) -> Result<()> { let new_root_path = priv_dir_path.join("root"); fs::create_dir_all(&new_root_path)?; @@ -138,19 +217,22 @@ fn set_up_container_root( .unwrap(), )); - // set up container scripts + // set up container files #[derive(RustEmbed)] - #[folder = "scripts/"] - struct Scripts; + #[folder = "embed/"] + struct Embed; - for path in Scripts::iter() { + for path in Embed::iter() { let path_in_host = new_root_path.join("crun-vm").join(path.as_ref()); fs::create_dir_all(path_in_host.parent().unwrap())?; - let file = Scripts::get(&path).unwrap(); + let file = Embed::get(&path).unwrap(); fs::write(&path_in_host, file.data)?; - fs::set_permissions(&path_in_host, Permissions::from_mode(0o755))?; + + let is_script = path.as_ref().ends_with(".sh"); + let mode = if is_script { 0o755 } else { 0o644 }; + fs::set_permissions(&path_in_host, Permissions::from_mode(mode))?; } // configure container entrypoint @@ -160,7 +242,8 @@ fn set_up_container_root( } else if custom_options.print_config_json { vec!["cat", 
"/crun-vm/config.json"] } else { - vec!["/crun-vm/entrypoint.sh"] + let arg = if is_bootc_container { "1" } else { "0" }; + vec!["/crun-vm/entrypoint.sh", arg] }; spec.set_process({ @@ -184,7 +267,20 @@ fn set_up_vm_image( priv_dir_path: &Utf8Path, custom_options: &CustomOptions, is_first_create: bool, + is_bootc_container: bool, ) -> Result { + let mirror_vm_image_path_in_container = Utf8PathBuf::from("/crun-vm/image/image"); + let mirror_vm_image_path_in_host = spec.root_path()?.join("crun-vm/image/image"); + + if is_bootc_container { + // the image will be generated later + return Ok(VmImageInfo { + path: mirror_vm_image_path_in_container, + size: 0, + format: "raw".to_string(), + }); + } + // where inside the container to look for the VM image const VM_IMAGE_SEARCH_PATHS: [&str; 2] = ["./", "disk/"]; @@ -208,9 +304,6 @@ fn set_up_vm_image( fs::hard_link(vm_image_path_in_host, image_dir_path.join("image"))?; } - let mirror_vm_image_path_in_container = Utf8PathBuf::from("/crun-vm/image/image"); - let mirror_vm_image_path_in_host = spec.root_path()?.join("crun-vm/image/image"); - if custom_options.persistent { // Mount overlayfs to expose the user's VM image file with a different SELinux context so we // can always access it, using the file's parent as the upperdir so that writes still @@ -220,7 +313,7 @@ fn set_up_vm_image( bind_mount_dir_with_different_context( image_dir_path, mirror_vm_image_path_in_host.parent().unwrap(), - priv_dir_path.join("scratch"), + priv_dir_path.join("scratch-image"), spec.mount_label(), false, )?; @@ -243,7 +336,7 @@ fn set_up_vm_image( bind_mount_dir_with_different_context( image_dir_path, mirror_vm_image_path_in_host.parent().unwrap(), - priv_dir_path.join("scratch"), + priv_dir_path.join("scratch-image"), spec.mount_label(), true, )?; @@ -560,7 +653,7 @@ fn set_up_security(spec: &mut oci_spec::runtime::Spec) { // TODO: This doesn't seem reasonable at all. Should we just force users to use a different // seccomp profile? 
Should passt provide the option to bypass a lot of the isolation that it // does, given we're already in a container *and* under a seccomp profile? - spec.linux_seccomp_syscalls_push( + spec.linux_seccomp_syscalls_push_front( oci_spec::runtime::LinuxSyscallBuilder::default() .names(["mount", "pivot_root", "umount2", "unshare"].map(String::from)) .action(oci_spec::runtime::LinuxSeccompAction::ScmpActAllow) diff --git a/src/util.rs b/src/util.rs index fc0cde4..84ce83f 100644 --- a/src/util.rs +++ b/src/util.rs @@ -33,13 +33,13 @@ pub fn fix_selinux_label(process: &mut oci_spec::runtime::Process) { pub fn set_file_context(path: impl AsRef, context: &str) -> Result<()> { extern "C" { - fn setfilecon(path: *const c_char, con: *const c_char) -> i32; + fn lsetfilecon(path: *const c_char, con: *const c_char) -> i32; } let path = CString::new(path.as_ref().as_os_str().as_bytes())?; let context = CString::new(context.as_bytes())?; - if unsafe { setfilecon(path.as_ptr(), context.as_ptr()) } != 0 { + if unsafe { lsetfilecon(path.as_ptr(), context.as_ptr()) } != 0 { return Err(io::Error::last_os_error().into()); } @@ -198,7 +198,7 @@ pub trait SpecExt { linux_device_cgroup: oci_spec::runtime::LinuxDeviceCgroup, ); fn process_capabilities_insert_beip(&mut self, capability: oci_spec::runtime::Capability); - fn linux_seccomp_syscalls_push(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall); + fn linux_seccomp_syscalls_push_front(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall); } impl SpecExt for oci_spec::runtime::Spec { @@ -276,7 +276,10 @@ impl SpecExt for oci_spec::runtime::Spec { }); } - fn linux_seccomp_syscalls_push(&mut self, linux_syscall: oci_spec::runtime::LinuxSyscall) { + fn linux_seccomp_syscalls_push_front( + &mut self, + linux_syscall: oci_spec::runtime::LinuxSyscall, + ) { self.set_linux({ let mut linux = self.linux().clone().expect("linux config"); linux.set_seccomp({ @@ -284,7 +287,7 @@ impl SpecExt for oci_spec::runtime::Spec { if let 
Some(seccomp) = &mut seccomp { seccomp.set_syscalls({ let mut syscalls = seccomp.syscalls().clone().unwrap_or_default(); - syscalls.push(linux_syscall); + syscalls.insert(0, linux_syscall); Some(syscalls) }); } diff --git a/tests/env.sh b/tests/env.sh index 2aea024..d2b7008 100755 --- a/tests/env.sh +++ b/tests/env.sh @@ -11,20 +11,23 @@ container_name=crun-vm-test-env declare -A TEST_IMAGES TEST_IMAGES=( - [fedora]=quay.io/containerdisks/fedora:39 # uses cloud-init - [coreos]=quay.io/crun-vm/example-fedora-coreos:39 # uses Ignition + [fedora]=quay.io/containerdisks/fedora:39 # uses cloud-init + [coreos]=quay.io/crun-vm/example-fedora-coreos:39 # uses Ignition + [fedora-bootc]=quay.io/centos-bootc/fedora-bootc:eln # bootable container ) declare -A TEST_IMAGES_DEFAULT_USER TEST_IMAGES_DEFAULT_USER=( [fedora]=fedora [coreos]=core + [fedora-bootc]=cloud-user ) declare -A TEST_IMAGES_DEFAULT_USER_HOME TEST_IMAGES_DEFAULT_USER_HOME=( [fedora]=/home/fedora [coreos]=/var/home/core + [fedora-bootc]=/var/home/cloud-user ) __bad_usage() { diff --git a/tests/t/bootc-rootfs.sh b/tests/t/bootc-rootfs.sh new file mode 100644 index 0000000..5d78b04 --- /dev/null +++ b/tests/t/bootc-rootfs.sh @@ -0,0 +1,15 @@ +# SPDX-License-Identifier: GPL-2.0-or-later + +if [[ "$ENGINE" == docker ]]; then + # we only support bootc containers under Podman + __skip +fi + +"$UTIL_DIR/extract-vm-image.sh" "${TEST_IMAGES[fedora-bootc]}" "$TEMP_DIR/image" + +__run() { + __engine run --rm --detach --name bootc-rootfs "$@" --rootfs "$TEMP_DIR" +} + +! __run +! 
__run --persistent diff --git a/tests/t/cloud-init.sh b/tests/t/cloud-init.sh index 6ea51dd..0c95f02 100644 --- a/tests/t/cloud-init.sh +++ b/tests/t/cloud-init.sh @@ -1,9 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-or-later -image="${TEST_IMAGES[fedora]}" -user="${TEST_IMAGES_DEFAULT_USER[fedora]}" -home="${TEST_IMAGES_DEFAULT_USER_HOME[fedora]}" - cat >"$TEMP_DIR/user-data" <"$TEMP_DIR/meta-data" </dev/null + endpoint=$( __engine port publish | tee /dev/stderr | cut -d' ' -f3 ) -__engine exec publish --as "$user" python -m http.server & -trap '__engine stop publish' EXIT + __engine exec publish --as "$user" + + __log 'Ensuring curl fails...' + ! curl "$endpoint" 2>/dev/null + + __engine exec publish --as "$user" python -m http.server & + + sleep 3 + + __log 'Ensuring curl succeeds...' + [[ "$( curl "$endpoint" 2>/dev/null | head -1 )" == "" ]] -sleep 3 + __engine stop publish -__log 'Ensuring curl succeeds...' -[[ "$( curl "$endpoint" 2>/dev/null | head -1 )" == "" ]] +done