diff --git a/oci_spec/src/lib.rs b/oci_spec/src/lib.rs
index 4fbc56371..d05efc6b9 100644
--- a/oci_spec/src/lib.rs
+++ b/oci_spec/src/lib.rs
@@ -559,9 +559,9 @@ pub enum LinuxSeccompOperator {
 #[derive(Serialize, Deserialize, Debug, Clone)]
 #[serde(rename_all = "camelCase")]
 pub struct Linux {
-    #[serde(default, rename = "LinuxIDMapping")]
+    #[serde(default)]
     pub uid_mappings: Vec<LinuxIdMapping>,
-    #[serde(default, rename = "LinuxIDMapping")]
+    #[serde(default)]
     pub gid_mappings: Vec<LinuxIdMapping>,
     #[serde(default)]
     pub sysctl: HashMap<String, String>,
diff --git a/src/create.rs b/src/create.rs
index 3c1b19e1d..b24a73700 100644
--- a/src/create.rs
+++ b/src/create.rs
@@ -9,16 +9,17 @@ use nix::sched;
 use nix::unistd;
 use nix::unistd::{Gid, Uid};
 
-use crate::cgroups;
 use crate::container::{Container, ContainerStatus};
 use crate::namespaces::Namespaces;
 use crate::notify_socket::NotifyListener;
 use crate::process::{fork, Process};
 use crate::rootfs;
+use crate::rootless::{lookup_map_binaries, should_use_rootless, Rootless};
 use crate::stdio::FileDescriptor;
 use crate::tty;
 use crate::utils;
 use crate::{capabilities, command::Command};
+use crate::{cgroups, rootless};
 
 /// This is the main structure which stores various commandline options given by
 /// high-level container runtime
@@ -131,19 +132,27 @@ fn run_container<P: AsRef<Path>>(
     let linux = spec.linux.as_ref().unwrap();
     let namespaces: Namespaces = linux.namespaces.clone().into();
 
+    let rootless = if should_use_rootless() {
+        log::debug!("rootless container should be created");
+        log::warn!(
+            "resource constraints and multi id mapping is unimplemented for rootless containers"
+        );
+        rootless::validate(&spec)?;
+        let mut rootless = Rootless::from(linux);
+        if let Some((uid_binary, gid_binary)) = lookup_map_binaries(linux)? {
+            rootless.newuidmap = Some(uid_binary);
+            rootless.newgidmap = Some(gid_binary);
+        }
+        Some(rootless)
+    } else {
+        None
+    };
+
     let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, container.id());
     let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path)?;
 
     // first fork, which creates process, which will later create actual container process
-    match fork::fork_first(
-        pid_file,
-        namespaces
-            .clone_flags
-            .contains(sched::CloneFlags::CLONE_NEWUSER),
-        linux,
-        &container,
-        cmanager,
-    )? {
+    match fork::fork_first(pid_file, rootless, linux, &container, cmanager)? {
         // In the parent process, which called run_container
         Process::Parent(parent) => Ok(Process::Parent(parent)),
         // in child process
diff --git a/src/lib.rs b/src/lib.rs
index 98be65394..75326ef25 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -13,6 +13,7 @@ pub mod notify_socket;
 pub mod pipe;
 pub mod process;
 pub mod rootfs;
+pub mod rootless;
 pub mod signal;
 pub mod start;
 pub mod stdio;
diff --git a/src/main.rs b/src/main.rs
index 04ee7102e..d8ceb7884 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -12,6 +12,7 @@ use nix::sys::signal as nix_signal;
 use youki::command::linux::LinuxCommand;
 use youki::container::{Container, ContainerStatus};
 use youki::create;
+use youki::rootless::should_use_rootless;
 use youki::signal;
 use youki::start;
 
@@ -79,7 +80,11 @@ fn main() -> Result<()> {
         eprintln!("log init failed: {:?}", e);
     }
 
-    let root_path = PathBuf::from(&opts.root);
+    let root_path = if should_use_rootless() && opts.root.eq(&PathBuf::from("/run/youki")) {
+        PathBuf::from("/tmp/rootless")
+    } else {
+        PathBuf::from(&opts.root)
+    };
     fs::create_dir_all(&root_path)?;
 
     match opts.subcmd {
diff --git a/src/process/child.rs b/src/process/child.rs
index 2bdca4931..65db39921 100644
--- a/src/process/child.rs
+++ b/src/process/child.rs
@@ -1,6 +1,5 @@
 use std::io::ErrorKind;
 use std::io::Read;
-use std::io::Write;
 
 use anyhow::{bail, Result};
 use mio::unix::pipe;
@@ -9,6 +8,7 @@ use mio::unix::pipe::Sender;
 use mio::{Events, Interest, Poll, Token};
 use nix::unistd::Pid;
 
+use super::parent::ParentChannel;
 use super::{MAX_EVENTS, WAIT_FOR_INIT};
 use crate::process::message::Message;
 
@@ -18,7 +18,7 @@ const CHILD: Token = Token(1);
 /// Contains sending end of pipe for parent process, receiving end of pipe
 /// for the init process and poller for that
 pub struct ChildProcess {
-    sender_for_parent: Sender,
+    parent_channel: ParentChannel,
     receiver: Option<Receiver>,
     poll: Option<Poll>,
 }
@@ -29,9 +29,9 @@ pub struct ChildProcess {
 // a process point of view, init process is child of child process, which is child of original youki process.
 impl ChildProcess {
     /// create a new Child process structure
-    pub fn new(sender_for_parent: Sender) -> Result<Self> {
+    pub fn new(parent_channel: ParentChannel) -> Result<Self> {
         Ok(Self {
-            sender_for_parent,
+            parent_channel,
             receiver: None,
             poll: None,
         })
@@ -55,24 +55,19 @@ impl ChildProcess {
 
     /// Indicate that child process has forked the init process to parent process
     pub fn notify_parent(&mut self, init_pid: Pid) -> Result<()> {
-        log::debug!(
-            "child send to parent {:?}",
-            (Message::ChildReady as u8).to_be_bytes()
-        );
-        // write ChildReady message to the pipe to parent
-        self.write_message_for_parent(Message::ChildReady)?;
-        // write pid of init process which is forked by child process to the pipe,
-        // Pid in nix::unistd is type alias of SessionId which itself is alias of i32
-        self.sender_for_parent
-            .write_all(&(init_pid.as_raw()).to_be_bytes())?;
+        self.parent_channel.send_init_pid(init_pid)?;
+        Ok(())
+    }
+
+    /// Ask the parent process to write the uid/gid mappings for this process
+    pub fn request_identifier_mapping(&mut self) -> Result<()> {
+        self.parent_channel.request_identifier_mapping()?;
         Ok(())
     }
 
-    /// writes given message to pipe for the parent
-    #[inline]
-    fn write_message_for_parent(&mut self, msg: Message) -> Result<()> {
-        self.sender_for_parent
-            .write_all(&(msg as u8).to_be_bytes())?;
+    /// Block until the parent process acknowledges that the mappings have been written
+    pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
+        self.parent_channel.wait_for_mapping_ack()?;
         Ok(())
     }
 
diff --git a/src/process/fork.rs b/src/process/fork.rs
index 9689dacff..8a7a25198 100644
--- a/src/process/fork.rs
+++ b/src/process/fork.rs
@@ -14,25 +14,23 @@ use nix::unistd;
 use nix::unistd::Pid;
 
 use crate::cgroups::common::CgroupManager;
+use crate::container::Container;
 use crate::container::ContainerStatus;
 use crate::process::{child, init, parent, Process};
-use crate::{container::Container, pipe::Pipe};
+use crate::rootless::Rootless;
 
 /// Function to perform the first fork for in order to run the container process
 pub fn fork_first<P: AsRef<Path>>(
     pid_file: Option<P>,
-    is_userns: bool,
+    rootless: Option<Rootless>,
     linux: &oci_spec::Linux,
     container: &Container,
     cmanager: Box<dyn CgroupManager>,
 ) -> Result<Process> {
-    // create a new pipe
-    let cpipe = Pipe::new()?;
-
     // create new parent process structure
-    let (mut parent, sender_for_parent) = parent::ParentProcess::new()?;
+    let (mut parent, parent_channel) = parent::ParentProcess::new(rootless.clone())?;
     // create a new child process structure with sending end of parent process
-    let child = child::ChildProcess::new(sender_for_parent)?;
+    let mut child = child::ChildProcess::new(parent_channel)?;
 
     // fork the process
     match unsafe { unistd::fork()? } {
@@ -51,21 +49,28 @@ pub fn fork_first<P: AsRef<Path>>(
             // if new user is specified in specification, this will be true
             // and new namespace will be created, check https://man7.org/linux/man-pages/man7/user_namespaces.7.html
             // for more information
-            if is_userns {
+            if rootless.is_some() {
+                log::debug!("creating new user namespace");
                 sched::unshare(sched::CloneFlags::CLONE_NEWUSER)?;
+
+                // child needs to be dumpable, otherwise the non root parent is not
+                // allowed to write the uid/gid maps
+                prctl::set_dumpable(true).unwrap();
+                child.request_identifier_mapping()?;
+                child.wait_for_mapping_ack()?;
+                prctl::set_dumpable(false).unwrap();
             }
 
-            cpipe.notify()?;
             Ok(Process::Child(child))
         }
         // in the parent process
         unistd::ForkResult::Parent { child } => {
-            cpipe.wait()?;
-
             // wait for child to fork init process and report back its pid
-            let init_pid = parent.wait_for_child_ready()?;
+            let init_pid = parent.wait_for_child_ready(child)?;
             log::debug!("init pid is {:?}", init_pid);
-            cmanager.apply(&linux.resources.as_ref().unwrap(), Pid::from_raw(init_pid))?;
+            if rootless.is_none() && linux.resources.is_some() {
+                cmanager.apply(&linux.resources.as_ref().unwrap(), Pid::from_raw(init_pid))?;
+            }
 
             // update status and pid of the container process
             container
diff --git a/src/process/message.rs b/src/process/message.rs
index fddf09ab9..386b4fb77 100644
--- a/src/process/message.rs
+++ b/src/process/message.rs
@@ -3,6 +3,8 @@
 pub enum Message {
     ChildReady = 0x00,
     InitReady = 0x01,
+    WriteMapping = 0x02,
+    MappingWritten = 0x03,
 }
 
 impl From<u8> for Message {
@@ -10,6 +12,8 @@ impl From<u8> for Message {
         match from {
             0x00 => Message::ChildReady,
             0x01 => Message::InitReady,
+            0x02 => Message::WriteMapping,
+            0x03 => Message::MappingWritten,
             _ => panic!("unknown message."),
         }
     }
diff --git a/src/process/mod.rs b/src/process/mod.rs
index c64fea8bf..99ff334e8 100644
--- a/src/process/mod.rs
+++ b/src/process/mod.rs
@@ -26,3 +26,5 @@ const MAX_EVENTS: usize = 128;
 const WAIT_FOR_CHILD: Duration = Duration::from_secs(5);
 /// Time to wait when polling for message from init process
 const WAIT_FOR_INIT: Duration = Duration::from_millis(1000);
+/// Time to wait when polling for mapping ack from parent
+const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3);
diff --git a/src/process/parent.rs b/src/process/parent.rs
index 3ff941794..bd1fe6d2f 100644
--- a/src/process/parent.rs
+++ b/src/process/parent.rs
@@ -1,89 +1,252 @@
 use std::io::ErrorKind;
 use std::io::Read;
+use std::io::Write;
+use std::path::Path;
+use std::process::Command;
 
 use super::{MAX_EVENTS, WAIT_FOR_CHILD};
 use crate::process::message::Message;
+use crate::process::WAIT_FOR_MAPPING;
+use crate::rootless::Rootless;
+use crate::utils;
+use anyhow::Context;
 use anyhow::{bail, Result};
 use mio::unix::pipe;
 use mio::unix::pipe::{Receiver, Sender};
 use mio::{Events, Interest, Poll, Token};
+use nix::unistd::Pid;
+use oci_spec::LinuxIdMapping;
 
 // Token is used to identify which socket generated an event
 const PARENT: Token = Token(0);
 
 /// Contains receiving end of pipe to child process and a poller for that.
 pub struct ParentProcess {
-    receiver: Receiver,
-    poll: Poll,
+    child_channel: ChildChannel,
 }
 
 // Poll is used to register and listen for various events
 // by registering it with an event source such as receiving end of a pipe
 impl ParentProcess {
     /// Create new Parent process structure
-    pub fn new() -> Result<(Self, Sender)> {
-        // create a new pipe
-        let (sender, mut receiver) = pipe::new()?;
-        // create a new poll, and register the receiving end of pipe to it
-        // This will poll for the read events, so when data is written to sending end of the pipe,
-        // the receiving end will be readable and poll wil notify
+    pub fn new(rootless: Option<Rootless>) -> Result<(Self, ParentChannel)> {
+        let (parent_channel, child_channel) = Self::setup_pipes(rootless)?;
+        let parent = Self { child_channel };
+
+        Ok((parent, parent_channel))
+    }
+
+    fn setup_pipes(rootless: Option<Rootless>) -> Result<(ParentChannel, ChildChannel)> {
+        let (send_to_parent, receive_from_child) = pipe::new()?;
+        let (send_to_child, receive_from_parent) = pipe::new()?;
+
+        let parent_channel = ParentChannel::new(send_to_parent, receive_from_parent)?;
+        let child_channel = ChildChannel::new(send_to_child, receive_from_child, rootless)?;
+
+        Ok((parent_channel, child_channel))
+    }
+
+    /// Waits for associated child process to send ready message
+    /// and return the pid of init process which is forked by child process
+    pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result<i32> {
+        let init_pid = self.child_channel.wait_for_child_ready(child_pid)?;
+        Ok(init_pid)
+    }
+}
+
+// Channel for communicating with the parent
+pub struct ParentChannel {
+    sender: Sender,
+    receiver: Receiver,
+    poll: Poll,
+}
+
+impl ParentChannel {
+    fn new(sender: Sender, mut receiver: Receiver) -> Result<Self> {
         let poll = Poll::new()?;
         poll.registry()
             .register(&mut receiver, PARENT, Interest::READABLE)?;
-        Ok((Self { receiver, poll }, sender))
+        Ok(Self {
+            sender,
+            receiver,
+            poll,
+        })
     }
 
-    /// Waits for associated child process to send ready message
-    /// and return the pid of init process which is forked by child process
-    pub fn wait_for_child_ready(&mut self) -> Result<i32> {
-        // Create collection with capacity to store up to MAX_EVENTS events
+    pub fn send_init_pid(&mut self, pid: Pid) -> Result<()> {
+        // write ChildReady message to the pipe to parent
+        log::debug!("[child to parent] sending init pid ({:?})", pid);
+        self.write_message(Message::ChildReady)?;
+        // write pid of init process which is forked by child process to the pipe,
+        // Pid in nix::unistd is type alias of SessionId which itself is alias of i32
+        self.sender.write_all(&(pid.as_raw()).to_be_bytes())?;
+        Ok(())
+    }
+
+    // requests the parent to write the id mappings for the child process
+    // this needs to be done from the parent see https://man7.org/linux/man-pages/man7/user_namespaces.7.html
+    pub fn request_identifier_mapping(&mut self) -> Result<()> {
+        log::debug!("[child to parent] request identifier mapping");
+        self.write_message(Message::WriteMapping)?;
+        Ok(())
+    }
+
+    // wait until the parent process has finished writing the id mappings,
+    pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
         let mut events = Events::with_capacity(MAX_EVENTS);
+        log::debug!("waiting for ack from parent");
 
-        // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
-        self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
+        self.poll.poll(&mut events, Some(WAIT_FOR_MAPPING))?;
         for event in events.iter() {
-            // check if the event token in PARENT
-            // note that this does not assign anything to PARENT, but instead compares PARENT and event.token()
-            // check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation
-            if let PARENT = event.token() {
-                // read data from pipe
+            if event.token() == PARENT {
                 let mut buf = [0; 1];
                 match self.receiver.read_exact(&mut buf) {
-                    // This error simply means that there are no more incoming connections waiting to be accepted at this point.
-                    Err(ref e) if e.kind() == ErrorKind::WouldBlock => (),
+                    Err(ref e) if e.kind() == ErrorKind::WouldBlock => continue,
                     Err(e) => bail!(
-                        "Failed to receive a message from the child process. {:?}",
+                        "Failed to receive a message from the parent process. {:?}",
                         e
                     ),
                     _ => (),
-                };
-                // convert to Message wrapper
+                }
+
                 match Message::from(u8::from_be_bytes(buf)) {
-                    Message::ChildReady => {
-                        // read pid of init process forked by child, 4 bytes as the type is i32
-                        let mut buf = [0; 4];
-                        match self.receiver.read_exact(&mut buf) {
-                            // This error simply means that there are no more incoming connections waiting to be accepted at this point.
-                            Err(ref e) if e.kind() == ErrorKind::WouldBlock => (),
-                            Err(e) => bail!(
-                                "Failed to receive a message from the child process. {:?}",
-                                e
-                            ),
-                            _ => (),
+                    Message::MappingWritten => return Ok(()),
+                    msg => bail!("receive unexpected message {:?} in child process", msg),
+                }
+            }
+        }
+        bail!("timed out waiting for mapping ack from parent")
+    }
+
+    #[inline]
+    fn write_message(&mut self, msg: Message) -> Result<()> {
+        self.sender.write_all(&(msg as u8).to_be_bytes())?;
+        Ok(())
+    }
+}
+
+struct ChildChannel {
+    sender: Sender,
+    receiver: Receiver,
+    poll: Poll,
+    rootless: Option<Rootless>,
+}
+
+impl ChildChannel {
+    fn new(sender: Sender, mut receiver: Receiver, rootless: Option<Rootless>) -> Result<Self> {
+        let poll = Poll::new()?;
+        poll.registry()
+            .register(&mut receiver, PARENT, Interest::READABLE)?;
+        Ok(Self {
+            sender,
+            receiver,
+            poll,
+            rootless,
+        })
+    }
+
+    /// Waits for associated child process to send ready message
+    /// and return the pid of init process which is forked by child process
+    pub fn wait_for_child_ready(&mut self, child_pid: Pid) -> Result<i32> {
+        // Create collection with capacity to store up to MAX_EVENTS events
+        let mut events = Events::with_capacity(MAX_EVENTS);
+        loop {
+            // poll the receiving end of pipe created for WAIT_FOR_CHILD duration for an event
+            self.poll.poll(&mut events, Some(WAIT_FOR_CHILD))?;
+            for event in events.iter() {
+                // check if the event token in PARENT
+                // note that this does not assign anything to PARENT, but instead compares PARENT and event.token()
+                // check http://patshaughnessy.net/2018/1/18/learning-rust-if-let-vs--match for a bit more detailed explanation
+                if let PARENT = event.token() {
+                    // read data from pipe
+                    let mut buf = [0; 1];
+                    match self.receiver.read_exact(&mut buf) {
+                        // This error simply means that there are no more incoming connections waiting to be accepted at this point.
+                        Err(ref e) if e.kind() == ErrorKind::WouldBlock => {
+                            break;
+                        }
+                        Err(e) => bail!(
+                            "Failed to receive a message from the child process. {:?}",
+                            e
+                        ),
+                        _ => (),
+                    };
+                    // convert to Message wrapper
+                    match Message::from(u8::from_be_bytes(buf)) {
+                        Message::ChildReady => {
+                            log::debug!("received child ready message");
+                            // read pid of init process forked by child, 4 bytes as the type is i32
+                            let mut buf = [0; 4];
+                            match self.receiver.read_exact(&mut buf) {
+                                // This error simply means that there are no more incoming connections waiting to be accepted at this point.
+                                Err(ref e) if e.kind() == ErrorKind::WouldBlock => (),
+                                Err(e) => bail!(
+                                    "Failed to receive a message from the child process. {:?}",
+                                    e
+                                ),
+                                _ => (),
+                            }
+                            return Ok(i32::from_be_bytes(buf));
                         }
-                        return Ok(i32::from_be_bytes(buf));
+                        Message::WriteMapping => {
+                            log::debug!("write mapping for pid {:?}", child_pid);
+                            utils::write_file(format!("/proc/{}/setgroups", child_pid), "deny")?;
+                            self.write_uid_mapping(child_pid)?;
+                            self.write_gid_mapping(child_pid)?;
+                            self.notify_mapping_written()?;
+                        }
+                        msg => bail!("receive unexpected message {:?} in parent process", msg),
                     }
-                    msg => bail!("receive unexpected message {:?} in parent process", msg),
+                } else {
+                    // as the poll is registered with only parent token
+                    unreachable!()
                 }
-            } else {
-                // as the poll is registered with only parent token
-                unreachable!()
             }
         }
-        // should not reach here, as there should be a ready event from child within WAIT_FOR_CHILD duration
-        unreachable!(
-            "No message received from child process within {} seconds",
-            WAIT_FOR_CHILD.as_secs()
-        );
     }
+
+    fn notify_mapping_written(&mut self) -> Result<()> {
+        self.sender
+            .write_all(&(Message::MappingWritten as u8).to_be_bytes())?;
+        Ok(())
+    }
+
+    fn write_uid_mapping(&self, target_pid: Pid) -> Result<()> {
+        let rootless = self.rootless.as_ref().unwrap();
+        write_id_mapping(
+            &format!("/proc/{}/uid_map", target_pid),
+            &rootless.uid_mappings,
+            rootless.newuidmap.as_deref(),
+        )
+    }
+
+    fn write_gid_mapping(&self, target_pid: Pid) -> Result<()> {
+        let rootless = self.rootless.as_ref().unwrap();
+        write_id_mapping(
+            &format!("/proc/{}/gid_map", target_pid),
+            &rootless.gid_mappings,
+            rootless.newgidmap.as_deref(),
+        )
+    }
+}
+
+fn write_id_mapping(
+    map_file: &str,
+    mappings: &[LinuxIdMapping],
+    map_binary: Option<&Path>,
+) -> Result<()> {
+    let mappings: Vec<String> = mappings
+        .iter()
+        .map(|m| format!("{} {} {}", m.container_id, m.host_id, m.size))
+        .collect();
+    if mappings.len() == 1 {
+        utils::write_file(map_file, mappings.first().unwrap())?;
+    } else {
+        Command::new(map_binary.unwrap())
+            .args(mappings)
+            .output()
+            .with_context(|| format!("failed to execute {:?}", map_binary))?;
+    }
+
+    Ok(())
 }
diff --git a/src/rootless.rs b/src/rootless.rs
new file mode 100644
index 000000000..799b39fa8
--- /dev/null
+++ b/src/rootless.rs
@@ -0,0 +1,123 @@
+use std::{env, path::PathBuf};
+
+use anyhow::{bail, Result};
+use nix::sched::CloneFlags;
+use oci_spec::{Linux, LinuxIdMapping, Mount, Spec};
+
+use crate::namespaces::Namespaces;
+
+/// Id mappings of a rootless container and the helper binaries used to write them
+#[derive(Debug, Clone)]
+pub struct Rootless {
+    /// Location of the newuidmap binary
+    pub newuidmap: Option<PathBuf>,
+    /// Location of the newgidmap binary
+    pub newgidmap: Option<PathBuf>,
+    /// Mappings for user ids
+    pub uid_mappings: Vec<LinuxIdMapping>,
+    /// Mappings for group ids
+    pub gid_mappings: Vec<LinuxIdMapping>,
+}
+
+impl From<&Linux> for Rootless {
+    fn from(linux: &Linux) -> Self {
+        Self {
+            newuidmap: None,
+            newgidmap: None,
+            uid_mappings: linux.uid_mappings.clone(),
+            gid_mappings: linux.gid_mappings.clone(),
+        }
+    }
+}
+
+/// Checks if rootless mode should be used
+pub fn should_use_rootless() -> bool {
+    if !nix::unistd::geteuid().is_root() {
+        return true;
+    }
+
+    if let Ok("true") = std::env::var("YOUKI_USE_ROOTLESS").as_deref() {
+        return true;
+    }
+
+    false
+}
+
+/// Validates that the spec contains the required information for
+/// running in rootless mode
+pub fn validate(spec: &Spec) -> Result<()> {
+    let linux = spec.linux.as_ref().unwrap();
+
+    if linux.uid_mappings.is_empty() {
+        bail!("rootless containers require at least one uid mapping");
+    }
+
+    if linux.gid_mappings.is_empty() {
+        bail!("rootless containers require at least one gid mapping")
+    }
+
+    let namespaces: Namespaces = linux.namespaces.clone().into();
+    if !namespaces.clone_flags.contains(CloneFlags::CLONE_NEWUSER) {
+        bail!("rootless containers require the specification of a user namespace");
+    }
+
+    validate_mounts(&spec.mounts, &linux.uid_mappings, &linux.gid_mappings)?;
+
+    Ok(())
+}
+
+/// Checks that every `uid=`/`gid=` mount option refers to an id that is mapped
+/// inside the container
+fn validate_mounts(
+    mounts: &[Mount],
+    uid_mappings: &[LinuxIdMapping],
+    gid_mappings: &[LinuxIdMapping],
+) -> Result<()> {
+    for mount in mounts {
+        for opt in &mount.options {
+            if opt.starts_with("uid=") && !is_id_mapped(&opt[4..], uid_mappings)? {
+                bail!("Mount {:?} specifies option {} which is not mapped inside the rootless container", mount, opt);
+            }
+
+            if opt.starts_with("gid=") && !is_id_mapped(&opt[4..], gid_mappings)? {
+                bail!("Mount {:?} specifies option {} which is not mapped inside the rootless container", mount, opt);
+            }
+        }
+    }
+
+    Ok(())
+}
+
+/// Returns true if `id` falls into the container id range
+/// `[container_id, container_id + size)` of at least one mapping
+fn is_id_mapped(id: &str, mappings: &[LinuxIdMapping]) -> Result<bool> {
+    let id = id.parse::<u32>()?;
+    Ok(mappings
+        .iter()
+        .any(|m| id >= m.container_id && id - m.container_id < m.size))
+}
+
+/// Looks up the location of the newuidmap and newgidmap binaries which
+/// are required to write multiple user/group mappings
+pub fn lookup_map_binaries(spec: &Linux) -> Result<Option<(PathBuf, PathBuf)>> {
+    if spec.uid_mappings.len() == 1 && spec.gid_mappings.len() == 1 {
+        return Ok(None);
+    }
+
+    let uidmap = lookup_map_binary("newuidmap")?;
+    let gidmap = lookup_map_binary("newgidmap")?;
+
+    match (uidmap, gidmap) {
+        (Some(newuidmap), Some(newgidmap)) => Ok(Some((newuidmap, newgidmap))),
+        _ => bail!("newuidmap/newgidmap binaries could not be found in path. This is required if multiple id mappings are specified"),
+    }
+}
+
+/// Searches the directories listed in PATH for `binary` and returns its full path
+fn lookup_map_binary(binary: &str) -> Result<Option<PathBuf>> {
+    let paths = env::var("PATH")?;
+    Ok(paths
+        .split_terminator(':')
+        .map(|dir| PathBuf::from(dir).join(binary))
+        .find(|candidate| candidate.exists()))
+}
diff --git a/src/utils.rs b/src/utils.rs
index 178b3d054..441103dde 100644
--- a/src/utils.rs
+++ b/src/utils.rs
@@ -6,6 +6,7 @@ use std::fs;
 use std::path::{Path, PathBuf};
 use std::time::Duration;
 
+use anyhow::Context;
 use anyhow::{bail, Result};
 use nix::unistd;
 
@@ -88,6 +89,13 @@ pub fn delete_with_retry<P: AsRef<Path>>(path: P) -> Result<()> {
     bail!("could not delete {:?}", path)
 }
 
+/// Writes `contents` to the file at `path`, adding the path to the error on failure
+pub fn write_file<P: AsRef<Path>, C: AsRef<[u8]>>(path: P, contents: C) -> Result<()> {
+    let path = path.as_ref();
+    fs::write(path, contents).with_context(|| format!("failed to write to {:?}", path))?;
+    Ok(())
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;