Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Experimental support for rootless containers #98

Merged
merged 10 commits into from
Jun 19, 2021
4 changes: 2 additions & 2 deletions oci_spec/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -559,9 +559,9 @@ pub enum LinuxSeccompOperator {
#[derive(Serialize, Deserialize, Debug, Clone)]
#[serde(rename_all = "camelCase")]
pub struct Linux {
#[serde(default, rename = "LinuxIDMapping")]
#[serde(default)]
pub uid_mappings: Vec<LinuxIdMapping>,
#[serde(default, rename = "LinuxIDMapping")]
#[serde(default)]
pub gid_mappings: Vec<LinuxIdMapping>,
#[serde(default)]
pub sysctl: HashMap<String, String>,
Expand Down
29 changes: 19 additions & 10 deletions src/create.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,17 @@ use nix::sched;
use nix::unistd;
use nix::unistd::{Gid, Uid};

use crate::cgroups;
use crate::container::{Container, ContainerStatus};
use crate::namespaces::Namespaces;
use crate::notify_socket::NotifyListener;
use crate::process::{fork, Process};
use crate::rootfs;
use crate::rootless::{lookup_map_binaries, should_use_rootless, Rootless};
use crate::stdio::FileDescriptor;
use crate::tty;
use crate::utils;
use crate::{capabilities, command::Command};
use crate::{cgroups, rootless};

/// This is the main structure which stores various commandline options given by
/// high-level container runtime
Expand Down Expand Up @@ -131,19 +132,27 @@ fn run_container<P: AsRef<Path>>(
let linux = spec.linux.as_ref().unwrap();
let namespaces: Namespaces = linux.namespaces.clone().into();

let rootless = if should_use_rootless() {
log::debug!("rootless container should be created");
log::warn!(
"resource constraints and multi id mapping is unimplemented for rootless containers"
);
rootless::validate(&spec)?;
let mut rootless = Rootless::from(linux);
if let Some((uid_binary, gid_binary)) = lookup_map_binaries(linux)? {
rootless.newuidmap = Some(uid_binary);
rootless.newgidmap = Some(gid_binary);
}
Some(rootless)
} else {
None
};

let cgroups_path = utils::get_cgroup_path(&linux.cgroups_path, container.id());
let cmanager = cgroups::common::create_cgroup_manager(&cgroups_path)?;

// first fork, which creates process, which will later create actual container process
match fork::fork_first(
pid_file,
namespaces
.clone_flags
.contains(sched::CloneFlags::CLONE_NEWUSER),
linux,
&container,
cmanager,
)? {
match fork::fork_first(pid_file, rootless, linux, &container, cmanager)? {
// In the parent process, which called run_container
Process::Parent(parent) => Ok(Process::Parent(parent)),
// in child process
Expand Down
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ pub mod notify_socket;
pub mod pipe;
pub mod process;
pub mod rootfs;
pub mod rootless;
pub mod signal;
pub mod start;
pub mod stdio;
Expand Down
7 changes: 6 additions & 1 deletion src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use nix::sys::signal as nix_signal;
use youki::command::linux::LinuxCommand;
use youki::container::{Container, ContainerStatus};
use youki::create;
use youki::rootless::should_use_rootless;
use youki::signal;
use youki::start;

Expand Down Expand Up @@ -79,7 +80,11 @@ fn main() -> Result<()> {
eprintln!("log init failed: {:?}", e);
}

let root_path = PathBuf::from(&opts.root);
let root_path = if should_use_rootless() && opts.root.eq(&PathBuf::from("/run/youki")) {
PathBuf::from("/tmp/rootless")
} else {
PathBuf::from(&opts.root)
};
fs::create_dir_all(&root_path)?;

match opts.subcmd {
Expand Down
31 changes: 12 additions & 19 deletions src/process/child.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
use std::io::ErrorKind;
use std::io::Read;
use std::io::Write;

use anyhow::{bail, Result};
use mio::unix::pipe;
Expand All @@ -9,6 +8,7 @@ use mio::unix::pipe::Sender;
use mio::{Events, Interest, Poll, Token};
use nix::unistd::Pid;

use super::parent::ParentChannel;
use super::{MAX_EVENTS, WAIT_FOR_INIT};
use crate::process::message::Message;

Expand All @@ -18,7 +18,7 @@ const CHILD: Token = Token(1);
/// Contains sending end of pipe for parent process, receiving end of pipe
/// for the init process and poller for that
pub struct ChildProcess {
sender_for_parent: Sender,
parent_channel: ParentChannel,
receiver: Option<Receiver>,
poll: Option<Poll>,
}
Expand All @@ -29,9 +29,9 @@ pub struct ChildProcess {
// a process point of view, init process is child of child process, which is child of original youki process.
impl ChildProcess {
/// create a new Child process structure
pub fn new(sender_for_parent: Sender) -> Result<Self> {
pub fn new(parent_channel: ParentChannel) -> Result<Self> {
Ok(Self {
sender_for_parent,
parent_channel,
receiver: None,
poll: None,
})
Expand All @@ -55,24 +55,17 @@ impl ChildProcess {

/// Indicate that child process has forked the init process to parent process
pub fn notify_parent(&mut self, init_pid: Pid) -> Result<()> {
log::debug!(
"child send to parent {:?}",
(Message::ChildReady as u8).to_be_bytes()
);
// write ChildReady message to the pipe to parent
self.write_message_for_parent(Message::ChildReady)?;
// write pid of init process which is forked by child process to the pipe,
// Pid in nix::unistd is type alias of SessionId which itself is alias of i32
self.sender_for_parent
.write_all(&(init_pid.as_raw()).to_be_bytes())?;
self.parent_channel.send_init_pid(init_pid)?;
Ok(())
}

/// Forwards an identifier-mapping request to the parent via `parent_channel`.
///
/// Any error from the underlying channel call is propagated to the caller.
// NOTE(review): `fork_first` calls this right after unsharing into a new user
// namespace, presumably so the parent writes the uid/gid maps — confirm
// against `ParentChannel::request_identifier_mapping`.
pub fn request_identifier_mapping(&mut self) -> Result<()> {
self.parent_channel.request_identifier_mapping()?;
Ok(())
}

/// writes given message to pipe for the parent
#[inline]
fn write_message_for_parent(&mut self, msg: Message) -> Result<()> {
self.sender_for_parent
.write_all(&(msg as u8).to_be_bytes())?;
pub fn wait_for_mapping_ack(&mut self) -> Result<()> {
self.parent_channel.wait_for_mapping_ack()?;
Ok(())
}

Expand Down
31 changes: 18 additions & 13 deletions src/process/fork.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,25 +14,23 @@ use nix::unistd;
use nix::unistd::Pid;

use crate::cgroups::common::CgroupManager;
use crate::container::Container;
use crate::container::ContainerStatus;
use crate::process::{child, init, parent, Process};
use crate::{container::Container, pipe::Pipe};
use crate::rootless::Rootless;

/// Performs the first fork in order to run the container process
pub fn fork_first<P: AsRef<Path>>(
pid_file: Option<P>,
is_userns: bool,
rootless: Option<Rootless>,
linux: &oci_spec::Linux,
container: &Container,
cmanager: Box<dyn CgroupManager>,
) -> Result<Process> {
// create a new pipe
let cpipe = Pipe::new()?;

// create new parent process structure
let (mut parent, sender_for_parent) = parent::ParentProcess::new()?;
let (mut parent, parent_channel) = parent::ParentProcess::new(rootless.clone())?;
// create a new child process structure with sending end of parent process
let child = child::ChildProcess::new(sender_for_parent)?;
let mut child = child::ChildProcess::new(parent_channel)?;

// fork the process
match unsafe { unistd::fork()? } {
Expand All @@ -51,21 +49,28 @@ pub fn fork_first<P: AsRef<Path>>(
// if new user is specified in specification, this will be true
// and new namespace will be created, check https://man7.org/linux/man-pages/man7/user_namespaces.7.html
// for more information
if is_userns {
if rootless.is_some() {
log::debug!("creating new user namespace");
sched::unshare(sched::CloneFlags::CLONE_NEWUSER)?;

// child needs to be dumpable, otherwise the non root parent is not
// allowed to write the uid/gid maps
prctl::set_dumpable(true).unwrap();
child.request_identifier_mapping()?;
child.wait_for_mapping_ack()?;
prctl::set_dumpable(false).unwrap();
}

cpipe.notify()?;
Ok(Process::Child(child))
}
// in the parent process
unistd::ForkResult::Parent { child } => {
cpipe.wait()?;

// wait for child to fork init process and report back its pid
let init_pid = parent.wait_for_child_ready()?;
let init_pid = parent.wait_for_child_ready(child)?;
log::debug!("init pid is {:?}", init_pid);
cmanager.apply(&linux.resources.as_ref().unwrap(), Pid::from_raw(init_pid))?;
if rootless.is_none() && linux.resources.is_some() {
cmanager.apply(&linux.resources.as_ref().unwrap(), Pid::from_raw(init_pid))?;
}

// update status and pid of the container process
container
Expand Down
4 changes: 4 additions & 0 deletions src/process/message.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,17 @@
pub enum Message {
// The numeric values below are mirrored by the `From<u8>` impl for this enum;
// keep both in sync when adding a variant.
// NOTE(review): presumably sent by the child to the parent once the init
// process has been forked — confirm against the sender.
ChildReady = 0x00,
// NOTE(review): presumably signals that the init process is ready — confirm.
InitReady = 0x01,
// NOTE(review): presumably the child's request that the parent write the
// uid/gid mappings — confirm against `ParentChannel`.
WriteMapping = 0x02,
// NOTE(review): presumably the parent's acknowledgement that the mappings
// were written — confirm against `ParentChannel`.
MappingWritten = 0x03,
}

impl From<u8> for Message {
/// Decodes a single raw byte into the matching `Message` variant.
///
/// # Panics
///
/// Panics with "unknown message." when the byte matches no known variant.
fn from(from: u8) -> Self {
    // Derive the accepted byte values from the enum discriminants so the
    // literals are not repeated here.
    const CHILD_READY: u8 = Message::ChildReady as u8;
    const INIT_READY: u8 = Message::InitReady as u8;
    const WRITE_MAPPING: u8 = Message::WriteMapping as u8;
    const MAPPING_WRITTEN: u8 = Message::MappingWritten as u8;

    match from {
        CHILD_READY => Message::ChildReady,
        INIT_READY => Message::InitReady,
        WRITE_MAPPING => Message::WriteMapping,
        MAPPING_WRITTEN => Message::MappingWritten,
        _ => panic!("unknown message."),
    }
}
Expand Down
2 changes: 2 additions & 0 deletions src/process/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,3 +26,5 @@ const MAX_EVENTS: usize = 128;
const WAIT_FOR_CHILD: Duration = Duration::from_secs(5);
/// Time to wait when polling for message from init process
const WAIT_FOR_INIT: Duration = Duration::from_millis(1000);
/// Time to wait when polling for mapping ack from parent
const WAIT_FOR_MAPPING: Duration = Duration::from_secs(3);
Loading