Skip to content

Commit

Permalink
Use clone instead of fork on pvf (#2477)
Browse files Browse the repository at this point in the history
@mrcnski Done the change on the prepare worker, once the prepare worker
part is good I'll do the same for the execute worker.

This is based on
https://github.com/koute/polkavm/blob/11beebd06276ce9b84f335350138479e714f6caf/crates/polkavm/src/sandbox/linux.rs#L711.

## TODO

- [x] Add a check for this capability at startup
- [x] Add prdoc mentioning the new Secure Validator Mode (optional)
requirement.

## Related

Closes #2162

---------

Co-authored-by: Marcin S <marcin@realemail.net>
  • Loading branch information
jpserrat and mrcnski authored Jan 21, 2024
1 parent caa987d commit 21ef949
Show file tree
Hide file tree
Showing 20 changed files with 860 additions and 394 deletions.
14 changes: 2 additions & 12 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions polkadot/node/core/pvf/common/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ sp-tracing = { path = "../../../../../substrate/primitives/tracing" }

[target.'cfg(target_os = "linux")'.dependencies]
landlock = "0.3.0"
nix = { version = "0.27.1", features = ["sched"] }
seccompiler = "0.4.0"

[dev-dependencies]
Expand Down
5 changes: 3 additions & 2 deletions polkadot/node/core/pvf/common/src/execute.rs
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,11 @@ pub enum JobError {
TimedOut,
#[error("An unexpected panic has occurred in the execution job: {0}")]
Panic(String),
/// Some error occurred when interfacing with the kernel.
#[error("Error interfacing with the kernel: {0}")]
Kernel(String),
#[error("Could not spawn the requested thread: {0}")]
CouldNotSpawnThread(String),
#[error("An error occurred in the CPU time monitor thread: {0}")]
CpuTimeMonitorThread(String),
#[error("Could not set pdeathsig: {0}")]
CouldNotSetPdeathsig(String),
}
13 changes: 8 additions & 5 deletions polkadot/node/core/pvf/common/src/executor_interface.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,15 +140,18 @@ pub unsafe fn create_runtime_from_artifact_bytes(
executor_params: &ExecutorParams,
) -> Result<WasmtimeRuntime, WasmError> {
let mut config = DEFAULT_CONFIG.clone();
config.semantics = params_to_wasmtime_semantics(executor_params);
config.semantics = params_to_wasmtime_semantics(executor_params).0;

sc_executor_wasmtime::create_runtime_from_artifact_bytes::<HostFunctions>(
compiled_artifact_blob,
config,
)
}

pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
/// Takes the default config and overwrites any settings with existing executor parameters.
///
/// Returns the semantics as well as the stack limit (since we are guaranteed to have it).
pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> (Semantics, DeterministicStackLimit) {
let mut sem = DEFAULT_CONFIG.semantics.clone();
let mut stack_limit = sem
.deterministic_stack_limit
Expand All @@ -169,8 +172,8 @@ pub fn params_to_wasmtime_semantics(par: &ExecutorParams) -> Semantics {
ExecutorParam::PvfExecTimeout(_, _) => (), /* Not used here */
}
}
sem.deterministic_stack_limit = Some(stack_limit);
sem
sem.deterministic_stack_limit = Some(stack_limit.clone());
(sem, stack_limit)
}

/// Runs the prevalidation on the given code. Returns a [`RuntimeBlob`] if it succeeds.
Expand All @@ -187,7 +190,7 @@ pub fn prepare(
blob: RuntimeBlob,
executor_params: &ExecutorParams,
) -> Result<Vec<u8>, sc_executor_common::error::WasmError> {
let semantics = params_to_wasmtime_semantics(executor_params);
let (semantics, _) = params_to_wasmtime_semantics(executor_params);
sc_executor_wasmtime::prepare_runtime_artifact(blob, &semantics)
}

Expand Down
2 changes: 2 additions & 0 deletions polkadot/node/core/pvf/common/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,8 @@ pub struct SecurityStatus {
pub can_enable_seccomp: bool,
/// Whether we are able to unshare the user namespace and change the filesystem root.
pub can_unshare_user_namespace_and_change_root: bool,
/// Whether we are able to call `clone` with all sandboxing flags.
pub can_do_secure_clone: bool,
}

/// A handshake with information for the worker.
Expand Down
160 changes: 128 additions & 32 deletions polkadot/node/core/pvf/common/src/worker/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,19 @@
pub mod security;

use crate::{framed_recv_blocking, WorkerHandshake, LOG_TARGET};
use crate::{framed_recv_blocking, SecurityStatus, WorkerHandshake, LOG_TARGET};
use cpu_time::ProcessTime;
use futures::never::Never;
use parity_scale_codec::Decode;
use std::{
any::Any,
fmt, io,
os::unix::net::UnixStream,
fmt::{self},
fs::File,
io::{self, Read, Write},
os::{
fd::{AsRawFd, FromRawFd, RawFd},
unix::net::UnixStream,
},
path::PathBuf,
sync::mpsc::{Receiver, RecvTimeoutError},
time::Duration,
Expand Down Expand Up @@ -78,7 +83,7 @@ macro_rules! decl_worker_main {

"--check-can-enable-landlock" => {
#[cfg(target_os = "linux")]
let status = if let Err(err) = security::landlock::check_is_fully_enabled() {
let status = if let Err(err) = security::landlock::check_can_fully_enable() {
// Write the error to stderr, log it on the host-side.
eprintln!("{}", err);
-1
Expand All @@ -91,7 +96,7 @@ macro_rules! decl_worker_main {
},
"--check-can-enable-seccomp" => {
#[cfg(all(target_os = "linux", target_arch = "x86_64"))]
let status = if let Err(err) = security::seccomp::check_is_fully_enabled() {
let status = if let Err(err) = security::seccomp::check_can_fully_enable() {
// Write the error to stderr, log it on the host-side.
eprintln!("{}", err);
-1
Expand All @@ -107,7 +112,7 @@ macro_rules! decl_worker_main {
let cache_path_tempdir = std::path::Path::new(&args[2]);
#[cfg(target_os = "linux")]
let status = if let Err(err) =
security::change_root::check_is_fully_enabled(&cache_path_tempdir)
security::change_root::check_can_fully_enable(&cache_path_tempdir)
{
// Write the error to stderr, log it on the host-side.
eprintln!("{}", err);
Expand All @@ -119,6 +124,21 @@ macro_rules! decl_worker_main {
let status = -1;
std::process::exit(status)
},
"--check-can-do-secure-clone" => {
#[cfg(target_os = "linux")]
// SAFETY: new process is spawned within a single threaded process. This
// invariant is enforced by tests.
let status = if let Err(err) = unsafe { security::clone::check_can_fully_clone() } {
// Write the error to stderr, log it on the host-side.
eprintln!("{}", err);
-1
} else {
0
};
#[cfg(not(target_os = "linux"))]
let status = -1;
std::process::exit(status)
},

"test-sleep" => {
std::thread::sleep(std::time::Duration::from_secs(5));
Expand Down Expand Up @@ -171,6 +191,84 @@ macro_rules! decl_worker_main {
};
}

//taken from the os_pipe crate. Copied here to reduce one dependency and
// because its type-safe abstractions do not play well with nix's clone
#[cfg(not(target_os = "macos"))]
pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
let mut fds: [libc::c_int; 2] = [0; 2];
let res = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) };
if res != 0 {
return Err(io::Error::last_os_error())
}
Ok((fds[0], fds[1]))
}

#[cfg(target_os = "macos")]
pub fn pipe2_cloexec() -> io::Result<(libc::c_int, libc::c_int)> {
let mut fds: [libc::c_int; 2] = [0; 2];
let res = unsafe { libc::pipe(fds.as_mut_ptr()) };
if res != 0 {
return Err(io::Error::last_os_error())
}
let res = unsafe { libc::fcntl(fds[0], libc::F_SETFD, libc::FD_CLOEXEC) };
if res != 0 {
return Err(io::Error::last_os_error())
}
let res = unsafe { libc::fcntl(fds[1], libc::F_SETFD, libc::FD_CLOEXEC) };
if res != 0 {
return Err(io::Error::last_os_error())
}
Ok((fds[0], fds[1]))
}

/// A wrapper around a file descriptor used to encapsulate and restrict
/// functionality for pipe operations.
pub struct PipeFd {
file: File,
}

impl AsRawFd for PipeFd {
/// Returns the raw file descriptor associated with this `PipeFd`
fn as_raw_fd(&self) -> RawFd {
self.file.as_raw_fd()
}
}

impl FromRawFd for PipeFd {
/// Creates a new `PipeFd` instance from a raw file descriptor.
///
/// # Safety
///
/// The fd passed in must be an owned file descriptor; in particular, it must be open.
unsafe fn from_raw_fd(fd: RawFd) -> Self {
PipeFd { file: File::from_raw_fd(fd) }
}
}

impl Read for PipeFd {
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
self.file.read(buf)
}

fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
self.file.read_to_end(buf)
}
}

impl Write for PipeFd {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.file.write(buf)
}

fn flush(&mut self) -> io::Result<()> {
self.file.flush()
}

fn write_all(&mut self, buf: &[u8]) -> io::Result<()> {
self.file.write_all(buf)
}
}

/// Some allowed overhead that we account for in the "CPU time monitor" thread's sleeps, on the
/// child process.
pub const JOB_TIMEOUT_OVERHEAD: Duration = Duration::from_millis(50);
Expand All @@ -192,14 +290,12 @@ impl fmt::Display for WorkerKind {
}
}

// Some fields are only used for logging, and dead-code analysis ignores Debug.
#[allow(dead_code)]
#[derive(Debug)]
pub struct WorkerInfo {
pid: u32,
kind: WorkerKind,
version: Option<String>,
worker_dir_path: PathBuf,
pub pid: u32,
pub kind: WorkerKind,
pub version: Option<String>,
pub worker_dir_path: PathBuf,
}

// NOTE: The worker version must be passed in so that we accurately get the version of the worker,
Expand All @@ -218,7 +314,7 @@ pub fn run_worker<F>(
worker_version: Option<&str>,
mut event_loop: F,
) where
F: FnMut(UnixStream, PathBuf) -> io::Result<Never>,
F: FnMut(UnixStream, &WorkerInfo, SecurityStatus) -> io::Result<Never>,
{
#[cfg_attr(not(target_os = "linux"), allow(unused_mut))]
let mut worker_info = WorkerInfo {
Expand Down Expand Up @@ -250,11 +346,8 @@ pub fn run_worker<F>(
}

// Make sure that we can read the worker dir path, and log its contents.
let entries = || -> Result<Vec<_>, io::Error> {
std::fs::read_dir(&worker_info.worker_dir_path)?
.map(|res| res.map(|e| e.file_name()))
.collect()
}();
let entries: io::Result<Vec<_>> = std::fs::read_dir(&worker_info.worker_dir_path)
.and_then(|d| d.map(|res| res.map(|e| e.file_name())).collect());
match entries {
Ok(entries) =>
gum::trace!(target: LOG_TARGET, ?worker_info, "content of worker dir: {:?}", entries),
Expand Down Expand Up @@ -284,6 +377,22 @@ pub fn run_worker<F>(
{
gum::trace!(target: LOG_TARGET, ?security_status, "Enabling security features");

// First, make sure env vars were cleared, to match the environment we perform the checks
// within. (In theory, running checks with different env vars could result in different
// outcomes of the checks.)
if !security::check_env_vars_were_cleared(&worker_info) {
let err = "not all env vars were cleared when spawning the process";
gum::error!(
target: LOG_TARGET,
?worker_info,
"{}",
err
);
if security_status.secure_validator_mode {
worker_shutdown(worker_info, err);
}
}

// Call based on whether we can change root. Error out if it should work but fails.
//
// NOTE: This should not be called in a multi-threaded context (i.e. inside the tokio
Expand Down Expand Up @@ -337,23 +446,10 @@ pub fn run_worker<F>(
}
}
}

if !security::check_env_vars_were_cleared(&worker_info) {
let err = "not all env vars were cleared when spawning the process";
gum::error!(
target: LOG_TARGET,
?worker_info,
"{}",
err
);
if security_status.secure_validator_mode {
worker_shutdown(worker_info, err);
}
}
}

// Run the main worker loop.
let err = event_loop(stream, worker_info.worker_dir_path.clone())
let err = event_loop(stream, &worker_info, security_status)
// It's never `Ok` because it's `Ok(Never)`.
.unwrap_err();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,7 @@ pub fn enable_for_worker(worker_info: &WorkerInfo) -> Result<()> {
///
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
/// "CLONE_NEWUSER requires that the calling process is not threaded."
#[cfg(target_os = "linux")]
pub fn check_is_fully_enabled(tempdir: &Path) -> Result<()> {
pub fn check_can_fully_enable(tempdir: &Path) -> Result<()> {
let worker_dir_path = tempdir.to_owned();
try_restrict(&WorkerInfo {
pid: std::process::id(),
Expand All @@ -69,7 +68,6 @@ pub fn check_is_fully_enabled(tempdir: &Path) -> Result<()> {
///
/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`:
/// "CLONE_NEWUSER requires that the calling process is not threaded."
#[cfg(target_os = "linux")]
fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
// TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723
macro_rules! cstr_ptr {
Expand All @@ -78,12 +76,6 @@ fn try_restrict(worker_info: &WorkerInfo) -> Result<()> {
};
}

gum::trace!(
target: LOG_TARGET,
?worker_info,
"unsharing the user namespace and calling pivot_root",
);

let worker_dir_path_c = CString::new(worker_info.worker_dir_path.as_os_str().as_bytes())
.expect("on unix; the path will never contain 0 bytes; qed");

Expand Down
Loading

0 comments on commit 21ef949

Please sign in to comment.