From 6009a8533a08bc6a8f5f8208cee7a592276fac2b Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Sun, 24 Sep 2023 07:26:02 -0300 Subject: [PATCH 01/47] change prepare worker to use fork instead of threads --- .../node/core/pvf/prepare-worker/src/lib.rs | 385 ++++++++++-------- 1 file changed, 222 insertions(+), 163 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index caa7d33df12a..a82fe9bef9f0 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -20,6 +20,7 @@ mod executor_intf; mod memory_stats; pub use executor_intf::{prepare, prevalidate}; +use libc::{c_int, exit, rusage, write}; // NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-prepare-worker=trace`. @@ -37,18 +38,17 @@ use polkadot_node_core_pvf_common::{ prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ - bytes_to_path, cpu_time_monitor_loop, + bytes_to_path, security::LandlockStatus, - stringify_panic_payload, - thread::{self, WaitOutcome}, + thread::{self}, worker_event_loop, }, - ProcessTime, }; use polkadot_primitives::ExecutorParams; use std::{ + ffi::c_void, + mem, path::PathBuf, - sync::{mpsc::channel, Arc}, time::Duration, }; use tokio::{io, net::UnixStream}; @@ -143,173 +143,56 @@ pub fn worker_entrypoint( let prepare_job_kind = pvf.prep_kind(); let executor_params = (*pvf.executor_params()).clone(); - // Conditional variable to notify us when a thread is done. - let condvar = thread::get_condvar(); - - // Run the memory tracker in a regular, non-worker thread. 
- #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let condvar_memory = Arc::clone(&condvar); - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory)); - - let cpu_time_start = ProcessTime::now(); - - // Spawn a new thread that runs the CPU time monitor. - let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>(); - let cpu_time_monitor_thread = thread::spawn_worker_thread( - "cpu time monitor thread", - move || { - cpu_time_monitor_loop( - cpu_time_start, - preparation_timeout, - cpu_time_monitor_rx, - ) - }, - Arc::clone(&condvar), - WaitOutcome::TimedOut, - )?; - // Spawn another thread for preparation. - let prepare_thread = thread::spawn_worker_thread( - "prepare thread", - move || { - // Try to enable landlock. - #[cfg(target_os = "linux")] - let landlock_status = polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread() - .map(LandlockStatus::from_ruleset_status) - .map_err(|e| e.to_string()); - #[cfg(not(target_os = "linux"))] - let landlock_status: Result = Ok(LandlockStatus::NotEnforced); - - #[allow(unused_mut)] - let mut result = prepare_artifact(pvf, cpu_time_start); - - // Get the `ru_maxrss` stat. If supported, call getrusage for the thread. - #[cfg(target_os = "linux")] - let mut result = result - .map(|(artifact, elapsed)| (artifact, elapsed, get_max_rss_thread())); - - // If we are pre-checking, check for runtime construction errors. - // - // As pre-checking is more strict than just preparation in terms of memory - // and time, it is okay to do extra checks here. This takes negligible time - // anyway. 
- if let PrepareJobKind::Prechecking = prepare_job_kind { - result = result.and_then(|output| { - runtime_construction_check(output.0.as_ref(), executor_params)?; - Ok(output) - }); - } - - (result, landlock_status) - }, - Arc::clone(&condvar), - WaitOutcome::Finished, - )?; - - let outcome = thread::wait_for_threads(condvar); - - let result = match outcome { - WaitOutcome::Finished => { - let _ = cpu_time_monitor_tx.send(()); - - match prepare_thread.join().unwrap_or_else(|err| { - ( - Err(PrepareError::Panic(stringify_panic_payload(err))), - Ok(LandlockStatus::Unavailable), - ) - }) { - (Err(err), _) => { - // Serialized error will be written into the socket. - Err(err) - }, - (Ok(ok), landlock_status) => { - #[cfg(not(target_os = "linux"))] - let (artifact, cpu_time_elapsed) = ok; - #[cfg(target_os = "linux")] - let (artifact, cpu_time_elapsed, max_rss) = ok; - - // Stop the memory stats worker and get its observed memory stats. - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid) - .await; - let memory_stats = MemoryStats { - #[cfg(any( - target_os = "linux", - feature = "jemalloc-allocator" - ))] - memory_tracker_stats, - #[cfg(target_os = "linux")] - max_rss: extract_max_rss_stat(max_rss, worker_pid), - }; - - // Log if landlock threw an error. - if let Err(err) = landlock_status { - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "error enabling landlock: {}", - err - ); - } - - // Write the serialized artifact into a temp file. - // - // PVF host only keeps artifacts statuses in its memory, - // successfully compiled code gets stored on the disk (and - // consequently deserialized by execute-workers). The prepare worker - // is only required to send `Ok` to the pool to indicate the - // success. 
- - gum::debug!( - target: LOG_TARGET, - %worker_pid, - "worker: writing artifact to {}", - temp_artifact_dest.display(), - ); - tokio::fs::write(&temp_artifact_dest, &artifact).await?; - - Ok(PrepareStats { cpu_time_elapsed, memory_stats }) - }, + let mut rusage_before = unsafe { mem::zeroed() }; + let mut rusage_after = unsafe { mem::zeroed() }; + if unsafe { libc::getrusage(libc::RUSAGE_CHILDREN, &mut rusage_before) } == -1 { + send_response(&mut stream, + Err(PrepareError::Panic(format!("error getting children resource usage for worker pid {}", worker_pid)))) + .await?; + }; + let mut pipe_fds = [0; 2]; + if unsafe { libc::pipe(pipe_fds.as_mut_ptr()) } == -1 { + send_response(&mut stream, Err(PrepareError::Panic(format!("error creating pipe {}", worker_pid)))).await?; + } + + let pipe_read = pipe_fds[0]; + let pipe_write = pipe_fds[1]; + let result = match unsafe { libc::fork() } { + // error + -1 => Err(PrepareError::Panic(String::from("error forking"))), + // child + 0 => { + unsafe { + handle_child_process( + pvf, + pipe_write, + pipe_read, + preparation_timeout, + prepare_job_kind, + executor_params, + ).await; + Err(PrepareError::Panic(String::from("unreachable"))) } }, - // If the CPU thread is not selected, we signal it to end, the join handle is - // dropped and the thread will finish in the background. - WaitOutcome::TimedOut => { - match cpu_time_monitor_thread.join() { - Ok(Some(cpu_time_elapsed)) => { - // Log if we exceed the timeout and the other thread hasn't - // finished. - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "prepare job took {}ms cpu time, exceeded prepare timeout {}ms", - cpu_time_elapsed.as_millis(), - preparation_timeout.as_millis(), - ); - Err(PrepareError::TimedOut) - }, - Ok(None) => Err(PrepareError::IoErr( - "error communicating over closed channel".into(), - )), - // Errors in this thread are independent of the PVF. 
- Err(err) => Err(PrepareError::IoErr(stringify_panic_payload(err))), + // parent + _ => unsafe { handle_parent_process( + &mut rusage_after, + rusage_before, + pipe_read, + pipe_write, + temp_artifact_dest, + preparation_timeout, + worker_pid, + ).await } - }, - WaitOutcome::Pending => unreachable!( - "we run wait_while until the outcome is no longer pending; qed" - ), }; - send_response(&mut stream, result).await?; } }, ); } -fn prepare_artifact( - pvf: PvfPrepData, - cpu_time_start: ProcessTime, -) -> Result<(CompiledArtifact, Duration), PrepareError> { +fn prepare_artifact(pvf: PvfPrepData) -> Result { let blob = match prevalidate(&pvf.code()) { Err(err) => return Err(PrepareError::Prevalidation(format!("{:?}", err))), Ok(b) => b, @@ -319,7 +202,6 @@ fn prepare_artifact( Ok(compiled_artifact) => Ok(CompiledArtifact::new(compiled_artifact)), Err(err) => Err(PrepareError::Preparation(format!("{:?}", err))), } - .map(|artifact| (artifact, cpu_time_start.elapsed())) } /// Try constructing the runtime to catch any instantiation errors during pre-checking. @@ -336,3 +218,180 @@ fn runtime_construction_check( .map(|_runtime| ()) .map_err(|err| PrepareError::RuntimeConstruction(format!("{:?}", err))) } + +struct Response { + artifact_result: Result, + landlock_status: Result, + memory_stats: MemoryStats, +} + +async unsafe fn handle_child_process( + pvf: PvfPrepData, + pipe_write: c_int, + pipe_read: c_int, + preparation_timeout: Duration, + prepare_job_kind: PrepareJobKind, + executor_params: ExecutorParams, +) { + if libc::close(pipe_read) == -1 { + exit(libc::EXIT_FAILURE) + } + + let worker_pid = std::process::id(); + // Run the memory tracker in a regular, non-worker thread. 
+ #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + let condvar_memory = thread::get_condvar(); + #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory)); + if libc::setrlimit( + libc::RLIMIT_CPU, + &mut libc::rlimit { + rlim_cur: preparation_timeout.as_secs(), + rlim_max: preparation_timeout.as_secs(), + }, + ) == -1 { + exit(libc::EXIT_FAILURE); +}; + + // Try to enable landlock. + #[cfg(target_os = "linux")] + let landlock_status = + polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread() + .map(LandlockStatus::from_ruleset_status) + .map_err(|e| e.to_string()); + #[cfg(not(target_os = "linux"))] + let landlock_status: Result = Ok(LandlockStatus::NotEnforced); + + #[allow(unused_mut)] + let mut artifact_result = prepare_artifact(pvf); + + // If we are pre-checking, check for runtime construction errors. + // + // As pre-checking is more strict than just preparation in terms of memory + // and time, it is okay to do extra checks here. This takes negligible time + // anyway. + if let PrepareJobKind::Prechecking = prepare_job_kind { + artifact_result = artifact_result.and_then(|output| { + runtime_construction_check(output.as_ref(), executor_params)?; + Ok(output) + }); + } + + // Get the `ru_maxrss` stat. If supported, call getrusage for the thread. + #[cfg(target_os = "linux")] + let max_rss = get_max_rss_thread(); + + // // Stop the memory stats worker and get its observed memory stats. 
+ #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid).await; + + let memory_stats = MemoryStats { + #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + memory_tracker_stats, + #[cfg(target_os = "linux")] + max_rss: extract_max_rss_stat(max_rss, worker_pid), + }; + + let response = Response { artifact_result, landlock_status, memory_stats }; + + if write(pipe_write, &response as *const _ as *const c_void, mem::size_of::()) == -1 { + exit(libc::EXIT_FAILURE); + } + + if libc::close(pipe_write) == -1 { + exit(libc::EXIT_FAILURE); + } +} + +async unsafe fn handle_parent_process( + rusage_after: &mut rusage, + rusage_before: rusage, + pipe_read: c_int, + pipe_write: c_int, + temp_artifact_dest: PathBuf, + preparation_timeout: Duration, + worker_pid: u32 +) -> Result { + if libc::close(pipe_write) == -1 { + return Err(PrepareError::Panic(String::from("error closing pipe write end on the parent process"))); + } + + let mut status: c_int = 0; + if libc::wait(&mut status) == -1 { + return Err(PrepareError::Panic(String::from("error waiting for child"))) + } + + if libc::getrusage(libc::RUSAGE_CHILDREN, rusage_after) == -1 { + return Err(PrepareError::Panic(String::from("error getting resource usage from child"))) + }; + + let cpu_tv = rusage_after.ru_utime.tv_sec; + let cpu_duration = Duration::from_micros(cpu_tv as u64); + + return match status { + libc::EXIT_SUCCESS => { + let mut result: Response = mem::zeroed(); + let data_size = mem::size_of::(); + + if libc::read(pipe_read, &mut result as *mut _ as *mut c_void, data_size) == -1 { + return Err(PrepareError::Panic(String::from("error reading pipe from child"))) + }; + + if libc::close(pipe_read) == -1 { + return Err(PrepareError::Panic(String::from("error reading pipe from child"))) + } + + match result.artifact_result { + Err(err) => Err(err), + Ok(artifact) => { + + // Log if landlock threw an 
error. + if let Err(err) = &result.landlock_status { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "error enabling landlock: {}", + err + ); + } + + // Write the serialized artifact into a temp file. + // + // PVF host only keeps artifacts statuses in its memory, + // successfully compiled code gets stored on the disk (and + // consequently deserialized by execute-workers). The prepare worker + // is only required to send `Ok` to the pool to indicate the + // success. + gum::debug!( + target: LOG_TARGET, + %worker_pid, + "worker: writing artifact to {}", + temp_artifact_dest.display(), + ); + if let Err(err) = tokio::fs::write(&temp_artifact_dest, &artifact).await { + return Err(PrepareError::Panic(format!("{:?}", err))); + }; + + Ok(PrepareStats { + cpu_time_elapsed: cpu_duration, + memory_stats: result.memory_stats, + }) + }, + } + }, + libc::SIGXCPU => { + // Log if we exceed the timeout and the other thread hasn't + // finished. + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "prepare job took {}ms cpu time, exceeded prepare timeout {}ms", + cpu_duration.as_millis(), + preparation_timeout.as_millis(), + ); + Err(PrepareError::TimedOut) + }, + status => Err(PrepareError::Panic(format!("child failed with status {}", status))), + } +} + From cbed258c994f13d9c27b051d4bc116ccece20329 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Sun, 24 Sep 2023 16:11:00 -0300 Subject: [PATCH 02/47] get total cpu time for current child --- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index a82fe9bef9f0..180306f7b977 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -325,7 +325,8 @@ async unsafe fn handle_parent_process( return Err(PrepareError::Panic(String::from("error getting resource usage from child"))) }; - let 
cpu_tv = rusage_after.ru_utime.tv_sec; + // Get total cpu usage for current child + let cpu_tv = get_total_cpu_usage(*rusage_after) - get_total_cpu_usage(rusage_before); let cpu_duration = Duration::from_micros(cpu_tv as u64); return match status { @@ -395,3 +396,7 @@ async unsafe fn handle_parent_process( } } +fn get_total_cpu_usage(rusage: libc::rusage) -> i64 { + return rusage.ru_utime.tv_sec * 1000000 + rusage.ru_utime.tv_usec + rusage.ru_stime.tv_sec * 1000000 + rusage.ru_stime.tv_usec; +} + From 08c77661cddd5e560e362f830b22ad544b340a72 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Sun, 24 Sep 2023 16:21:56 -0300 Subject: [PATCH 03/47] fix fmt and int cast --- .../node/core/pvf/prepare-worker/src/lib.rs | 93 ++++++++++--------- 1 file changed, 51 insertions(+), 42 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 180306f7b977..3ce6f9940aae 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -45,12 +45,7 @@ use polkadot_node_core_pvf_common::{ }, }; use polkadot_primitives::ExecutorParams; -use std::{ - ffi::c_void, - mem, - path::PathBuf, - time::Duration, -}; +use std::{ffi::c_void, mem, path::PathBuf, time::Duration}; use tokio::{io, net::UnixStream}; /// Contains the bytes for a successfully compiled artifact. 
@@ -146,13 +141,22 @@ pub fn worker_entrypoint( let mut rusage_before = unsafe { mem::zeroed() }; let mut rusage_after = unsafe { mem::zeroed() }; if unsafe { libc::getrusage(libc::RUSAGE_CHILDREN, &mut rusage_before) } == -1 { - send_response(&mut stream, - Err(PrepareError::Panic(format!("error getting children resource usage for worker pid {}", worker_pid)))) - .await?; + send_response( + &mut stream, + Err(PrepareError::Panic(format!( + "error getting children resource usage for worker pid {}", + worker_pid + ))), + ) + .await?; }; let mut pipe_fds = [0; 2]; if unsafe { libc::pipe(pipe_fds.as_mut_ptr()) } == -1 { - send_response(&mut stream, Err(PrepareError::Panic(format!("error creating pipe {}", worker_pid)))).await?; + send_response( + &mut stream, + Err(PrepareError::Panic(format!("error creating pipe {}", worker_pid))), + ) + .await?; } let pipe_read = pipe_fds[0]; @@ -161,30 +165,31 @@ pub fn worker_entrypoint( // error -1 => Err(PrepareError::Panic(String::from("error forking"))), // child - 0 => { - unsafe { - handle_child_process( - pvf, - pipe_write, - pipe_read, - preparation_timeout, - prepare_job_kind, - executor_params, - ).await; - Err(PrepareError::Panic(String::from("unreachable"))) - } + 0 => unsafe { + handle_child_process( + pvf, + pipe_write, + pipe_read, + preparation_timeout, + prepare_job_kind, + executor_params, + ) + .await; + Err(PrepareError::Panic(String::from("unreachable"))) }, // parent - _ => unsafe { handle_parent_process( - &mut rusage_after, - rusage_before, - pipe_read, - pipe_write, - temp_artifact_dest, - preparation_timeout, - worker_pid, - ).await - } + _ => unsafe { + handle_parent_process( + &mut rusage_after, + rusage_before, + pipe_read, + pipe_write, + temp_artifact_dest, + preparation_timeout, + worker_pid, + ) + .await + }, }; send_response(&mut stream, result).await?; } @@ -243,15 +248,16 @@ async unsafe fn handle_child_process( let condvar_memory = thread::get_condvar(); #[cfg(any(target_os = "linux", feature = 
"jemalloc-allocator"))] let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory)); - if libc::setrlimit( + if libc::setrlimit( libc::RLIMIT_CPU, &mut libc::rlimit { rlim_cur: preparation_timeout.as_secs(), rlim_max: preparation_timeout.as_secs(), }, - ) == -1 { - exit(libc::EXIT_FAILURE); -}; + ) == -1 + { + exit(libc::EXIT_FAILURE); + }; // Try to enable landlock. #[cfg(target_os = "linux")] @@ -310,10 +316,12 @@ async unsafe fn handle_parent_process( pipe_write: c_int, temp_artifact_dest: PathBuf, preparation_timeout: Duration, - worker_pid: u32 + worker_pid: u32, ) -> Result { if libc::close(pipe_write) == -1 { - return Err(PrepareError::Panic(String::from("error closing pipe write end on the parent process"))); + return Err(PrepareError::Panic(String::from( + "error closing pipe write end on the parent process", + ))) } let mut status: c_int = 0; @@ -345,7 +353,6 @@ async unsafe fn handle_parent_process( match result.artifact_result { Err(err) => Err(err), Ok(artifact) => { - // Log if landlock threw an error. 
if let Err(err) = &result.landlock_status { gum::warn!( @@ -370,7 +377,7 @@ async unsafe fn handle_parent_process( temp_artifact_dest.display(), ); if let Err(err) = tokio::fs::write(&temp_artifact_dest, &artifact).await { - return Err(PrepareError::Panic(format!("{:?}", err))); + return Err(PrepareError::Panic(format!("{:?}", err))) }; Ok(PrepareStats { @@ -397,6 +404,8 @@ async unsafe fn handle_parent_process( } fn get_total_cpu_usage(rusage: libc::rusage) -> i64 { - return rusage.ru_utime.tv_sec * 1000000 + rusage.ru_utime.tv_usec + rusage.ru_stime.tv_sec * 1000000 + rusage.ru_stime.tv_usec; + return rusage.ru_utime.tv_sec * 1000000 + + (rusage.ru_utime.tv_usec as i64) + + rusage.ru_stime.tv_sec * 1000000 + + (rusage.ru_stime.tv_usec as i64) } - From 40429a2bc497fc1fa1ae6db20a41c26424c1583a Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Mon, 25 Sep 2023 06:33:30 -0300 Subject: [PATCH 04/47] exit from child job on success for prepare worker --- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 3ce6f9940aae..6c5f7c7122e5 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -307,6 +307,8 @@ async unsafe fn handle_child_process( if libc::close(pipe_write) == -1 { exit(libc::EXIT_FAILURE); } + + exit(libc::EXIT_SUCCESS); } async unsafe fn handle_parent_process( From 43005661a6653ac6afd3b96ecaa9df789ce3f7a8 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Sun, 1 Oct 2023 18:30:49 -0300 Subject: [PATCH 05/47] prepare worker: change entrypoint docstring, remove cpu time thread --- Cargo.lock | 28 +- polkadot/node/core/pvf/common/Cargo.toml | 1 + polkadot/node/core/pvf/common/src/error.rs | 3 +- polkadot/node/core/pvf/common/src/prepare.rs | 5 +- .../core/pvf/common/src/worker/security.rs | 3 + .../node/core/pvf/prepare-worker/Cargo.toml 
| 4 + .../node/core/pvf/prepare-worker/src/lib.rs | 296 ++++++++---------- .../node/core/pvf/src/prepare/worker_intf.rs | 15 +- 8 files changed, 178 insertions(+), 177 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ccf546b16b6b..476337accbd9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8131,6 +8131,17 @@ dependencies = [ "static_assertions", ] +[[package]] +name = "nix" +version = "0.27.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2eb04e9c688eff1c89d72b407f168cf79bb9e867a9d3323ed6c01519eb9cc053" +dependencies = [ + "bitflags 2.4.0", + "cfg-if", + "libc", +] + [[package]] name = "no-std-net" version = "0.6.0" @@ -8739,6 +8750,16 @@ dependencies = [ "num-traits", ] +[[package]] +name = "os_pipe" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ae859aa07428ca9a929b936690f8b12dc5f11dd8c6992a18ca93919f28bc177" +dependencies = [ + "libc", + "windows-sys 0.48.0", +] + [[package]] name = "os_str_bytes" version = "6.5.1" @@ -12081,6 +12102,7 @@ dependencies = [ "sc-executor", "sc-executor-common", "sc-executor-wasmtime", + "serde", "sp-core", "sp-externalities", "sp-io", @@ -12112,8 +12134,11 @@ dependencies = [ name = "polkadot-node-core-pvf-prepare-worker" version = "1.0.0" dependencies = [ + "bincode", "futures", "libc", + "nix 0.27.1", + "os_pipe", "parity-scale-codec", "polkadot-node-core-pvf-common", "polkadot-parachain-primitives", @@ -12122,6 +12147,7 @@ dependencies = [ "sc-executor", "sc-executor-common", "sc-executor-wasmtime", + "serde", "sp-io", "sp-maybe-compressed-blob", "sp-tracing", @@ -19257,7 +19283,7 @@ checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" dependencies = [ "cfg-if", "digest 0.10.7", - "rand 0.7.3", + "rand 0.8.5", "static_assertions", ] diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 621f7e24f72b..5b6bab942486 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml 
+++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -26,6 +26,7 @@ sp-core = { path = "../../../../../substrate/primitives/core" } sp-externalities = { path = "../../../../../substrate/primitives/externalities" } sp-io = { path = "../../../../../substrate/primitives/io" } sp-tracing = { path = "../../../../../substrate/primitives/tracing" } +serde = { version = "1.0.188", features = ["derive"] } [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.2.0" diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 6eb0d9b7df42..50b6db3f4843 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -17,13 +17,14 @@ use crate::prepare::PrepareStats; use parity_scale_codec::{Decode, Encode}; use std::fmt; +use serde::{Deserialize, Serialize}; /// Result of PVF preparation performed by the validation host. Contains stats about the preparation /// if successful pub type PrepareResult = Result; /// An error that occurred during the prepare part of the PVF pipeline. -#[derive(Debug, Clone, Encode, Decode)] +#[derive(Debug, Clone, Encode, Decode, Deserialize, Serialize)] pub enum PrepareError { /// During the prevalidation stage of preparation an issue was found with the PVF. Prevalidation(String), diff --git a/polkadot/node/core/pvf/common/src/prepare.rs b/polkadot/node/core/pvf/common/src/prepare.rs index c205eddfb8b1..4f412ddd2c9e 100644 --- a/polkadot/node/core/pvf/common/src/prepare.rs +++ b/polkadot/node/core/pvf/common/src/prepare.rs @@ -15,19 +15,18 @@ // along with Polkadot. If not, see . use parity_scale_codec::{Decode, Encode}; +use serde::{Deserialize, Serialize}; /// Preparation statistics, including the CPU time and memory taken. #[derive(Debug, Clone, Default, Encode, Decode)] pub struct PrepareStats { - /// The CPU time that elapsed for the preparation job. 
- pub cpu_time_elapsed: std::time::Duration, /// The observed memory statistics for the preparation job. pub memory_stats: MemoryStats, } /// Helper struct to contain all the memory stats, including `MemoryAllocationStats` and, if /// supported by the OS, `ru_maxrss`. -#[derive(Clone, Debug, Default, Encode, Decode)] +#[derive(Clone, Debug, Default, Encode, Decode, Serialize, Deserialize)] pub struct MemoryStats { /// Memory stats from `tikv_jemalloc_ctl`. #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs index 6c5f96e0b5db..3ba4f5e56236 100644 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ b/polkadot/node/core/pvf/common/src/worker/security.rs @@ -18,8 +18,11 @@ //! //! This is needed because workers are used to compile and execute untrusted code (PVFs). +use serde::{Deserialize, Serialize}; + /// To what degree landlock is enabled. It's a separate struct from `RulesetStatus` because that is /// only available on Linux, plus this has a nicer name. 
+#[derive(Serialize, Deserialize, Debug)] pub enum LandlockStatus { FullyEnforced, PartiallyEnforced, diff --git a/polkadot/node/core/pvf/prepare-worker/Cargo.toml b/polkadot/node/core/pvf/prepare-worker/Cargo.toml index e7a12cd9a809..26fd9c830fa0 100644 --- a/polkadot/node/core/pvf/prepare-worker/Cargo.toml +++ b/polkadot/node/core/pvf/prepare-worker/Cargo.toml @@ -13,6 +13,10 @@ libc = "0.2.139" rayon = "1.5.1" tikv-jemalloc-ctl = { version = "0.5.0", optional = true } tokio = { version = "1.24.2", features = ["fs", "process"] } +os_pipe = "1.1.4" +nix = { version = "0.27.1", features = ["resource", "process"]} +serde = { version = "1.0.188", features = ["derive"]} +bincode = "1.3.3" parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 6c5f7c7122e5..56262d540b31 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -20,7 +20,7 @@ mod executor_intf; mod memory_stats; pub use executor_intf::{prepare, prevalidate}; -use libc::{c_int, exit, rusage, write}; +use libc; // NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-prepare-worker=trace`. 
@@ -45,10 +45,19 @@ use polkadot_node_core_pvf_common::{ }, }; use polkadot_primitives::ExecutorParams; -use std::{ffi::c_void, mem, path::PathBuf, time::Duration}; +use std::{mem, path::PathBuf, process, time::Duration}; +use std::io::{Read, Write}; +use std::sync::Arc; +use futures::TryFutureExt; +use nix::sys::resource::Resource; +use os_pipe::PipeWriter; +use serde::{Deserialize, Serialize}; use tokio::{io, net::UnixStream}; +use polkadot_node_core_pvf_common::worker::stringify_panic_payload; +use polkadot_node_core_pvf_common::worker::thread::{spawn_worker_thread, WaitOutcome}; /// Contains the bytes for a successfully compiled artifact. +#[derive(Serialize, Deserialize)] pub struct CompiledArtifact(Vec); impl CompiledArtifact { @@ -101,17 +110,21 @@ async fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Re /// /// 1. Get the code and parameters for preparation from the host. /// -/// 2. Start a memory tracker in a separate thread. +/// 2. Start a new child process /// -/// 3. Start the CPU time monitor loop and the actual preparation in two separate threads. +/// 3. Start a memory tracker in a separate thread. /// -/// 4. Wait on the two threads created in step 3. +/// 3. Start the actual preparation in a separate thread. +/// +/// 4. Wait on the thread created in step 3. /// /// 5. Stop the memory tracker and get the stats. /// -/// 6. If compilation succeeded, write the compiled artifact into a temporary file. +/// 6. Pipe the result back to the parent process and exit from child process. +/// +/// 7. If compilation succeeded, write the compiled artifact into a temporary file. /// -/// 7. Send the result of preparation back to the host. If any error occurred in the above steps, we +/// 8. Send the result of preparation back to the host. If any error occurred in the above steps, we /// send that in the `PrepareResult`. 
pub fn worker_entrypoint( socket_path: &str, @@ -138,58 +151,30 @@ pub fn worker_entrypoint( let prepare_job_kind = pvf.prep_kind(); let executor_params = (*pvf.executor_params()).clone(); - let mut rusage_before = unsafe { mem::zeroed() }; - let mut rusage_after = unsafe { mem::zeroed() }; - if unsafe { libc::getrusage(libc::RUSAGE_CHILDREN, &mut rusage_before) } == -1 { - send_response( - &mut stream, - Err(PrepareError::Panic(format!( - "error getting children resource usage for worker pid {}", - worker_pid - ))), - ) - .await?; - }; - let mut pipe_fds = [0; 2]; - if unsafe { libc::pipe(pipe_fds.as_mut_ptr()) } == -1 { - send_response( - &mut stream, - Err(PrepareError::Panic(format!("error creating pipe {}", worker_pid))), - ) - .await?; - } - - let pipe_read = pipe_fds[0]; - let pipe_write = pipe_fds[1]; + let (pipe_reader, pipe_writer) = os_pipe::pipe()?; + + // SAFETY: new process is spawned within a single threaded process let result = match unsafe { libc::fork() } { // error -1 => Err(PrepareError::Panic(String::from("error forking"))), // child - 0 => unsafe { - handle_child_process( + 0 => handle_child_process( pvf, - pipe_write, - pipe_read, + pipe_writer, preparation_timeout, prepare_job_kind, executor_params, ) - .await; - Err(PrepareError::Panic(String::from("unreachable"))) - }, + .await, // parent - _ => unsafe { - handle_parent_process( - &mut rusage_after, - rusage_before, - pipe_read, - pipe_write, + _ => handle_parent_process( + pipe_reader, + pipe_writer, temp_artifact_dest, preparation_timeout, worker_pid, ) .await - }, }; send_response(&mut stream, result).await?; } @@ -224,133 +209,143 @@ fn runtime_construction_check( .map_err(|err| PrepareError::RuntimeConstruction(format!("{:?}", err))) } +#[derive(Serialize, Deserialize)] struct Response { artifact_result: Result, landlock_status: Result, memory_stats: MemoryStats, } -async unsafe fn handle_child_process( +async fn handle_child_process( pvf: PvfPrepData, - pipe_write: c_int, - 
pipe_read: c_int, + mut pipe_write: os_pipe::PipeWriter, preparation_timeout: Duration, prepare_job_kind: PrepareJobKind, executor_params: ExecutorParams, -) { - if libc::close(pipe_read) == -1 { - exit(libc::EXIT_FAILURE) - } +) -> ! { let worker_pid = std::process::id(); + + nix::sys::resource::setrlimit( + Resource::RLIMIT_CPU, + preparation_timeout.as_secs(), + preparation_timeout.as_secs() + ).unwrap_or_else(|e| { + process::exit(libc::EXIT_FAILURE) + }); + + + let condvar = thread::get_condvar(); + // Run the memory tracker in a regular, non-worker thread. #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let condvar_memory = thread::get_condvar(); + let condvar_memory = Arc::clone(&condvar); #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory)); - if libc::setrlimit( - libc::RLIMIT_CPU, - &mut libc::rlimit { - rlim_cur: preparation_timeout.as_secs(), - rlim_max: preparation_timeout.as_secs(), + let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory)); + + let prepare_thread = spawn_worker_thread( + "prepare worker", + move || { + // Try to enable landlock. + #[cfg(target_os = "linux")] + let landlock_status = + polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread() + .map(LandlockStatus::from_ruleset_status) + .map_err(|e| e.to_string()); + #[cfg(not(target_os = "linux"))] + let landlock_status: Result = Ok(LandlockStatus::NotEnforced); + + #[allow(unused_mut)] + let mut artifact_result = prepare_artifact(pvf); + + // If we are pre-checking, check for runtime construction errors. + // + // As pre-checking is more strict than just preparation in terms of memory + // and time, it is okay to do extra checks here. This takes negligible time + // anyway. 
+ if let PrepareJobKind::Prechecking = prepare_job_kind { + artifact_result = artifact_result.and_then(|output| { + runtime_construction_check(output.as_ref(), executor_params)?; + Ok(output) + }); + } + (artifact_result, landlock_status) }, - ) == -1 - { - exit(libc::EXIT_FAILURE); - }; - - // Try to enable landlock. - #[cfg(target_os = "linux")] - let landlock_status = - polkadot_node_core_pvf_common::worker::security::landlock::try_restrict_thread() - .map(LandlockStatus::from_ruleset_status) - .map_err(|e| e.to_string()); - #[cfg(not(target_os = "linux"))] - let landlock_status: Result = Ok(LandlockStatus::NotEnforced); - - #[allow(unused_mut)] - let mut artifact_result = prepare_artifact(pvf); - - // If we are pre-checking, check for runtime construction errors. - // - // As pre-checking is more strict than just preparation in terms of memory - // and time, it is okay to do extra checks here. This takes negligible time - // anyway. - if let PrepareJobKind::Prechecking = prepare_job_kind { - artifact_result = artifact_result.and_then(|output| { - runtime_construction_check(output.as_ref(), executor_params)?; - Ok(output) - }); - } + Arc::clone(&condvar), + WaitOutcome::Finished + ).unwrap_or_else(|_| { + process::exit(libc::EXIT_FAILURE) + }); + + let outcome = thread::wait_for_threads(condvar); + + match outcome { + WaitOutcome::Finished => { + let (artifact_result, landlock_status) = prepare_thread.join().unwrap_or_else(|err| { + ( + Err(PrepareError::Panic(stringify_panic_payload(err))), + Ok(LandlockStatus::Unavailable), + ) + }); + + // Get the `ru_maxrss` stat. If supported, call getrusage for the thread. + #[cfg(target_os = "linux")] + let max_rss = get_max_rss_thread(); + + // Stop the memory stats worker and get its observed memory stats. 
+ #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid).await; + + let memory_stats = MemoryStats { + #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + memory_tracker_stats, + #[cfg(target_os = "linux")] + max_rss: extract_max_rss_stat(max_rss, worker_pid), + }; - // Get the `ru_maxrss` stat. If supported, call getrusage for the thread. - #[cfg(target_os = "linux")] - let max_rss = get_max_rss_thread(); + let response = Response { artifact_result, landlock_status, memory_stats }; - // // Stop the memory stats worker and get its observed memory stats. - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid).await; + let bytes = bincode::serialize(&response).unwrap_or_else(|_| { + process::exit(libc::EXIT_FAILURE) + }); - let memory_stats = MemoryStats { - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - memory_tracker_stats, - #[cfg(target_os = "linux")] - max_rss: extract_max_rss_stat(max_rss, worker_pid), - }; + pipe_write.write_all(bytes.as_slice()).unwrap_or_else(|_| { + process::exit(libc::EXIT_FAILURE) + }); - let response = Response { artifact_result, landlock_status, memory_stats }; + process::exit(libc::EXIT_SUCCESS); - if write(pipe_write, &response as *const _ as *const c_void, mem::size_of::()) == -1 { - exit(libc::EXIT_FAILURE); - } - - if libc::close(pipe_write) == -1 { - exit(libc::EXIT_FAILURE); + }, + _ => { + process::exit(libc::EXIT_FAILURE) + } } - - exit(libc::EXIT_SUCCESS); } -async unsafe fn handle_parent_process( - rusage_after: &mut rusage, - rusage_before: rusage, - pipe_read: c_int, - pipe_write: c_int, +async fn handle_parent_process( + mut pipe_read: os_pipe::PipeReader, + pipe_write: PipeWriter, temp_artifact_dest: PathBuf, preparation_timeout: Duration, worker_pid: u32, ) -> Result { - if 
libc::close(pipe_write) == -1 { - return Err(PrepareError::Panic(String::from( - "error closing pipe write end on the parent process", - ))) - } - - let mut status: c_int = 0; - if libc::wait(&mut status) == -1 { - return Err(PrepareError::Panic(String::from("error waiting for child"))) - } + drop(pipe_write); - if libc::getrusage(libc::RUSAGE_CHILDREN, rusage_after) == -1 { - return Err(PrepareError::Panic(String::from("error getting resource usage from child"))) - }; + return match nix::sys::wait::wait() { + Ok(nix::sys::wait::WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { + let data_size = mem::size_of::(); - // Get total cpu usage for current child - let cpu_tv = get_total_cpu_usage(*rusage_after) - get_total_cpu_usage(rusage_before); - let cpu_duration = Duration::from_micros(cpu_tv as u64); + let mut received_data = Vec::new(); - return match status { - libc::EXIT_SUCCESS => { - let mut result: Response = mem::zeroed(); - let data_size = mem::size_of::(); + pipe_read.read_to_end(&mut received_data).map_err(|_| { + PrepareError::Panic(format!("error reading pipe for worker id {}", worker_pid)) + })?; - if libc::read(pipe_read, &mut result as *mut _ as *mut c_void, data_size) == -1 { - return Err(PrepareError::Panic(String::from("error reading pipe from child"))) - }; + let result: Response = bincode::deserialize(received_data.as_slice()).map_err(|e| { + PrepareError::Panic(e.to_string()) + })?; - if libc::close(pipe_read) == -1 { - return Err(PrepareError::Panic(String::from("error reading pipe from child"))) - } + drop(pipe_read); match result.artifact_result { Err(err) => Err(err), @@ -383,31 +378,16 @@ async unsafe fn handle_parent_process( }; Ok(PrepareStats { - cpu_time_elapsed: cpu_duration, memory_stats: result.memory_stats, }) }, } }, - libc::SIGXCPU => { - // Log if we exceed the timeout and the other thread hasn't - // finished. 
- gum::warn!( - target: LOG_TARGET, - %worker_pid, - "prepare job took {}ms cpu time, exceeded prepare timeout {}ms", - cpu_duration.as_millis(), - preparation_timeout.as_millis(), - ); + Ok(nix::sys::wait::WaitStatus::Signaled(_, nix::sys::signal::Signal::SIGXCPU, _)) => { Err(PrepareError::TimedOut) }, - status => Err(PrepareError::Panic(format!("child failed with status {}", status))), + _ => { + Err(PrepareError::Panic(format!("child failed"))) + }, } -} - -fn get_total_cpu_usage(rusage: libc::rusage) -> i64 { - return rusage.ru_utime.tv_sec * 1000000 + - (rusage.ru_utime.tv_usec as i64) + - rusage.ru_stime.tv_sec * 1000000 + - (rusage.ru_stime.tv_usec as i64) -} +} \ No newline at end of file diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 5280ab6b42a2..218879f321d9 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -170,26 +170,13 @@ async fn handle_response( artifact_path: PathBuf, preparation_timeout: Duration, ) -> Outcome { - let PrepareStats { cpu_time_elapsed, memory_stats } = match result.clone() { + let PrepareStats { memory_stats } = match result.clone() { Ok(result) => result, // Timed out on the child. This should already be logged by the child. Err(PrepareError::TimedOut) => return Outcome::TimedOut, Err(_) => return Outcome::Concluded { worker, result }, }; - if cpu_time_elapsed > preparation_timeout { - // The job didn't complete within the timeout. - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "prepare job took {}ms cpu time, exceeded preparation timeout {}ms. 
Clearing WIP artifact {}", - cpu_time_elapsed.as_millis(), - preparation_timeout.as_millis(), - tmp_file.display(), - ); - return Outcome::TimedOut - } - gum::debug!( target: LOG_TARGET, %worker_pid, From d0a261a04727f24b59a61b5091bf4d8e98658c44 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Wed, 4 Oct 2023 06:02:47 -0300 Subject: [PATCH 06/47] prepare worker: use scale codec for encode and decode and fix docstring --- Cargo.lock | 4 +- polkadot/node/core/pvf/common/Cargo.toml | 1 - polkadot/node/core/pvf/common/src/error.rs | 3 +- polkadot/node/core/pvf/common/src/prepare.rs | 3 +- .../node/core/pvf/prepare-worker/Cargo.toml | 3 +- .../node/core/pvf/prepare-worker/src/lib.rs | 48 +++++++------------ 6 files changed, 20 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66031d24c341..08db9c014f25 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12023,7 +12023,6 @@ dependencies = [ "sc-executor", "sc-executor-common", "sc-executor-wasmtime", - "serde", "sp-core", "sp-externalities", "sp-io", @@ -12055,7 +12054,7 @@ dependencies = [ name = "polkadot-node-core-pvf-prepare-worker" version = "1.0.0" dependencies = [ - "bincode", + "bytes", "cfg-if", "futures", "libc", @@ -12069,7 +12068,6 @@ dependencies = [ "sc-executor", "sc-executor-common", "sc-executor-wasmtime", - "serde", "sp-io", "sp-maybe-compressed-blob", "sp-tracing", diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 97423322beca..0f7308396d80 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -27,7 +27,6 @@ sp-core = { path = "../../../../../substrate/primitives/core" } sp-externalities = { path = "../../../../../substrate/primitives/externalities" } sp-io = { path = "../../../../../substrate/primitives/io" } sp-tracing = { path = "../../../../../substrate/primitives/tracing" } -serde = { version = "1.0.188", features = ["derive"] } [target.'cfg(target_os = 
"linux")'.dependencies] landlock = "0.2.0" diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index b6686047a7dc..6fdd06057c8b 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -17,14 +17,13 @@ use crate::prepare::PrepareStats; use parity_scale_codec::{Decode, Encode}; use std::fmt; -use serde::{Deserialize, Serialize}; /// Result of PVF preparation performed by the validation host. Contains stats about the preparation /// if successful pub type PrepareResult = Result; /// An error that occurred during the prepare part of the PVF pipeline. -#[derive(Debug, Clone, Encode, Decode, Deserialize, Serialize)] +#[derive(Debug, Clone, Encode, Decode)] pub enum PrepareError { /// During the prevalidation stage of preparation an issue was found with the PVF. Prevalidation(String), diff --git a/polkadot/node/core/pvf/common/src/prepare.rs b/polkadot/node/core/pvf/common/src/prepare.rs index 4f412ddd2c9e..e2ac011c3184 100644 --- a/polkadot/node/core/pvf/common/src/prepare.rs +++ b/polkadot/node/core/pvf/common/src/prepare.rs @@ -15,7 +15,6 @@ // along with Polkadot. If not, see . use parity_scale_codec::{Decode, Encode}; -use serde::{Deserialize, Serialize}; /// Preparation statistics, including the CPU time and memory taken. #[derive(Debug, Clone, Default, Encode, Decode)] @@ -26,7 +25,7 @@ pub struct PrepareStats { /// Helper struct to contain all the memory stats, including `MemoryAllocationStats` and, if /// supported by the OS, `ru_maxrss`. -#[derive(Clone, Debug, Default, Encode, Decode, Serialize, Deserialize)] +#[derive(Clone, Debug, Default, Encode, Decode)] pub struct MemoryStats { /// Memory stats from `tikv_jemalloc_ctl`. 
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] diff --git a/polkadot/node/core/pvf/prepare-worker/Cargo.toml b/polkadot/node/core/pvf/prepare-worker/Cargo.toml index dd9428c46029..ca61915cf6ee 100644 --- a/polkadot/node/core/pvf/prepare-worker/Cargo.toml +++ b/polkadot/node/core/pvf/prepare-worker/Cargo.toml @@ -16,8 +16,7 @@ tikv-jemalloc-ctl = { version = "0.5.0", optional = true } tokio = { version = "1.24.2", features = ["fs", "process"] } os_pipe = "1.1.4" nix = { version = "0.27.1", features = ["resource", "process"]} -serde = { version = "1.0.188", features = ["derive"]} -bincode = "1.3.3" +bytes = { version = "1.1.0", default-features = false } parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 905ff3e57133..b022b0cb01d0 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -37,18 +37,16 @@ use polkadot_node_core_pvf_common::{error::{PrepareError, PrepareResult}, execut worker_event_loop, }, worker_dir}; use polkadot_primitives::ExecutorParams; -use std::{mem, path::PathBuf, process, time::Duration}; +use std::{path::PathBuf, process, time::Duration}; use std::io::{Read, Write}; use std::os::unix::net::UnixStream; use std::sync::Arc; use nix::sys::resource::Resource; -use os_pipe::PipeWriter; -use serde::{Deserialize, Serialize}; use polkadot_node_core_pvf_common::worker::thread::{spawn_worker_thread}; use tokio::io; /// Contains the bytes for a successfully compiled artifact. -#[derive(Serialize, Deserialize)] +#[derive(Encode, Decode)] pub struct CompiledArtifact(Vec); impl CompiledArtifact { @@ -101,11 +99,9 @@ fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Result<( /// /// 2. Start a new child process /// -/// 3. Start a memory tracker in a separate thread. +/// 3. 
Start the memory tracker and the actual preparation in two separate threads. /// -/// 3. Start the actual preparation in a separate thread. -/// -/// 4. Wait on the thread created in step 3. +/// 4. Wait on the two threads created in step 3. /// /// 5. Stop the memory tracker and get the stats. /// @@ -159,14 +155,17 @@ pub fn worker_entrypoint( ) .await, // parent - _ => handle_parent_process( + _ => { + // the read end will wait until all ends have been closed, + // this drop is necessary to avoid deadlock + drop(pipe_writer); + handle_parent_process( pipe_reader, - pipe_writer, temp_artifact_dest.clone(), - preparation_timeout, worker_pid, ) - .await + .await + } }; send_response(&mut stream, result)?; } @@ -201,7 +200,7 @@ fn runtime_construction_check( .map_err(|err| PrepareError::RuntimeConstruction(format!("{:?}", err))) } -#[derive(Serialize, Deserialize)] +#[derive(Encode, Decode)] struct Response { artifact_result: Result, memory_stats: MemoryStats, @@ -215,8 +214,6 @@ async fn handle_child_process( executor_params: ExecutorParams, ) -> ! { - let worker_pid = std::process::id(); - nix::sys::resource::setrlimit( Resource::RLIMIT_CPU, preparation_timeout.as_secs(), @@ -270,9 +267,7 @@ async fn handle_child_process( match outcome { WaitOutcome::Finished => { let result = prepare_thread.join().unwrap_or_else(|_| { - ( process::exit(libc::EXIT_FAILURE) - ) }); cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { @@ -284,22 +279,18 @@ async fn handle_child_process( // Stop the memory stats worker and get its observed memory stats. 
#[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid).await; + let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, process::id()).await; let memory_stats = MemoryStats { #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] memory_tracker_stats, #[cfg(target_os = "linux")] - max_rss: extract_max_rss_stat(max_rss, worker_pid), + max_rss: extract_max_rss_stat(max_rss, process::id()), }; let response = Response { artifact_result, memory_stats }; - let bytes = bincode::serialize(&response).unwrap_or_else(|_| { - process::exit(libc::EXIT_FAILURE) - }); - - pipe_write.write_all(bytes.as_slice()).unwrap_or_else(|_| { + pipe_write.write_all(response.encode().as_slice()).unwrap_or_else(|_| { process::exit(libc::EXIT_FAILURE) }); @@ -314,29 +305,22 @@ async fn handle_child_process( async fn handle_parent_process( mut pipe_read: os_pipe::PipeReader, - pipe_write: PipeWriter, temp_artifact_dest: PathBuf, - preparation_timeout: Duration, worker_pid: u32, ) -> Result { - drop(pipe_write); return match nix::sys::wait::wait() { Ok(nix::sys::wait::WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { - let data_size = mem::size_of::(); - let mut received_data = Vec::new(); pipe_read.read_to_end(&mut received_data).map_err(|_| { PrepareError::Panic(format!("error reading pipe for worker id {}", worker_pid)) })?; - let result: Response = bincode::deserialize(received_data.as_slice()).map_err(|e| { + let result: Response = parity_scale_codec::decode_from_bytes(bytes::Bytes::copy_from_slice(received_data.as_slice())).map_err(|e| { PrepareError::Panic(e.to_string()) })?; - drop(pipe_read); - match result.artifact_result { Err(err) => Err(err), Ok(artifact) => { From 5f100d97a345f95743e1f46c407f2c7f7bfde604 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Thu, 5 Oct 2023 05:41:18 -0300 Subject: [PATCH 07/47] prepare worker: fix artifact result 
map --- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index b022b0cb01d0..8cf70c7d440b 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -241,7 +241,7 @@ async fn handle_child_process( // Get the `ru_maxrss` stat. If supported, call getrusage for the thread. #[cfg(target_os = "linux")] let mut result = result - .map(|(artifact, elapsed)| (artifact, elapsed, get_max_rss_thread())); + .map(|artifact| (artifact, elapsed, get_max_rss_thread())); // If we are pre-checking, check for runtime construction errors. // From e64ff83ba33ad9e3887935d7af4614bdab53309a Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Thu, 5 Oct 2023 06:14:33 -0300 Subject: [PATCH 08/47] prepare worker: remove elapsed time --- .../node/core/pvf/prepare-worker/src/lib.rs | 118 +++++++++--------- 1 file changed, 56 insertions(+), 62 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 8cf70c7d440b..0a879fd83067 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -30,19 +30,29 @@ const LOG_TARGET: &str = "parachain::pvf-prepare-worker"; use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread}; #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop}; +use nix::sys::resource::Resource; use parity_scale_codec::{Decode, Encode}; -use polkadot_node_core_pvf_common::{error::{PrepareError, PrepareResult}, executor_intf::Executor, framed_recv_blocking, framed_send_blocking, prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, SecurityStatus, worker::{ - thread::{self, 
WaitOutcome}, - WorkerKind, - worker_event_loop, -}, worker_dir}; +use polkadot_node_core_pvf_common::{ + error::{PrepareError, PrepareResult}, + executor_intf::Executor, + framed_recv_blocking, framed_send_blocking, + prepare::{MemoryStats, PrepareJobKind, PrepareStats}, + pvf::PvfPrepData, + worker::{ + thread::{self, spawn_worker_thread, WaitOutcome}, + worker_event_loop, WorkerKind, + }, + worker_dir, SecurityStatus, +}; use polkadot_primitives::ExecutorParams; -use std::{path::PathBuf, process, time::Duration}; -use std::io::{Read, Write}; -use std::os::unix::net::UnixStream; -use std::sync::Arc; -use nix::sys::resource::Resource; -use polkadot_node_core_pvf_common::worker::thread::{spawn_worker_thread}; +use std::{ + io::{Read, Write}, + os::unix::net::UnixStream, + path::PathBuf, + process, + sync::Arc, + time::Duration, +}; use tokio::io; /// Contains the bytes for a successfully compiled artifact. @@ -146,7 +156,8 @@ pub fn worker_entrypoint( // error -1 => Err(PrepareError::Panic(String::from("error forking"))), // child - 0 => handle_child_process( + 0 => + handle_child_process( pvf, pipe_writer, preparation_timeout, @@ -159,13 +170,9 @@ pub fn worker_entrypoint( // the read end will wait until all ends have been closed, // this drop is necessary to avoid deadlock drop(pipe_writer); - handle_parent_process( - pipe_reader, - temp_artifact_dest.clone(), - worker_pid, - ) + handle_parent_process(pipe_reader, temp_artifact_dest.clone(), worker_pid) .await - } + }, }; send_response(&mut stream, result)?; } @@ -213,35 +220,31 @@ async fn handle_child_process( prepare_job_kind: PrepareJobKind, executor_params: ExecutorParams, ) -> ! 
{ - nix::sys::resource::setrlimit( Resource::RLIMIT_CPU, preparation_timeout.as_secs(), - preparation_timeout.as_secs() - ).unwrap_or_else(|_| { - process::exit(libc::EXIT_FAILURE) - }); + preparation_timeout.as_secs(), + ) + .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - // Conditional variable to notify us when a thread is done. + // Conditional variable to notify us when a thread is done. let condvar = thread::get_condvar(); // Run the memory tracker in a regular, non-worker thread. #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let condvar_memory = Arc::clone(&condvar); + let condvar_memory = Arc::clone(&condvar); #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory)); + let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory)); let prepare_thread = spawn_worker_thread( "prepare worker", move || { - #[allow(unused_mut)] - let mut result = prepare_artifact(pvf); + let mut result = prepare_artifact(pvf); - // Get the `ru_maxrss` stat. If supported, call getrusage for the thread. - #[cfg(target_os = "linux")] - let mut result = result - .map(|artifact| (artifact, elapsed, get_max_rss_thread())); + // Get the `ru_maxrss` stat. If supported, call getrusage for the thread. + #[cfg(target_os = "linux")] + let mut result = result.map(|artifact| (artifact, get_max_rss_thread())); // If we are pre-checking, check for runtime construction errors. // @@ -257,19 +260,17 @@ async fn handle_child_process( result }, Arc::clone(&condvar), - WaitOutcome::Finished - ).unwrap_or_else(|_| { - process::exit(libc::EXIT_FAILURE) - }); + WaitOutcome::Finished, + ) + .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); let outcome = thread::wait_for_threads(condvar); match outcome { WaitOutcome::Finished => { - let result = prepare_thread.join().unwrap_or_else(|_| { - process::exit(libc::EXIT_FAILURE) - }); - cfg_if::cfg_if! 
{ + let result = + prepare_thread.join().unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { let (artifact_result, max_rss) = result; } else { @@ -279,7 +280,8 @@ async fn handle_child_process( // Stop the memory stats worker and get its observed memory stats. #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, process::id()).await; + let memory_tracker_stats = + get_memory_tracker_loop_stats(memory_tracker_thread, process::id()).await; let memory_stats = MemoryStats { #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] @@ -290,16 +292,13 @@ async fn handle_child_process( let response = Response { artifact_result, memory_stats }; - pipe_write.write_all(response.encode().as_slice()).unwrap_or_else(|_| { - process::exit(libc::EXIT_FAILURE) - }); + pipe_write + .write_all(response.encode().as_slice()) + .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); process::exit(libc::EXIT_SUCCESS); - }, - _ => { - process::exit(libc::EXIT_FAILURE) - } + _ => process::exit(libc::EXIT_FAILURE), } } @@ -308,7 +307,6 @@ async fn handle_parent_process( temp_artifact_dest: PathBuf, worker_pid: u32, ) -> Result { - return match nix::sys::wait::wait() { Ok(nix::sys::wait::WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { let mut received_data = Vec::new(); @@ -317,9 +315,10 @@ async fn handle_parent_process( PrepareError::Panic(format!("error reading pipe for worker id {}", worker_pid)) })?; - let result: Response = parity_scale_codec::decode_from_bytes(bytes::Bytes::copy_from_slice(received_data.as_slice())).map_err(|e| { - PrepareError::Panic(e.to_string()) - })?; + let result: Response = parity_scale_codec::decode_from_bytes( + bytes::Bytes::copy_from_slice(received_data.as_slice()), + ) + .map_err(|e| PrepareError::Panic(e.to_string()))?; match result.artifact_result { Err(err) => Err(err), @@ -341,17 +340,12 @@ async 
fn handle_parent_process( return Err(PrepareError::Panic(format!("{:?}", err))) }; - Ok(PrepareStats { - memory_stats: result.memory_stats, - }) + Ok(PrepareStats { memory_stats: result.memory_stats }) }, } }, - Ok(nix::sys::wait::WaitStatus::Signaled(_, nix::sys::signal::Signal::SIGXCPU, _)) => { - Err(PrepareError::TimedOut) - }, - _ => { - Err(PrepareError::Panic(format!("child failed"))) - }, + Ok(nix::sys::wait::WaitStatus::Signaled(_, nix::sys::signal::Signal::SIGXCPU, _)) => + Err(PrepareError::TimedOut), + _ => Err(PrepareError::Panic(format!("child failed"))), } -} \ No newline at end of file +} From 462d73cb6e64608546b4ea1cea1327e903b7ad4a Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Thu, 5 Oct 2023 07:02:28 -0300 Subject: [PATCH 09/47] prepare worker: fix result hadling when target os is linux --- .../node/core/pvf/prepare-worker/src/lib.rs | 72 +++++++++++-------- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 0a879fd83067..7960528227b6 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -209,7 +209,7 @@ fn runtime_construction_check( #[derive(Encode, Decode)] struct Response { - artifact_result: Result, + artifact: CompiledArtifact, memory_stats: MemoryStats, } @@ -253,7 +253,7 @@ async fn handle_child_process( // anyway. if let PrepareJobKind::Prechecking = prepare_job_kind { result = result.and_then(|output| { - runtime_construction_check(output.as_ref(), executor_params)?; + runtime_construction_check(output.0.as_ref(), executor_params)?; Ok(output) }); } @@ -270,31 +270,41 @@ async fn handle_child_process( WaitOutcome::Finished => { let result = prepare_thread.join().unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - cfg_if::cfg_if! 
{ - if #[cfg(target_os = "linux")] { - let (artifact_result, max_rss) = result; - } else { - let artifact_result = result; - } - } - - // Stop the memory stats worker and get its observed memory stats. - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = - get_memory_tracker_loop_stats(memory_tracker_thread, process::id()).await; - - let memory_stats = MemoryStats { - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - memory_tracker_stats, - #[cfg(target_os = "linux")] - max_rss: extract_max_rss_stat(max_rss, process::id()), - }; + match result { + Ok(ok) => { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + let (artifact, max_rss) = ok; + } else { + let artifact = ok; + } + } + + // Stop the memory stats worker and get its observed memory stats. + #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + let memory_tracker_stats = + get_memory_tracker_loop_stats(memory_tracker_thread, process::id()).await; + + let memory_stats = MemoryStats { + #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + memory_tracker_stats, + #[cfg(target_os = "linux")] + max_rss: extract_max_rss_stat(max_rss, process::id()), + }; - let response = Response { artifact_result, memory_stats }; + let response: Result = + Ok(Response { artifact, memory_stats }); - pipe_write - .write_all(response.encode().as_slice()) - .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + pipe_write + .write_all(response.encode().as_slice()) + .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + }, + Err(err) => { + pipe_write + .write_all(Err::(err).encode().as_slice()) + .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + }, + } process::exit(libc::EXIT_SUCCESS); }, @@ -315,14 +325,14 @@ async fn handle_parent_process( PrepareError::Panic(format!("error reading pipe for worker id {}", worker_pid)) })?; - let result: Response = parity_scale_codec::decode_from_bytes( + let result: Result = 
parity_scale_codec::decode_from_bytes( bytes::Bytes::copy_from_slice(received_data.as_slice()), ) .map_err(|e| PrepareError::Panic(e.to_string()))?; - match result.artifact_result { + match result { Err(err) => Err(err), - Ok(artifact) => { + Ok(response) => { // Write the serialized artifact into a temp file. // // PVF host only keeps artifacts statuses in its memory, @@ -336,11 +346,13 @@ async fn handle_parent_process( "worker: writing artifact to {}", temp_artifact_dest.display(), ); - if let Err(err) = tokio::fs::write(&temp_artifact_dest, &artifact).await { + if let Err(err) = + tokio::fs::write(&temp_artifact_dest, &response.artifact).await + { return Err(PrepareError::Panic(format!("{:?}", err))) }; - Ok(PrepareStats { memory_stats: result.memory_stats }) + Ok(PrepareStats { memory_stats: response.memory_stats }) }, } }, From 7ef719dd6f5272ad68fd21f78a31631cb6609045 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Thu, 5 Oct 2023 07:12:57 -0300 Subject: [PATCH 10/47] prepare worker: remove useless use of format! 
--- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 7960528227b6..bb16ea5f60f3 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -358,6 +358,6 @@ async fn handle_parent_process( }, Ok(nix::sys::wait::WaitStatus::Signaled(_, nix::sys::signal::Signal::SIGXCPU, _)) => Err(PrepareError::TimedOut), - _ => Err(PrepareError::Panic(format!("child failed"))), + _ => Err(PrepareError::Panic("child failed".to_string())), } } From b0fa175ee875df4707dd7c0b4fc4194f81e9fa49 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Thu, 5 Oct 2023 07:35:23 -0300 Subject: [PATCH 11/47] prepare worker: remove unused preparation_timeout --- polkadot/node/core/pvf/src/prepare/worker_intf.rs | 2 -- 1 file changed, 2 deletions(-) diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 1b15b80c8c6d..7e0125ce4362 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -158,7 +158,6 @@ pub async fn start_work( pid, tmp_artifact_file, artifact_path, - preparation_timeout, ) .await, Ok(Err(err)) => { @@ -197,7 +196,6 @@ async fn handle_response( worker_pid: u32, tmp_file: PathBuf, artifact_path: PathBuf, - preparation_timeout: Duration, ) -> Outcome { let PrepareStats { memory_stats } = match result.clone() { Ok(result) => result, From 220f06959f8b476e3bb0d57aa930415d9f8d079c Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Mon, 9 Oct 2023 07:01:01 -0300 Subject: [PATCH 12/47] prepare worker: move pipe read before child wait --- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs 
b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index bb16ea5f60f3..5292221b60aa 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -317,14 +317,13 @@ async fn handle_parent_process( temp_artifact_dest: PathBuf, worker_pid: u32, ) -> Result { + let mut received_data = Vec::new(); + + pipe_read.read_to_end(&mut received_data).map_err(|_| { + PrepareError::Panic(format!("error reading pipe for worker id {}", worker_pid)) + })?; return match nix::sys::wait::wait() { Ok(nix::sys::wait::WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { - let mut received_data = Vec::new(); - - pipe_read.read_to_end(&mut received_data).map_err(|_| { - PrepareError::Panic(format!("error reading pipe for worker id {}", worker_pid)) - })?; - let result: Result = parity_scale_codec::decode_from_bytes( bytes::Bytes::copy_from_slice(received_data.as_slice()), ) From 96de6567ef38355e80735eed596b082eb62ab9ec Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Thu, 12 Oct 2023 07:16:52 -0300 Subject: [PATCH 13/47] prepare worker: use getrusage for timeout --- polkadot/node/core/pvf/common/src/prepare.rs | 2 + .../node/core/pvf/prepare-worker/src/lib.rs | 59 ++++++++++++++----- .../node/core/pvf/src/prepare/worker_intf.rs | 17 +++++- 3 files changed, 61 insertions(+), 17 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/prepare.rs b/polkadot/node/core/pvf/common/src/prepare.rs index e2ac011c3184..c205eddfb8b1 100644 --- a/polkadot/node/core/pvf/common/src/prepare.rs +++ b/polkadot/node/core/pvf/common/src/prepare.rs @@ -19,6 +19,8 @@ use parity_scale_codec::{Decode, Encode}; /// Preparation statistics, including the CPU time and memory taken. #[derive(Debug, Clone, Default, Encode, Decode)] pub struct PrepareStats { + /// The CPU time that elapsed for the preparation job. + pub cpu_time_elapsed: std::time::Duration, /// The observed memory statistics for the preparation job. 
pub memory_stats: MemoryStats, } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 5292221b60aa..2801a66d69ab 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -30,7 +30,7 @@ const LOG_TARGET: &str = "parachain::pvf-prepare-worker"; use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread}; #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop}; -use nix::sys::resource::Resource; +use nix::sys::resource::{Resource, Usage, UsageWho}; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, @@ -151,6 +151,8 @@ pub fn worker_entrypoint( let (pipe_reader, pipe_writer) = os_pipe::pipe()?; + let usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)?; + // SAFETY: new process is spawned within a single threaded process let result = match unsafe { libc::fork() } { // error @@ -170,8 +172,14 @@ pub fn worker_entrypoint( // the read end will wait until all ends have been closed, // this drop is necessary to avoid deadlock drop(pipe_writer); - handle_parent_process(pipe_reader, temp_artifact_dest.clone(), worker_pid) - .await + handle_parent_process( + pipe_reader, + temp_artifact_dest.clone(), + worker_pid, + usage_before, + preparation_timeout.as_secs(), + ) + .await }, }; send_response(&mut stream, result)?; @@ -316,19 +324,25 @@ async fn handle_parent_process( mut pipe_read: os_pipe::PipeReader, temp_artifact_dest: PathBuf, worker_pid: u32, + usage_before: Usage, + timeout: u64, ) -> Result { let mut received_data = Vec::new(); - pipe_read.read_to_end(&mut received_data).map_err(|_| { - PrepareError::Panic(format!("error reading pipe for worker id {}", worker_pid)) - })?; - return match nix::sys::wait::wait() { + pipe_read + 
.read_to_end(&mut received_data) + .map_err(|err| PrepareError::Panic(err.to_string()))?; + let status = nix::sys::wait::wait(); + let usage_after = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) + .map_err(|err| PrepareError::Panic(err.to_string()))?; + let cpu_tv = (get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before)) as u64; + + return match status { Ok(nix::sys::wait::WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { let result: Result = parity_scale_codec::decode_from_bytes( bytes::Bytes::copy_from_slice(received_data.as_slice()), ) .map_err(|e| PrepareError::Panic(e.to_string()))?; - match result { Err(err) => Err(err), Ok(response) => { @@ -340,10 +354,10 @@ async fn handle_parent_process( // is only required to send `Ok` to the pool to indicate the // success. gum::debug!( - target: LOG_TARGET, - %worker_pid, - "worker: writing artifact to {}", - temp_artifact_dest.display(), + target: LOG_TARGET, + %worker_pid, + "worker: writing artifact to {}", + temp_artifact_dest.display(), ); if let Err(err) = tokio::fs::write(&temp_artifact_dest, &response.artifact).await @@ -351,12 +365,25 @@ async fn handle_parent_process( return Err(PrepareError::Panic(format!("{:?}", err))) }; - Ok(PrepareStats { memory_stats: response.memory_stats }) + Ok(PrepareStats { + memory_stats: response.memory_stats, + cpu_time_elapsed: Duration::from_secs(cpu_tv as u64), + }) }, } }, - Ok(nix::sys::wait::WaitStatus::Signaled(_, nix::sys::signal::Signal::SIGXCPU, _)) => - Err(PrepareError::TimedOut), - _ => Err(PrepareError::Panic("child failed".to_string())), + _ => { + if cpu_tv >= timeout { + return Err(PrepareError::TimedOut) + } + Err(PrepareError::Panic("child finished with unknown status".to_string())) + }, } } + +fn get_total_cpu_usage(rusage: Usage) -> u64 { + return (rusage.user_time().tv_sec() + + rusage.system_time().tv_sec() + + ((rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) / 1_000_000) as i64) + as u64 +} diff --git 
a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 7e0125ce4362..b66c36044343 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -158,6 +158,7 @@ pub async fn start_work( pid, tmp_artifact_file, artifact_path, + preparation_timeout, ) .await, Ok(Err(err)) => { @@ -196,14 +197,28 @@ async fn handle_response( worker_pid: u32, tmp_file: PathBuf, artifact_path: PathBuf, + preparation_timeout: Duration, ) -> Outcome { - let PrepareStats { memory_stats } = match result.clone() { + let PrepareStats { cpu_time_elapsed, memory_stats } = match result.clone() { Ok(result) => result, // Timed out on the child. This should already be logged by the child. Err(PrepareError::TimedOut) => return Outcome::TimedOut, Err(_) => return Outcome::Concluded { worker, result }, }; + if cpu_time_elapsed > preparation_timeout { + // The job didn't complete within the timeout. + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "prepare job took {}ms cpu time, exceeded preparation timeout {}ms. Clearing WIP artifact {}", + cpu_time_elapsed.as_millis(), + preparation_timeout.as_millis(), + tmp_file.display(), + ); + return Outcome::TimedOut + } + gum::debug!( target: LOG_TARGET, %worker_pid, From fdb3462e6e904dad9f7ff8cfb8de4766b9fa1aef Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Sat, 21 Oct 2023 12:48:49 +0200 Subject: [PATCH 14/47] Do some cleanup / addressing of issues --- Cargo.lock | 1 - .../node/core/pvf/prepare-worker/Cargo.toml | 1 - .../node/core/pvf/prepare-worker/src/lib.rs | 90 +++++++++---------- .../node/core/pvf/src/execute/worker_intf.rs | 2 +- 4 files changed, 41 insertions(+), 53 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 126918ea5c7c..f5cac7eee314 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12173,7 +12173,6 @@ dependencies = [ name = "polkadot-node-core-pvf-prepare-worker" version = "1.0.0" dependencies = [ - "bytes", "cfg-if", "futures", "libc", diff --git a/polkadot/node/core/pvf/prepare-worker/Cargo.toml b/polkadot/node/core/pvf/prepare-worker/Cargo.toml index ca61915cf6ee..edbc7337cd6b 100644 --- a/polkadot/node/core/pvf/prepare-worker/Cargo.toml +++ b/polkadot/node/core/pvf/prepare-worker/Cargo.toml @@ -16,7 +16,6 @@ tikv-jemalloc-ctl = { version = "0.5.0", optional = true } tokio = { version = "1.24.2", features = ["fs", "process"] } os_pipe = "1.1.4" nix = { version = "0.27.1", features = ["resource", "process"]} -bytes = { version = "1.1.0", default-features = false } parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index eda09e517ea0..e5f7af1ca08d 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -229,6 +229,7 @@ async fn handle_child_process( prepare_job_kind: PrepareJobKind, executor_params: Arc, ) -> ! { + // Set a hard CPU time limit for the child process. 
nix::sys::resource::setrlimit( Resource::RLIMIT_CPU, preparation_timeout.as_secs(), @@ -273,52 +274,42 @@ async fn handle_child_process( ) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - let outcome = thread::wait_for_threads(condvar); - - match outcome { - WaitOutcome::Finished => { - let result = - prepare_thread.join().unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - match result { - Ok(ok) => { - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - let (artifact, max_rss) = ok; - } else { - let artifact = ok; - } - } - - // Stop the memory stats worker and get its observed memory stats. - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = - get_memory_tracker_loop_stats(memory_tracker_thread, process::id()).await; - - let memory_stats = MemoryStats { - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - memory_tracker_stats, - #[cfg(target_os = "linux")] - max_rss: extract_max_rss_stat(max_rss, process::id()), - }; + // There's only one thread that can trigger the condvar, so ignore the condvar outcome and + // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. + let result = prepare_thread.join().unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + + let response: Result = match result { + Ok(ok) => { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + let (artifact, max_rss) = ok; + } else { + let artifact = ok; + } + } - let response: Result = - Ok(Response { artifact, memory_stats }); + // Stop the memory stats worker and get its observed memory stats. 
+ #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + let memory_tracker_stats = + get_memory_tracker_loop_stats(memory_tracker_thread, process::id()).await; - pipe_write - .write_all(response.encode().as_slice()) - .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - }, - Err(err) => { - pipe_write - .write_all(Err::(err).encode().as_slice()) - .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - }, - } + let memory_stats = MemoryStats { + #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + memory_tracker_stats, + #[cfg(target_os = "linux")] + max_rss: extract_max_rss_stat(max_rss, process::id()), + }; - process::exit(libc::EXIT_SUCCESS); + Ok(Response { artifact, memory_stats }) }, - _ => process::exit(libc::EXIT_FAILURE), - } + Err(err) => Err(err), + }; + + pipe_write + .write_all(response.encode().as_slice()) + .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + + process::exit(libc::EXIT_SUCCESS); } async fn handle_parent_process( @@ -340,10 +331,9 @@ async fn handle_parent_process( return match status { Ok(nix::sys::wait::WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { - let result: Result = parity_scale_codec::decode_from_bytes( - bytes::Bytes::copy_from_slice(received_data.as_slice()), - ) - .map_err(|e| PrepareError::Panic(e.to_string()))?; + let result: Result = + Result::decode(&mut received_data.as_slice()) + .map_err(|e| PrepareError::Panic(e.to_string()))?; match result { Err(err) => Err(err), Ok(response) => { @@ -355,10 +345,10 @@ async fn handle_parent_process( // is only required to send `Ok` to the pool to indicate the // success. 
gum::debug!( - target: LOG_TARGET, - %worker_pid, - "worker: writing artifact to {}", - temp_artifact_dest.display(), + target: LOG_TARGET, + %worker_pid, + "worker: writing artifact to {}", + temp_artifact_dest.display(), ); if let Err(err) = tokio::fs::write(&temp_artifact_dest, &response.artifact).await diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 783c7c7abbc8..d0c0e9985813 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -275,7 +275,7 @@ async fn send_request( async fn recv_response(stream: &mut UnixStream) -> io::Result { let response_bytes = framed_recv(stream).await?; - Response::decode(&mut &response_bytes[..]).map_err(|e| { + Response::decode(&mut response_bytes.as_slice()).map_err(|e| { io::Error::new( io::ErrorKind::Other, format!("execute pvf recv_response: decode error: {:?}", e), From f4bea3a89c5eb79f2781c8bbebbfe00f99cee340 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Sun, 22 Oct 2023 09:36:17 -0300 Subject: [PATCH 15/47] prepare worker: add docstring and pipe error from child process to parent --- Cargo.lock | 20 +-- .../node/core/pvf/prepare-worker/src/lib.rs | 115 +++++++++++++++--- 2 files changed, 106 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index f5cac7eee314..bb145e2a0ff9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5485,7 +5485,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" dependencies = [ - "rustix 0.38.8", + "rustix 0.38.20", "windows-sys 0.48.0", ] @@ -6438,7 +6438,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", - "rustix 0.38.8", + "rustix 0.38.20", "windows-sys 0.48.0", ] @@ -6891,9 +6891,9 @@ checksum 
= "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "libflate" @@ -7477,9 +7477,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "linux-raw-sys" -version = "0.4.5" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "lioness" @@ -14261,14 +14261,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.8" +version = "0.38.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f" +checksum = "67ce50cb2e16c2903e30d1cbccfd8387a74b9d4c938b6a4c5ec6cc7556f7a8a0" dependencies = [ "bitflags 2.4.0", "errno", "libc", - "linux-raw-sys 0.4.5", + "linux-raw-sys 0.4.10", "windows-sys 0.48.0", ] @@ -18210,7 +18210,7 @@ dependencies = [ "cfg-if", "fastrand 2.0.0", "redox_syscall 0.3.5", - "rustix 0.38.8", + "rustix 0.38.20", "windows-sys 0.48.0", ] diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index e5f7af1ca08d..bbe0c4ae3227 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -53,6 +53,7 @@ use std::{ sync::Arc, time::Duration, }; +use os_pipe::PipeWriter; use tokio::io; /// Contains the bytes for a successfully compiled artifact. @@ -222,6 +223,27 @@ struct Response { memory_stats: MemoryStats, } +/// This is used to handle child process during pvf prepare worker. 
+/// It prepare the artifact and track memory stats during preparation +/// and pipes back the response to the parent process +/// +/// # Arguments +/// +/// - `pvf`: `PvfPrepData` structure, containing data to prepare the artifact +/// +/// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. +/// +/// - `preparation_timeout`: The timeout in `Duration`. +/// +/// - `prepare_job_kind`: The kind of prepare job. +/// +/// - `executor_params`: Deterministically serialized execution environment semantics. +/// +/// # Returns +/// +/// - If any error occur, pipe response back with `PrepareError`. +/// +/// - If success, pipe back `Response`. async fn handle_child_process( pvf: PvfPrepData, mut pipe_write: os_pipe::PipeWriter, @@ -234,8 +256,7 @@ async fn handle_child_process( Resource::RLIMIT_CPU, preparation_timeout.as_secs(), preparation_timeout.as_secs(), - ) - .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + ).unwrap_or_else(|err|send_child_response(pipe_write, Err(PrepareError::Panic(err.to_string())))); // Conditional variable to notify us when a thread is done. let condvar = thread::get_condvar(); @@ -272,11 +293,14 @@ async fn handle_child_process( Arc::clone(&condvar), WaitOutcome::Finished, ) - .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + .unwrap_or_else(|err| + send_child_response(pipe_write, Err(PrepareError::Panic(err.to_string())))); // There's only one thread that can trigger the condvar, so ignore the condvar outcome and // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. 
- let result = prepare_thread.join().unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + let result = prepare_thread.join() + .unwrap_or_else(|err| + send_child_response(pipe_write, Err(PrepareError::Panic(err.to_string())))); let response: Result = match result { Ok(ok) => { @@ -305,13 +329,34 @@ async fn handle_child_process( Err(err) => Err(err), }; - pipe_write - .write_all(response.encode().as_slice()) - .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - - process::exit(libc::EXIT_SUCCESS); + send_child_response(pipe_write, response); } + + +/// Waits for child process to finish and handle child response from pipe. +/// +/// # Arguments +/// +/// - `pipe_read`: A `PipeReader` used to read data from the child process. +/// +/// - `temp_artifact_dest`: The destination `PathBuf` to write the temporary artifact file. +/// +/// - `worker_pid`: The PID of the child process. +/// +/// - `usage_before`: Resource usage statistics before executing the child process. +/// +/// - `timeout`: The maximum allowed time for the child process to finish, in milliseconds. +/// +/// # Returns +/// +/// - If the child send response without an error, this function returns `Ok(PrepareStats)` containing memory and CPU usage statistics. +/// +/// - If the child send response with an error, it returns a `PrepareError`. +/// +/// - If the child process timeout, it returns `PrepareError::TimedOut`. +/// +/// - If the child process exits with an unknown status, it returns `PrepareError`. async fn handle_parent_process( mut pipe_read: os_pipe::PipeReader, temp_artifact_dest: PathBuf, @@ -327,10 +372,18 @@ async fn handle_parent_process( let status = nix::sys::wait::wait(); let usage_after = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) .map_err(|err| PrepareError::Panic(err.to_string()))?; - let cpu_tv = (get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before)) as u64; + + // Using `getrusage` is needed to check whether `setrlimit` was triggered. 
+ // As `getrusage` returns resource usage from all terminated child processes, + // it is necessary to subtract the usage before the current child process to isolate its cpu time + let cpu_tv = (get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before)); + + if cpu_tv >= timeout { + return Err(PrepareError::TimedOut) + } return match status { - Ok(nix::sys::wait::WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { + Ok(_) => { let result: Result = Result::decode(&mut received_data.as_slice()) .map_err(|e| PrepareError::Panic(e.to_string()))?; @@ -364,17 +417,41 @@ async fn handle_parent_process( } }, _ => { - if cpu_tv >= timeout { - return Err(PrepareError::TimedOut) - } Err(PrepareError::Panic("child finished with unknown status".to_string())) }, } } -fn get_total_cpu_usage(rusage: Usage) -> u64 { - return (rusage.user_time().tv_sec() + - rusage.system_time().tv_sec() + - ((rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) / 1_000_000) as i64) - as u64 +/// Calculate the total CPU time from the given `nix::sys::Usage` structure, returned from `nix::sys::resource::getrusage`, +/// and calculates the total CPU time spent, including both user and system time. +/// +/// # Arguments +/// +/// - `rusage`: A `nix::sys::Usage` structure, contains resource usage information. +/// +/// # Returns +/// +/// Returns a `Duration` representing the total CPU time. +fn get_total_cpu_usage(rusage: Usage) -> Duration { + let millis = (((rusage.user_time().tv_sec() + + rusage.system_time().tv_sec()) * 1_000_000) + + (rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) + as u64; + + return Duration::from_millis(millis) +} + +/// Write response to the pipe and exit process after. +/// +/// # Arguments +/// +/// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. +/// +/// - `response`: Child process response +fn send_child_response(mut pipe_write: PipeWriter, response: Result) -> ! 
{ + pipe_write + .write_all(response.encode().as_slice()) + .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); + + process::exit(libc::EXIT_SUCCESS) } From 6615598a9cd20d4fef68d0588edbc6bae3ec87d1 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Wed, 25 Oct 2023 05:59:54 -0300 Subject: [PATCH 16/47] prepare worker: closes stream in child process --- .../node/core/pvf/prepare-worker/src/lib.rs | 79 ++++++++++--------- 1 file changed, 40 insertions(+), 39 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index bbe0c4ae3227..53d55387b5fb 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -30,7 +30,9 @@ const LOG_TARGET: &str = "parachain::pvf-prepare-worker"; use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread}; #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop}; +use libc::printf; use nix::sys::resource::{Resource, Usage, UsageWho}; +use os_pipe::{self, PipeWriter}; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, @@ -39,21 +41,15 @@ use polkadot_node_core_pvf_common::{ prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ + stringify_panic_payload, thread::{self, spawn_worker_thread, WaitOutcome}, worker_event_loop, WorkerKind, }, worker_dir, SecurityStatus, }; use polkadot_primitives::ExecutorParams; -use std::{ - io::{Read, Write}, - os::unix::net::UnixStream, - path::PathBuf, - process, - sync::Arc, - time::Duration, -}; -use os_pipe::PipeWriter; +use std::{fs, io::{Read, Write}, os::unix::net::UnixStream, path::PathBuf, process, sync::Arc, time::Duration}; +use std::fmt::format; use tokio::io; /// Contains the bytes for a successfully compiled artifact. 
@@ -139,7 +135,7 @@ pub fn worker_entrypoint( worker_version, &security_status, |mut stream, worker_dir_path| async move { - let worker_pid = std::process::id(); + let worker_pid = process::id(); let temp_artifact_dest = worker_dir::prepare_tmp_artifact(&worker_dir_path); loop { @@ -163,7 +159,8 @@ pub fn worker_entrypoint( // error -1 => Err(PrepareError::Panic(String::from("error forking"))), // child - 0 => + 0 => { + drop(stream); handle_child_process( pvf, pipe_writer, @@ -171,7 +168,8 @@ pub fn worker_entrypoint( prepare_job_kind, executor_params, ) - .await, + .await + }, // parent _ => { // the read end will wait until all ends have been closed, @@ -246,7 +244,7 @@ struct Response { /// - If success, pipe back `Response`. async fn handle_child_process( pvf: PvfPrepData, - mut pipe_write: os_pipe::PipeWriter, + pipe_write: os_pipe::PipeWriter, preparation_timeout: Duration, prepare_job_kind: PrepareJobKind, executor_params: Arc, @@ -256,7 +254,10 @@ async fn handle_child_process( Resource::RLIMIT_CPU, preparation_timeout.as_secs(), preparation_timeout.as_secs(), - ).unwrap_or_else(|err|send_child_response(pipe_write, Err(PrepareError::Panic(err.to_string())))); + ) + .unwrap_or_else(|err| { + send_child_response(&pipe_write, Err(PrepareError::Panic(err.to_string()))) + }); // Conditional variable to notify us when a thread is done. let condvar = thread::get_condvar(); @@ -293,14 +294,15 @@ async fn handle_child_process( Arc::clone(&condvar), WaitOutcome::Finished, ) - .unwrap_or_else(|err| - send_child_response(pipe_write, Err(PrepareError::Panic(err.to_string())))); + .unwrap_or_else(|err| { + send_child_response(&pipe_write, Err(PrepareError::Panic(err.to_string()))) + }); // There's only one thread that can trigger the condvar, so ignore the condvar outcome and // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. 
- let result = prepare_thread.join() - .unwrap_or_else(|err| - send_child_response(pipe_write, Err(PrepareError::Panic(err.to_string())))); + let result = prepare_thread.join().unwrap_or_else(|err| { + send_child_response(&pipe_write, Err(PrepareError::Panic(stringify_panic_payload(err)))) + }); let response: Result = match result { Ok(ok) => { @@ -329,11 +331,9 @@ async fn handle_child_process( Err(err) => Err(err), }; - send_child_response(pipe_write, response); + send_child_response(&pipe_write, response); } - - /// Waits for child process to finish and handle child response from pipe. /// /// # Arguments @@ -350,7 +350,8 @@ async fn handle_child_process( /// /// # Returns /// -/// - If the child send response without an error, this function returns `Ok(PrepareStats)` containing memory and CPU usage statistics. +/// - If the child send response without an error, this function returns `Ok(PrepareStats)` +/// containing memory and CPU usage statistics. /// /// - If the child send response with an error, it returns a `PrepareError`. /// @@ -375,10 +376,13 @@ async fn handle_parent_process( // Using `getrusage` is needed to check whether `setrlimit` was triggered. 
// As `getrusage` returns resource usage from all terminated child processes, - // it is necessary to subtract the usage before the current child process to isolate its cpu time - let cpu_tv = (get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before)); + // it is necessary to subtract the usage before the current child process to isolate its cpu + // time + let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - if cpu_tv >= timeout { + let mut f = fs::File::create("/Users/joaopedrosantos/parity/polkadot-sdk/polkadot/node/core/pvf/tests/it/log.txt").unwrap(); + f.write_all(format!("cpu_tv {}, timeout {}", cpu_tv.as_secs(), timeout).as_bytes()).unwrap(); + if cpu_tv.as_secs() >= timeout { return Err(PrepareError::TimedOut) } @@ -411,34 +415,31 @@ async fn handle_parent_process( Ok(PrepareStats { memory_stats: response.memory_stats, - cpu_time_elapsed: Duration::from_secs(cpu_tv as u64), + cpu_time_elapsed: cpu_tv, }) }, } }, - _ => { - Err(PrepareError::Panic("child finished with unknown status".to_string())) - }, + _ => Err(PrepareError::Panic("child finished with unknown status".to_string())), } } -/// Calculate the total CPU time from the given `nix::sys::Usage` structure, returned from `nix::sys::resource::getrusage`, -/// and calculates the total CPU time spent, including both user and system time. +/// Calculate the total CPU time from the given `usage` structure, returned from +/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user +/// and system time. /// /// # Arguments /// -/// - `rusage`: A `nix::sys::Usage` structure, contains resource usage information. +/// - `rusage`: Contains resource usage information. /// /// # Returns /// /// Returns a `Duration` representing the total CPU time. 
fn get_total_cpu_usage(rusage: Usage) -> Duration { - let millis = (((rusage.user_time().tv_sec() + - rusage.system_time().tv_sec()) * 1_000_000) + - (rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) - as u64; + let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) + + (rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64; - return Duration::from_millis(millis) + return Duration::from_micros(micros) } /// Write response to the pipe and exit process after. @@ -448,7 +449,7 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { /// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. /// /// - `response`: Child process response -fn send_child_response(mut pipe_write: PipeWriter, response: Result) -> ! { +fn send_child_response(mut pipe_write: &PipeWriter, response: Result) -> ! { pipe_write .write_all(response.encode().as_slice()) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); From 2196c5f4f0d1e6827211012207c1e2d3e94aab61 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Wed, 25 Oct 2023 06:43:32 -0300 Subject: [PATCH 17/47] prepare worker: fix prepare job --- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 53d55387b5fb..49bfd6ab6866 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -30,7 +30,6 @@ const LOG_TARGET: &str = "parachain::pvf-prepare-worker"; use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread}; #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop}; -use libc::printf; use nix::sys::resource::{Resource, Usage, UsageWho}; use os_pipe::{self, 
PipeWriter}; use parity_scale_codec::{Decode, Encode}; @@ -48,8 +47,15 @@ use polkadot_node_core_pvf_common::{ worker_dir, SecurityStatus, }; use polkadot_primitives::ExecutorParams; -use std::{fs, io::{Read, Write}, os::unix::net::UnixStream, path::PathBuf, process, sync::Arc, time::Duration}; -use std::fmt::format; +use std::{ + fs, + io::{Read, Write}, + os::unix::net::UnixStream, + path::PathBuf, + process, + sync::Arc, + time::Duration, +}; use tokio::io; /// Contains the bytes for a successfully compiled artifact. @@ -285,7 +291,7 @@ async fn handle_child_process( // anyway. if let PrepareJobKind::Prechecking = prepare_job_kind { result = result.and_then(|output| { - runtime_construction_check(&output.0, &executor_params)?; + runtime_construction_check(output.as_ref(), &executor_params)?; Ok(output) }); } @@ -380,8 +386,6 @@ async fn handle_parent_process( // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - let mut f = fs::File::create("/Users/joaopedrosantos/parity/polkadot-sdk/polkadot/node/core/pvf/tests/it/log.txt").unwrap(); - f.write_all(format!("cpu_tv {}, timeout {}", cpu_tv.as_secs(), timeout).as_bytes()).unwrap(); if cpu_tv.as_secs() >= timeout { return Err(PrepareError::TimedOut) } From 1682ed091d09f06a9f040be963ccd9391c928570 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Wed, 25 Oct 2023 06:58:53 -0300 Subject: [PATCH 18/47] prepare worker: handle prepare job when target_os is linux --- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 49bfd6ab6866..f927e37afbd0 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -48,7 +48,6 @@ use polkadot_node_core_pvf_common::{ }; use polkadot_primitives::ExecutorParams; use std::{ - fs, io::{Read, Write}, 
os::unix::net::UnixStream, path::PathBuf, @@ -291,7 +290,10 @@ async fn handle_child_process( // anyway. if let PrepareJobKind::Prechecking = prepare_job_kind { result = result.and_then(|output| { - runtime_construction_check(output.as_ref(), &executor_params)?; + #[cfg(target_os = "linux")] + runtime_construction_check(output.0.as_ref(), &executor_params)?; + #[cfg(not(target_os = "linux"))] + runtime_construction_check(output.as_ref(), executor_params.as_ref())?; Ok(output) }); } From 962fccb4f463a7bd2751ee5c8f5adf24674fd80e Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 1 Nov 2023 16:48:16 +0100 Subject: [PATCH 19/47] Some minor updates --- .../node/core/pvf/prepare-worker/src/lib.rs | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index e4c1f12a87ba..9ddd88a4d906 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -160,12 +160,17 @@ pub fn worker_entrypoint( let usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)?; // SAFETY: new process is spawned within a single threaded process - let result = match unsafe { libc::fork() } { + let result = match unsafe { nix::unistd::fork() } { // error -1 => Err(PrepareError::Panic(String::from("error forking"))), // child 0 => { + // Dropping the stream closes the underlying socket. We want to make sure + // that the sandboxed child can't get any kind of information from the + // outside world. The only IPC it should be able to do is sending its + // response over the pipe. 
drop(stream); + handle_child_process( pvf, pipe_writer, @@ -176,9 +181,10 @@ pub fn worker_entrypoint( }, // parent _ => { - // the read end will wait until all ends have been closed, + // the read end will wait until all write ends have been closed, // this drop is necessary to avoid deadlock drop(pipe_writer); + handle_parent_process( pipe_reader, temp_artifact_dest.clone(), @@ -288,10 +294,7 @@ fn handle_child_process( // anyway. if let PrepareJobKind::Prechecking = prepare_job_kind { result = result.and_then(|output| { - #[cfg(target_os = "linux")] runtime_construction_check(output.0.as_ref(), &executor_params)?; - #[cfg(not(target_os = "linux"))] - runtime_construction_check(output.as_ref(), executor_params.as_ref())?; Ok(output) }); } @@ -358,10 +361,11 @@ fn handle_child_process( /// - If the child send response without an error, this function returns `Ok(PrepareStats)` /// containing memory and CPU usage statistics. /// -/// - If the child send response with an error, it returns a `PrepareError`. +/// - If the child send response with an error, it returns a `PrepareError` with that error. /// /// - If the child process timeout, it returns `PrepareError::TimedOut`. /// +/// TODO /// - If the child process exits with an unknown status, it returns `PrepareError`. fn handle_parent_process( mut pipe_read: os_pipe::PipeReader, @@ -372,6 +376,7 @@ fn handle_parent_process( ) -> Result { let mut received_data = Vec::new(); + // Read from the child. pipe_read .read_to_end(&mut received_data) .map_err(|err| PrepareError::Panic(err.to_string()))?; From 7baf706774f8ea822e04f2db68b559f058cc1d20 Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Wed, 1 Nov 2023 17:00:46 +0100 Subject: [PATCH 20/47] Clean up `fork` usage a bit --- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 9ddd88a4d906..74b205829bc7 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -20,7 +20,6 @@ mod executor_intf; mod memory_stats; pub use executor_intf::{prepare, prevalidate}; -use libc; // NOTE: Initializing logging in e.g. tests will not have an effect in the workers, as they are // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-prepare-worker=trace`. @@ -30,7 +29,11 @@ const LOG_TARGET: &str = "parachain::pvf-prepare-worker"; use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread}; #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop}; -use nix::sys::resource::{Resource, Usage, UsageWho}; +use libc; +use nix::{ + sys::resource::{Resource, Usage, UsageWho}, + unistd::ForkResult, +}; use os_pipe::{self, PipeWriter}; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ @@ -161,10 +164,8 @@ pub fn worker_entrypoint( // SAFETY: new process is spawned within a single threaded process let result = match unsafe { nix::unistd::fork() } { - // error - -1 => Err(PrepareError::Panic(String::from("error forking"))), - // child - 0 => { + Err(_errno) => Err(PrepareError::Panic(String::from("error forking"))), + Ok(ForkResult::Child) => { // Dropping the stream closes the underlying socket. We want to make sure // that the sandboxed child can't get any kind of information from the // outside world. 
The only IPC it should be able to do is sending its @@ -180,7 +181,7 @@ pub fn worker_entrypoint( ) }, // parent - _ => { + Ok(ForkResult::Parent { child: _child }) => { // the read end will wait until all write ends have been closed, // this drop is necessary to avoid deadlock drop(pipe_writer); From 7687015b839a53bebd95a0caecfedac50a436b54 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 1 Nov 2023 17:07:36 +0100 Subject: [PATCH 21/47] Update prepare-worker-syscalls list --- polkadot/scripts/list-syscalls/prepare-worker-syscalls | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/polkadot/scripts/list-syscalls/prepare-worker-syscalls b/polkadot/scripts/list-syscalls/prepare-worker-syscalls index cab58e06692b..1f8c8527bcc1 100644 --- a/polkadot/scripts/list-syscalls/prepare-worker-syscalls +++ b/polkadot/scripts/list-syscalls/prepare-worker-syscalls @@ -16,6 +16,7 @@ 16 (ioctl) 19 (readv) 20 (writev) +22 (pipe) 24 (sched_yield) 25 (mremap) 28 (madvise) @@ -25,7 +26,9 @@ 45 (recvfrom) 46 (sendmsg) 56 (clone) +57 (fork) 60 (exit) +61 (wait4) 62 (kill) 72 (fcntl) 79 (getcwd) @@ -46,8 +49,10 @@ 144 (sched_setscheduler) 157 (prctl) 158 (arch_prctl) +160 (setrlimit) 165 (mount) 166 (umount2) +186 (gettid) 200 (tkill) 202 (futex) 203 (sched_setaffinity) @@ -62,6 +67,7 @@ 263 (unlinkat) 272 (unshare) 273 (set_robust_list) +293 (pipe2) 302 (prlimit64) 309 (getcpu) 318 (getrandom) From 0d5eb734d701fb98682d4a38164812093061a6b2 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Thu, 2 Nov 2023 15:10:14 +0100 Subject: [PATCH 22/47] Add test when forked process dies (e.g. 
OOM, seccomp violation) --- .../node/core/pvf/prepare-worker/src/lib.rs | 8 +- polkadot/node/core/pvf/src/testing.rs | 18 +++-- polkadot/node/core/pvf/tests/it/main.rs | 76 +++++++++++++++---- .../node/core/pvf/tests/it/worker_common.rs | 8 +- 4 files changed, 85 insertions(+), 25 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 74b205829bc7..9a9f17319bcf 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -232,7 +232,7 @@ struct Response { } /// This is used to handle child process during pvf prepare worker. -/// It prepare the artifact and track memory stats during preparation +/// It prepares the artifact and tracks memory stats during preparation /// and pipes back the response to the parent process /// /// # Arguments @@ -259,6 +259,12 @@ fn handle_child_process( prepare_job_kind: PrepareJobKind, executor_params: Arc, ) -> ! { + gum::debug!( + target: LOG_TARGET, + worker_job_pid = %std::process::id(), + "worker job: preparing artifact", + ); + // Set a hard CPU time limit for the child process. nix::sys::resource::setrlimit( Resource::RLIMIT_CPU, diff --git a/polkadot/node/core/pvf/src/testing.rs b/polkadot/node/core/pvf/src/testing.rs index 169b55d34b56..24767ef9d037 100644 --- a/polkadot/node/core/pvf/src/testing.rs +++ b/polkadot/node/core/pvf/src/testing.rs @@ -59,7 +59,7 @@ pub fn validate_candidate( /// /// NOTE: This should only be called in dev code (tests, benchmarks) as it relies on the relative /// paths of the built workers. -pub fn get_and_check_worker_paths() -> (PathBuf, PathBuf) { +pub fn build_workers_and_get_paths() -> (PathBuf, PathBuf) { // Only needs to be called once for the current process. 
static WORKER_PATHS: OnceLock> = OnceLock::new(); @@ -70,11 +70,15 @@ pub fn get_and_check_worker_paths() -> (PathBuf, PathBuf) { "--bin=polkadot-prepare-worker", "--bin=polkadot-execute-worker", ]; - let exit_status = std::process::Command::new("cargo") + let mut cargo = std::process::Command::new("cargo"); + let cmd = cargo // wasm runtime not needed .env("SKIP_WASM_BUILD", "1") .args(build_args) - .stdout(std::process::Stdio::piped()) + .stdout(std::process::Stdio::piped()); + + println!("INFO: calling `{cmd:?}`"); + let exit_status = cmd .status() .expect("Failed to run the build program"); @@ -95,19 +99,19 @@ pub fn get_and_check_worker_paths() -> (PathBuf, PathBuf) { // explain why a build happens if !prepare_worker_path.is_executable() { - eprintln!("Prepare worker does not exist or is not executable. Workers directory: {:?}", workers_path); + eprintln!("WARN: Prepare worker does not exist or is not executable. Workers directory: {:?}", workers_path); } if !execute_worker_path.is_executable() { - eprintln!("Execute worker does not exist or is not executable. Workers directory: {:?}", workers_path); + eprintln!("WARN: Execute worker does not exist or is not executable. 
Workers directory: {:?}", workers_path); } if let Ok(ver) = get_worker_version(&prepare_worker_path) { if ver != NODE_VERSION { - eprintln!("Prepare worker version {ver} does not match node version {NODE_VERSION}; worker path: {prepare_worker_path:?}"); + eprintln!("WARN: Prepare worker version {ver} does not match node version {NODE_VERSION}; worker path: {prepare_worker_path:?}"); } } if let Ok(ver) = get_worker_version(&execute_worker_path) { if ver != NODE_VERSION { - eprintln!("Execute worker version {ver} does not match node version {NODE_VERSION}; worker path: {execute_worker_path:?}"); + eprintln!("WARN: Execute worker version {ver} does not match node version {NODE_VERSION}; worker path: {execute_worker_path:?}"); } } diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index a69a488adb98..d4cb3d2c0a27 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -17,7 +17,7 @@ use assert_matches::assert_matches; use parity_scale_codec::Encode as _; use polkadot_node_core_pvf::{ - start, testing::get_and_check_worker_paths, Config, InvalidCandidate, Metrics, PrepareError, + start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError, PrepareJobKind, PrepareStats, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, }; @@ -52,7 +52,7 @@ impl TestHost { where F: FnOnce(&mut Config), { - let (prepare_worker_path, execute_worker_path) = get_and_check_worker_paths(); + let (prepare_worker_path, execute_worker_path) = build_workers_and_get_paths(); let cache_dir = tempfile::tempdir().unwrap(); let mut config = Config::new( @@ -155,7 +155,12 @@ async fn terminates_on_timeout() { } #[cfg(target_os = "linux")] -fn kill_by_sid_and_name(sid: i32, exe_name: &'static str) { +const PREPARE_PROCESS_NAME: &'static str = "polkadot-prepare-worker"; +#[cfg(target_os = "linux")] +const EXECUTE_PROCESS_NAME: &'static str = 
"polkadot-execute-worker"; + +#[cfg(target_os = "linux")] +fn kill_by_sid_and_name(sid: i32, exe_name: &'static str, is_direct_child: bool) { use procfs::process; let all_processes: Vec = process::all_processes() @@ -172,13 +177,29 @@ fn kill_by_sid_and_name(sid: i32, exe_name: &'static str) { }) .collect(); + let mut found = 0; for process in all_processes { - if process.stat().unwrap().session == sid && - process.exe().unwrap().to_str().unwrap().contains(exe_name) - { - assert_eq!(unsafe { libc::kill(process.pid(), 9) }, 0); + let stat = process.stat().unwrap(); + + if stat.session != sid || !process.exe().unwrap().to_str().unwrap().contains(exe_name) { + continue + } + // The workers are direct children of the current process, the worker job processes are not + // (they are children of the workers). + if is_direct_child { + if stat.ppid as u32 != std::process::id() { + continue + } + } else { + if stat.ppid as u32 == std::process::id() { + continue + } } + + assert_eq!(unsafe { libc::kill(process.pid(), 9) }, 0); + found += 1; } + assert_eq!(found, 1); } // Run these tests in their own processes with rusty-fork. They work by each creating a new session, @@ -188,8 +209,6 @@ rusty_fork_test! { // What happens when the prepare worker dies in the middle of a job? #[test] fn prepare_worker_killed_during_job() { - const PROCESS_NAME: &'static str = "polkadot-prepare-worker"; - let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(async { let host = TestHost::new().await; @@ -204,7 +223,7 @@ rusty_fork_test! { // Run a future that kills the job in the middle of the timeout. async { tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await; - kill_by_sid_and_name(sid, PROCESS_NAME); + kill_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true); } ); @@ -215,8 +234,6 @@ rusty_fork_test! { // What happens when the execute worker dies in the middle of a job? 
#[test] fn execute_worker_killed_during_job() { - const PROCESS_NAME: &'static str = "polkadot-execute-worker"; - let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(async { let host = TestHost::new().await; @@ -244,7 +261,7 @@ rusty_fork_test! { // Run a future that kills the job in the middle of the timeout. async { tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await; - kill_by_sid_and_name(sid, PROCESS_NAME); + kill_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true); } ); @@ -254,6 +271,39 @@ rusty_fork_test! { ); }) } + + // What happens when the forked prepare job dies in the middle of its job? + #[test] + fn forked_prepare_job_killed_during_job() { + polkadot_node_core_pvf_common::sp_tracing::try_init_simple(); + + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), + // Run a future that kills the job in the middle of the timeout. + async { + tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await; + kill_by_sid_and_name(sid, PREPARE_PROCESS_NAME, false); + } + ); + + assert_matches!(result, Err(PrepareError::IoErr(_))); + }) + } + + // What happens when the forked execute job dies in the middle of its job? + #[test] + fn forked_execute_job_killed_during_job() { + todo!() + } } #[cfg(feature = "ci-only-tests")] diff --git a/polkadot/node/core/pvf/tests/it/worker_common.rs b/polkadot/node/core/pvf/tests/it/worker_common.rs index df64980dc806..4b736b08ba60 100644 --- a/polkadot/node/core/pvf/tests/it/worker_common.rs +++ b/polkadot/node/core/pvf/tests/it/worker_common.rs @@ -15,7 +15,7 @@ // along with Polkadot. If not, see . 
use polkadot_node_core_pvf::{ - testing::{get_and_check_worker_paths, spawn_with_program_path, SpawnErr}, + testing::{build_workers_and_get_paths, spawn_with_program_path, SpawnErr}, SecurityStatus, }; use std::{env, time::Duration}; @@ -23,7 +23,7 @@ use std::{env, time::Duration}; // Test spawning a program that immediately exits with a failure code. #[tokio::test] async fn spawn_immediate_exit() { - let (prepare_worker_path, _) = get_and_check_worker_paths(); + let (prepare_worker_path, _) = build_workers_and_get_paths(); // There's no explicit `exit` subcommand in the worker; it will panic on an unknown // subcommand anyway @@ -41,7 +41,7 @@ async fn spawn_immediate_exit() { #[tokio::test] async fn spawn_timeout() { - let (_, execute_worker_path) = get_and_check_worker_paths(); + let (_, execute_worker_path) = build_workers_and_get_paths(); let result = spawn_with_program_path( "integration-test", @@ -57,7 +57,7 @@ async fn spawn_timeout() { #[tokio::test] async fn should_connect() { - let (prepare_worker_path, _) = get_and_check_worker_paths(); + let (prepare_worker_path, _) = build_workers_and_get_paths(); let _ = spawn_with_program_path( "integration-test", From a932a7cf9f8efc2cb6a374de11a58ec5603b02c7 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Thu, 2 Nov 2023 17:47:35 +0100 Subject: [PATCH 23/47] Update for seccomp (add detection for job death); fix some errors - Changed error for pipe read failure (job died) - Updated seccomp detection (IoErr no longer means job died) - Updated test - Confirmed manually that a seccomp violation (with process killing) result in a JobDied error. 
- Updated some other errors (still more to do) --- polkadot/node/core/pvf/common/src/error.rs | 22 ++++++++---- .../node/core/pvf/prepare-worker/src/lib.rs | 35 ++++++++++++++----- polkadot/node/core/pvf/src/prepare/pool.rs | 23 +++++++++--- .../node/core/pvf/src/prepare/worker_intf.rs | 29 ++++++--------- polkadot/node/core/pvf/src/testing.rs | 14 ++++---- polkadot/node/core/pvf/tests/it/main.rs | 3 +- 6 files changed, 78 insertions(+), 48 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 6fdd06057c8b..4f3562454f42 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -38,13 +38,15 @@ pub enum PrepareError { /// An IO error occurred. This state is reported by either the validation host or by the /// worker. IoErr(String), + /// The preparation job process died, due to OOM, a seccomp violation, or some other factor. + JobDied, /// The temporary file for the artifact could not be created at the given cache path. This /// state is reported by the validation host (not by the worker). - CreateTmpFileErr(String), + CreateTmpFile(String), /// The response from the worker is received, but the file cannot be renamed (moved) to the /// final destination location. This state is reported by the validation host (not by the /// worker). - RenameTmpFileErr { + RenameTmpFile { err: String, // Unfortunately `PathBuf` doesn't implement `Encode`/`Decode`, so we do a fallible // conversion to `Option`. @@ -55,6 +57,8 @@ pub enum PrepareError { /// worker has to be killed to avoid jobs having access to data from other jobs. This state is /// reported by the validation host (not by the worker). ClearWorkerDir(String), + /// Some error occurred when interfacing with the kernel. 
+ Kernel(String), } impl PrepareError { @@ -70,9 +74,11 @@ impl PrepareError { Prevalidation(_) | Preparation(_) | Panic(_) => true, TimedOut | IoErr(_) | - CreateTmpFileErr(_) | - RenameTmpFileErr { .. } | - ClearWorkerDir(_) => false, + JobDied | + CreateTmpFile(_) | + RenameTmpFile { .. } | + ClearWorkerDir(_) | + Kernel(_) => false, // Can occur due to issues with the PVF, but also due to local errors. RuntimeConstruction(_) => false, } @@ -89,10 +95,12 @@ impl fmt::Display for PrepareError { Panic(err) => write!(f, "panic: {}", err), TimedOut => write!(f, "prepare: timeout"), IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err), - CreateTmpFileErr(err) => write!(f, "prepare: error creating tmp file: {}", err), - RenameTmpFileErr { err, src, dest } => + JobDied => write!(f, "prepare: prepare job died"), + CreateTmpFile(err) => write!(f, "prepare: error creating tmp file: {}", err), + RenameTmpFile { err, src, dest } => write!(f, "prepare: error renaming tmp file ({:?} -> {:?}): {}", src, dest, err), ClearWorkerDir(err) => write!(f, "prepare: error clearing worker cache: {}", err), + Kernel(err) => write!(f, "prepare: error interfacing with the kernel: {}", err), } } } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 9a9f17319bcf..64fb849ba5c6 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -31,8 +31,9 @@ use crate::memory_stats::max_rss_stat::{extract_max_rss_stat, get_max_rss_thread use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_tracker_loop}; use libc; use nix::{ + errno::Errno, sys::resource::{Resource, Usage, UsageWho}, - unistd::ForkResult, + unistd::{ForkResult, Pid}, }; use os_pipe::{self, PipeWriter}; use parity_scale_codec::{Decode, Encode}; @@ -160,11 +161,18 @@ pub fn worker_entrypoint( let (pipe_reader, pipe_writer) = os_pipe::pipe()?; - let 
usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)?; + let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { + Ok(usage) => usage, + Err(errno) => { + let result = Err(err_from_errno("getrusage before", errno)); + send_response(&mut stream, result)?; + continue + }, + }; // SAFETY: new process is spawned within a single threaded process let result = match unsafe { nix::unistd::fork() } { - Err(_errno) => Err(PrepareError::Panic(String::from("error forking"))), + Err(errno) => Err(err_from_errno("fork", errno)), Ok(ForkResult::Child) => { // Dropping the stream closes the underlying socket. We want to make sure // that the sandboxed child can't get any kind of information from the @@ -181,13 +189,14 @@ pub fn worker_entrypoint( ) }, // parent - Ok(ForkResult::Parent { child: _child }) => { + Ok(ForkResult::Parent { child }) => { // the read end will wait until all write ends have been closed, // this drop is necessary to avoid deadlock drop(pipe_writer); handle_parent_process( pipe_reader, + child, temp_artifact_dest.clone(), worker_pid, usage_before, @@ -376,6 +385,7 @@ fn handle_child_process( /// - If the child process exits with an unknown status, it returns `PrepareError`. fn handle_parent_process( mut pipe_read: os_pipe::PipeReader, + child: Pid, temp_artifact_dest: PathBuf, worker_pid: u32, usage_before: Usage, @@ -386,10 +396,11 @@ fn handle_parent_process( // Read from the child. pipe_read .read_to_end(&mut received_data) - .map_err(|err| PrepareError::Panic(err.to_string()))?; - let status = nix::sys::wait::wait(); + // Swallow the error, it's not really helpful as to why the child died. 
+ .map_err(|_errno| PrepareError::JobDied)?; + let status = nix::sys::wait::waitpid(child, None); let usage_after = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) - .map_err(|err| PrepareError::Panic(err.to_string()))?; + .map_err(|errno| err_from_errno("getrusage after", errno))?; // Using `getrusage` is needed to check whether `setrlimit` was triggered. // As `getrusage` returns resource usage from all terminated child processes, @@ -405,7 +416,8 @@ fn handle_parent_process( Ok(_) => { let result: Result = Result::decode(&mut received_data.as_slice()) - .map_err(|e| PrepareError::Panic(e.to_string()))?; + // This error happens when the job dies. + .map_err(|_err| PrepareError::JobDied)?; match result { Err(err) => Err(err), Ok(response) => { @@ -422,8 +434,9 @@ fn handle_parent_process( "worker: writing artifact to {}", temp_artifact_dest.display(), ); + // Write to the temp file created by the host. if let Err(err) = fs::write(&temp_artifact_dest, &response.artifact) { - return Err(PrepareError::Panic(format!("{:?}", err))) + return Err(PrepareError::IoErr(err.to_string())) }; Ok(PrepareStats { @@ -469,3 +482,7 @@ fn send_child_response(mut pipe_write: &PipeWriter, response: Result PrepareError { + PrepareError::Kernel(format!("{}: {}: {}", context, errno, io::Error::last_os_error())) +} diff --git a/polkadot/node/core/pvf/src/prepare/pool.rs b/polkadot/node/core/pvf/src/prepare/pool.rs index 7933b0319a6f..b0d4187d00ef 100644 --- a/polkadot/node/core/pvf/src/prepare/pool.rs +++ b/polkadot/node/core/pvf/src/prepare/pool.rs @@ -334,22 +334,22 @@ fn handle_mux( handle_concluded_no_rip(from_pool, spawned, worker, idle, result), // Return `Concluded`, but do not kill the worker since the error was on the host // side. 
- Outcome::CreateTmpFileErr { worker: idle, err } => handle_concluded_no_rip( + Outcome::CreateTmpFile { worker: idle, err } => handle_concluded_no_rip( from_pool, spawned, worker, idle, - Err(PrepareError::CreateTmpFileErr(err)), + Err(PrepareError::CreateTmpFile(err)), ), // Return `Concluded`, but do not kill the worker since the error was on the host // side. - Outcome::RenameTmpFileErr { worker: idle, result: _, err, src, dest } => + Outcome::RenameTmpFile { worker: idle, result: _, err, src, dest } => handle_concluded_no_rip( from_pool, spawned, worker, idle, - Err(PrepareError::RenameTmpFileErr { err, src, dest }), + Err(PrepareError::RenameTmpFile { err, src, dest }), ), // Could not clear worker cache. Kill the worker so other jobs can't see the data. Outcome::ClearWorkerDir { err } => { @@ -387,6 +387,21 @@ fn handle_mux( Ok(()) }, + // The worker might still be usable, but we kill it just in case. + Outcome::JobDied => { + if attempt_retire(metrics, spawned, worker) { + reply( + from_pool, + FromPool::Concluded { + worker, + rip: true, + result: Err(PrepareError::JobDied), + }, + )?; + } + + Ok(()) + }, Outcome::TimedOut => { if attempt_retire(metrics, spawned, worker) { reply( diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 1f6ab0b76556..0df618d6ee28 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -76,10 +76,10 @@ pub enum Outcome { /// killed by the system. Unreachable, /// The temporary file for the artifact could not be created at the given cache path. - CreateTmpFileErr { worker: IdleWorker, err: String }, + CreateTmpFile { worker: IdleWorker, err: String }, /// The response from the worker is received, but the tmp file cannot be renamed (moved) to the /// final destination location. 
- RenameTmpFileErr { + RenameTmpFile { worker: IdleWorker, result: PrepareResult, err: String, @@ -98,6 +98,10 @@ pub enum Outcome { /// /// This doesn't return an idle worker instance, thus this worker is no longer usable. IoErr(String), + /// The preparation job process died, due to OOM, a seccomp violation, or some other factor. + /// + /// The worker might still be usable, but we kill it just in case. + JobDied, } /// Given the idle token of a worker and parameters of work, communicates with the worker and @@ -185,21 +189,6 @@ pub async fn start_work( "failed to recv a prepare response: {:?}", err, ); - - // The worker died. Check if it was due to a seccomp violation. - // - // NOTE: Log, but don't change the outcome. Not all validators may have auditing - // enabled, so we don't want attackers to abuse a non-deterministic outcome. - for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { - gum::error!( - target: LOG_TARGET, - worker_pid = %pid, - %syscall, - ?pvf, - "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" - ); - } - Outcome::IoErr(err.to_string()) }, Err(_) => { @@ -234,6 +223,8 @@ async fn handle_response( Ok(result) => result, // Timed out on the child. This should already be logged by the child. Err(PrepareError::TimedOut) => return Outcome::TimedOut, + // The prepare job died. 
+ Err(PrepareError::JobDied) => return Outcome::JobDied, Err(_) => return Outcome::Concluded { worker, result }, }; @@ -269,7 +260,7 @@ async fn handle_response( artifact_path.display(), err, ); - Outcome::RenameTmpFileErr { + Outcome::RenameTmpFile { worker, result, err: format!("{:?}", err), @@ -312,7 +303,7 @@ where "failed to create a temp file for the artifact: {:?}", err, ); - return Outcome::CreateTmpFileErr { + return Outcome::CreateTmpFile { worker: IdleWorker { stream, pid, worker_dir }, err: format!("{:?}", err), } diff --git a/polkadot/node/core/pvf/src/testing.rs b/polkadot/node/core/pvf/src/testing.rs index 24767ef9d037..a18909527d61 100644 --- a/polkadot/node/core/pvf/src/testing.rs +++ b/polkadot/node/core/pvf/src/testing.rs @@ -78,12 +78,10 @@ pub fn build_workers_and_get_paths() -> (PathBuf, PathBuf) { .stdout(std::process::Stdio::piped()); println!("INFO: calling `{cmd:?}`"); - let exit_status = cmd - .status() - .expect("Failed to run the build program"); + let exit_status = cmd.status().expect("Failed to run the build program"); if !exit_status.success() { - eprintln!("Failed to build workers: {}", exit_status.code().unwrap()); + eprintln!("ERROR: Failed to build workers: {}", exit_status.code().unwrap()); std::process::exit(1); } } @@ -99,19 +97,19 @@ pub fn build_workers_and_get_paths() -> (PathBuf, PathBuf) { // explain why a build happens if !prepare_worker_path.is_executable() { - eprintln!("WARN: Prepare worker does not exist or is not executable. Workers directory: {:?}", workers_path); + println!("WARN: Prepare worker does not exist or is not executable. Workers directory: {:?}", workers_path); } if !execute_worker_path.is_executable() { - eprintln!("WARN: Execute worker does not exist or is not executable. Workers directory: {:?}", workers_path); + println!("WARN: Execute worker does not exist or is not executable. 
Workers directory: {:?}", workers_path); } if let Ok(ver) = get_worker_version(&prepare_worker_path) { if ver != NODE_VERSION { - eprintln!("WARN: Prepare worker version {ver} does not match node version {NODE_VERSION}; worker path: {prepare_worker_path:?}"); + println!("WARN: Prepare worker version {ver} does not match node version {NODE_VERSION}; worker path: {prepare_worker_path:?}"); } } if let Ok(ver) = get_worker_version(&execute_worker_path) { if ver != NODE_VERSION { - eprintln!("WARN: Execute worker version {ver} does not match node version {NODE_VERSION}; worker path: {execute_worker_path:?}"); + println!("WARN: Execute worker version {ver} does not match node version {NODE_VERSION}; worker path: {execute_worker_path:?}"); } } diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index d4cb3d2c0a27..277a87563fa5 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -295,7 +295,8 @@ rusty_fork_test! { } ); - assert_matches!(result, Err(PrepareError::IoErr(_))); + // Note that we get a more specific error if the job died than if the whole worker died. + assert_matches!(result, Err(PrepareError::JobDied)); }) } From 7e1414e82a15bce76900699bae42bac44ab41b6c Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Thu, 2 Nov 2023 18:12:44 +0100 Subject: [PATCH 24/47] Some clarifications --- polkadot/node/core/pvf/common/src/error.rs | 5 +++-- polkadot/node/core/pvf/tests/it/main.rs | 12 ++++-------- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 4f3562454f42..210c900d2304 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -31,7 +31,7 @@ pub enum PrepareError { Preparation(String), /// Instantiation of the WASM module instance failed. 
RuntimeConstruction(String), - /// An unexpected panic has occurred in the preparation worker. + /// An unexpected panic has occurred in the preparation job. Panic(String), /// Failed to prepare the PVF due to the time limit. TimedOut, @@ -72,13 +72,14 @@ impl PrepareError { use PrepareError::*; match self { Prevalidation(_) | Preparation(_) | Panic(_) => true, - TimedOut | IoErr(_) | JobDied | CreateTmpFile(_) | RenameTmpFile { .. } | ClearWorkerDir(_) | Kernel(_) => false, + // Can occur due to issues with the PVF, but also due to factors like local load. + TimedOut => false, // Can occur due to issues with the PVF, but also due to local errors. RuntimeConstruction(_) => false, } diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 277a87563fa5..b5394a260739 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -14,6 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . +#[allow(unused_imports)] use assert_matches::assert_matches; use parity_scale_codec::Encode as _; use polkadot_node_core_pvf::{ @@ -186,14 +187,9 @@ fn kill_by_sid_and_name(sid: i32, exe_name: &'static str, is_direct_child: bool) } // The workers are direct children of the current process, the worker job processes are not // (they are children of the workers). 
- if is_direct_child { - if stat.ppid as u32 != std::process::id() { - continue - } - } else { - if stat.ppid as u32 == std::process::id() { - continue - } + let process_is_direct_child = stat.ppid as u32 == std::process::id(); + if is_direct_child != process_is_direct_child { + continue } assert_eq!(unsafe { libc::kill(process.pid(), 9) }, 0); From d9fcb5dd9b1b534265c54b6fcbe49d323d75dbb0 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Thu, 2 Nov 2023 15:58:53 -0300 Subject: [PATCH 25/47] change execute worker from thread to fork --- Cargo.lock | 3 + polkadot/node/core/pvf/Cargo.toml | 1 + .../node/core/pvf/common/src/worker/mod.rs | 1 - .../node/core/pvf/execute-worker/Cargo.toml | 3 + .../node/core/pvf/execute-worker/src/lib.rs | 317 +++++++++++++----- .../node/core/pvf/prepare-worker/src/lib.rs | 26 +- 6 files changed, 253 insertions(+), 98 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c575e0b77735..cc06b20e7bba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12404,6 +12404,9 @@ name = "polkadot-node-core-pvf-execute-worker" version = "1.0.0" dependencies = [ "cpu-time", + "libc", + "nix 0.27.1", + "os_pipe", "parity-scale-codec", "polkadot-node-core-pvf-common", "polkadot-parachain-primitives", diff --git a/polkadot/node/core/pvf/Cargo.toml b/polkadot/node/core/pvf/Cargo.toml index 430f7cd5e8ef..44c98f329eaa 100644 --- a/polkadot/node/core/pvf/Cargo.toml +++ b/polkadot/node/core/pvf/Cargo.toml @@ -39,6 +39,7 @@ polkadot-node-core-pvf-execute-worker = { path = "execute-worker", optional = tr assert_matches = "1.4.0" criterion = { version = "0.4.0", default-features = false, features = ["cargo_bench_support", "async_tokio"] } hex-literal = "0.4.1" +sp-tracing = { path = "../../../../substrate/primitives/tracing" } polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] } # For benches and integration tests, depend on ourselves with the test-utils diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs 
b/polkadot/node/core/pvf/common/src/worker/mod.rs index 274a2fc80397..ff4e290d35e8 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -451,7 +451,6 @@ fn kill_parent_node_in_emergency() { } } } - /// Functionality related to threads spawned by the workers. /// /// The motivation for this module is to coordinate worker threads without using async Rust. diff --git a/polkadot/node/core/pvf/execute-worker/Cargo.toml b/polkadot/node/core/pvf/execute-worker/Cargo.toml index 77a9420961c0..40e0ff4f0a19 100644 --- a/polkadot/node/core/pvf/execute-worker/Cargo.toml +++ b/polkadot/node/core/pvf/execute-worker/Cargo.toml @@ -9,6 +9,9 @@ license.workspace = true [dependencies] cpu-time = "1.0.0" gum = { package = "tracing-gum", path = "../../../gum" } +os_pipe = "1.1.4" +nix = { version = "0.27.1", features = ["resource", "process"]} +libc = "0.2.139" parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 8872f9bc8dd3..92d02e8aa330 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -16,6 +16,8 @@ //! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary. +use nix::{sys::{resource::{Resource, Usage, UsageWho}, wait::WaitStatus, signal::Signal}, unistd::ForkResult}; +use os_pipe::PipeWriter; pub use polkadot_node_core_pvf_common::{ executor_intf::execute_artifact, worker_dir, SecurityStatus, }; @@ -24,14 +26,13 @@ pub use polkadot_node_core_pvf_common::{ // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-execute-worker=trace`. 
const LOG_TARGET: &str = "parachain::pvf-execute-worker"; -use cpu_time::ProcessTime; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, execute::{Handshake, Response}, framed_recv_blocking, framed_send_blocking, worker::{ - cpu_time_monitor_loop, stringify_panic_payload, + stringify_panic_payload, thread::{self, WaitOutcome}, worker_event_loop, WorkerKind, }, @@ -39,11 +40,11 @@ use polkadot_node_core_pvf_common::{ use polkadot_parachain_primitives::primitives::ValidationResult; use polkadot_primitives::{executor_params::DEFAULT_NATIVE_STACK_MAX, ExecutorParams}; use std::{ - io, + io::{self, Write, Read}, os::unix::net::UnixStream, path::PathBuf, - sync::{mpsc::channel, Arc}, - time::Duration, + sync::Arc, + time::Duration, process, }; // Wasmtime powers the Substrate Executor. It compiles the wasm bytecode into native code. @@ -165,83 +166,42 @@ pub fn worker_entrypoint( }, }; - // Conditional variable to notify us when a thread is done. - let condvar = thread::get_condvar(); - - let cpu_time_start = ProcessTime::now(); - - // Spawn a new thread that runs the CPU time monitor. 
- let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>(); - let cpu_time_monitor_thread = thread::spawn_worker_thread( - "cpu time monitor thread", - move || { - cpu_time_monitor_loop( - cpu_time_start, - execution_timeout, - cpu_time_monitor_rx, - ) - }, - Arc::clone(&condvar), - WaitOutcome::TimedOut, - )?; - - let executor_params_2 = executor_params.clone(); - let execute_thread = thread::spawn_worker_thread_with_stack_size( - "execute thread", - move || { - validate_using_artifact( - &compiled_artifact_blob, - &executor_params_2, - ¶ms, - cpu_time_start, - ) - }, - Arc::clone(&condvar), - WaitOutcome::Finished, - EXECUTE_THREAD_STACK_SIZE, - )?; - - let outcome = thread::wait_for_threads(condvar); - - let response = match outcome { - WaitOutcome::Finished => { - let _ = cpu_time_monitor_tx.send(()); - execute_thread - .join() - .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))) - }, - // If the CPU thread is not selected, we signal it to end, the join handle is - // dropped and the thread will finish in the background. - WaitOutcome::TimedOut => { - match cpu_time_monitor_thread.join() { - Ok(Some(cpu_time_elapsed)) => { - // Log if we exceed the timeout and the other thread hasn't - // finished. 
- gum::warn!( - target: LOG_TARGET, - %worker_pid, - "execute job took {}ms cpu time, exceeded execute timeout {}ms", - cpu_time_elapsed.as_millis(), - execution_timeout.as_millis(), - ); - Response::TimedOut - }, - Ok(None) => Response::InternalError( - InternalValidationError::CpuTimeMonitorThread( - "error communicating over finished channel".into(), - ), - ), - Err(e) => Response::InternalError( - InternalValidationError::CpuTimeMonitorThread( - stringify_panic_payload(e), - ), - ), - } - }, - WaitOutcome::Pending => unreachable!( - "we run wait_while until the outcome is no longer pending; qed" - ), - }; + + let (pipe_reader, pipe_writer) = os_pipe::pipe()?; + + let usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)?; + + // SAFETY: new process is spawned within a single threaded process + let response = match unsafe { nix::unistd::fork() } { + Err(_errno) => Response::Panic(String::from("error forking")), + Ok(ForkResult::Child) => { + // Dropping the stream closes the underlying socket. We want to make sure + // that the sandboxed child can't get any kind of information from the + // outside world. The only IPC it should be able to do is sending its + // response over the pipe. 
+ drop(stream); + + handle_child_process( + pipe_writer, + compiled_artifact_blob, + executor_params, + params, + execution_timeout, + ) + }, + // parent + Ok(ForkResult::Parent { child: _child }) => { + // the read end will wait until all write ends have been closed, + // this drop is necessary to avoid deadlock + drop(pipe_writer); + + handle_parent_process( + pipe_reader, + usage_before, + execution_timeout, + ) + }, + }; gum::trace!( target: LOG_TARGET, @@ -259,7 +219,6 @@ fn validate_using_artifact( compiled_artifact_blob: &[u8], executor_params: &ExecutorParams, params: &[u8], - cpu_time_start: ProcessTime, ) -> Response { let descriptor_bytes = match unsafe { // SAFETY: this should be safe since the compiled artifact passed here comes from the @@ -277,9 +236,193 @@ fn validate_using_artifact( Ok(r) => r, }; - // Include the decoding in the measured time, to prevent any potential attacks exploiting some - // bug in decoding. - let duration = cpu_time_start.elapsed(); + // duration is set to 0 here because the process duration is calculated on the parent process + Response::Ok { result_descriptor, duration: Duration::from_secs(0) } +} + + + +/// This is used to handle child process during pvf execute worker. +/// It execute the artifact and pipes back the response to the parent process +/// +/// # Arguments +/// +/// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. +/// +/// - `compiled_artifact_blob`: The artifact bytes from compiled by the prepare worker`. +/// +/// - `executor_params`: Deterministically serialized execution environment semantics. +/// +/// - `params`: +/// +/// - `execution_timeout`: The timeout in `Duration`. +/// +/// # Returns +/// +/// - pipe back `Response` to the parent process. +fn handle_child_process( + pipe_write: os_pipe::PipeWriter, + compiled_artifact_blob: Vec, + executor_params: ExecutorParams, + params: Vec, + execution_timeout: Duration +) -> ! 
{ + gum::debug!( + target: LOG_TARGET, + worker_job_pid = %std::process::id(), + "worker job: executing artifact", + ); + + // Set a hard CPU time limit for the child process. + nix::sys::resource::setrlimit( + Resource::RLIMIT_CPU, + execution_timeout.as_secs(), + execution_timeout.as_secs(), + ) + .unwrap_or_else(|err| { + send_child_response(&pipe_write, Response::Panic(err.to_string())); + }); + + // Conditional variable to notify us when a thread is done. + let condvar = thread::get_condvar(); + + let executor_params_2 = executor_params.clone(); + let execute_thread = thread::spawn_worker_thread_with_stack_size( + "execute thread", + move || { + validate_using_artifact( + &compiled_artifact_blob, + &executor_params_2, + ¶ms, + ) + }, + Arc::clone(&condvar), + WaitOutcome::Finished, + EXECUTE_THREAD_STACK_SIZE, + ) + .unwrap_or_else(|err| { + send_child_response(&pipe_write, Response::Panic(err.to_string())) + }); + + // There's only one thread that can trigger the condvar, so ignore the condvar outcome and + // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. + let response = execute_thread + .join() + .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))); + + + send_child_response(&pipe_write, response); +} + +/// Waits for child process to finish and handle child response from pipe. +/// +/// # Arguments +/// +/// - `pipe_read`: A `PipeReader` used to read data from the child process. +/// +/// - `usage_before`: Resource usage statistics before executing the child process. +/// +/// - `timeout`: The maximum allowed time for the child process to finish, in `Duration`. +/// +/// # Returns +/// +/// - If no unexpected error occurr, this function return child response +/// +/// - If an unexpected error occurr, this function returns `Response::Panic` +/// +/// - If the child process timeout, it returns `Response::TimedOut`. 
+fn handle_parent_process( + mut pipe_read: os_pipe::PipeReader, + usage_before: Usage, + timeout: Duration, +) -> Response { + let mut received_data = Vec::new(); + + // Read from the child. + if let Err(err) = pipe_read + .read_to_end(&mut received_data) { + return Response::Panic(err.to_string()) + } + + let status = nix::sys::wait::wait(); + + let usage_after: Usage; + + match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { + Ok(usage) => { + usage_after = usage + }, + Err(err) => { + return Response::Panic(err.to_string()) + } + }; + + // Using `getrusage` is needed to check whether `setrlimit` was triggered. + // As `getrusage` returns resource usage from all terminated child processes, + // it is necessary to subtract the usage before the current child process to isolate its cpu + // time + let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); + + if cpu_tv.as_secs() >= timeout.as_secs() { + return Response::TimedOut + } + + match status { + Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { + match Response::decode(&mut received_data.as_slice()) { + Ok(Response::Ok { result_descriptor, duration: _ }) => Response::Ok { result_descriptor, duration: cpu_tv }, + Ok(response) => response, + Err(err) => Response::Panic(err.to_string()) + } + }, + Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => { + Response::Panic("child exited with failure".to_string()) + }, + Ok(WaitStatus::Exited(_, exit_status)) => { + Response::Panic(format!("child exited with unexpected status {}", exit_status)) + }, + Ok(WaitStatus::Signaled(_, sig, _)) => { + Response::Panic(format!("child ended with unexpected signal {:?}, timeout {} cpu_tv {} after {} before {}", sig, timeout.as_secs(), cpu_tv.as_micros(), + usage_after.user_time().tv_sec() + usage_after.system_time().tv_sec(), get_total_cpu_usage(usage_before).as_micros())) + } + Ok(_) => { + Response::Panic("child ended unexpectedly".to_string()) + } + Err(err) => 
Response::Panic(err.to_string()) + } +} + + + +/// Calculate the total CPU time from the given `usage` structure, returned from +/// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user +/// and system time. +/// +/// # Arguments +/// +/// - `rusage`: Contains resource usage information. +/// +/// # Returns +/// +/// Returns a `Duration` representing the total CPU time. +fn get_total_cpu_usage(rusage: Usage) -> Duration { + let micros = (((rusage.user_time().tv_sec() + rusage.system_time().tv_sec()) * 1_000_000) + + (rusage.system_time().tv_usec() + rusage.user_time().tv_usec()) as i64) as u64; + + return Duration::from_micros(micros) +} + +/// Write response to the pipe and exit process after. +/// +/// # Arguments +/// +/// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. +/// +/// - `response`: Child process response +fn send_child_response(mut pipe_write: &PipeWriter, response: Response) -> ! { + pipe_write + .write_all(response.encode().as_slice()) + .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - Response::Ok { result_descriptor, duration } + process::exit(libc::EXIT_SUCCESS) } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 64fb849ba5c6..56df1f8a4eb7 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -32,7 +32,7 @@ use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_ use libc; use nix::{ errno::Errno, - sys::resource::{Resource, Usage, UsageWho}, + sys::{resource::{Resource, Usage, UsageWho}, wait::WaitStatus}, unistd::{ForkResult, Pid}, }; use os_pipe::{self, PipeWriter}; @@ -200,7 +200,7 @@ pub fn worker_entrypoint( temp_artifact_dest.clone(), worker_pid, usage_before, - preparation_timeout.as_secs(), + preparation_timeout, ) }, }; @@ -370,7 +370,7 @@ fn handle_child_process( /// /// - `usage_before`: Resource 
usage statistics before executing the child process. /// -/// - `timeout`: The maximum allowed time for the child process to finish, in milliseconds. +/// - `timeout`: The maximum allowed time for the child process to finish, in `Duration`. /// /// # Returns /// @@ -380,16 +380,13 @@ fn handle_child_process( /// - If the child send response with an error, it returns a `PrepareError` with that error. /// /// - If the child process timeout, it returns `PrepareError::TimedOut`. -/// -/// TODO -/// - If the child process exits with an unknown status, it returns `PrepareError`. fn handle_parent_process( mut pipe_read: os_pipe::PipeReader, child: Pid, temp_artifact_dest: PathBuf, worker_pid: u32, usage_before: Usage, - timeout: u64, + timeout: Duration, ) -> Result { let mut received_data = Vec::new(); @@ -408,12 +405,12 @@ fn handle_parent_process( // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - if cpu_tv.as_secs() >= timeout { + if cpu_tv.as_secs() >= timeout.as_secs() { return Err(PrepareError::TimedOut) } return match status { - Ok(_) => { + Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { let result: Result = Result::decode(&mut received_data.as_slice()) // This error happens when the job dies. @@ -446,7 +443,16 @@ fn handle_parent_process( }, } }, - _ => Err(PrepareError::Panic("child finished with unknown status".to_string())), + Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => { + Err(PrepareError::Panic("child exited with failure".to_string())) + }, + Ok(WaitStatus::Exited(_, exit_status)) => { + Err(PrepareError::Panic(format!("child exited with unexpected status {}", exit_status))) + }, + Ok(_) => { + Err(PrepareError::Panic("child ended unexpectedly".to_string())) + } + Err(err) => Err(PrepareError::Panic(err.to_string())) } } From f830b059c6ff3ad33c9af75a8e156ddb0fc453a0 Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Fri, 3 Nov 2023 11:48:13 +0100 Subject: [PATCH 26/47] Fix bench --- .../pvf/benches/host_prepare_rococo_runtime.rs | 2 +- polkadot/node/core/pvf/common/src/worker/mod.rs | 1 + polkadot/node/core/pvf/src/testing.rs | 14 +++++++++----- polkadot/node/core/pvf/tests/it/main.rs | 2 +- polkadot/node/core/pvf/tests/it/worker_common.rs | 6 +++--- 5 files changed, 15 insertions(+), 10 deletions(-) diff --git a/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs b/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs index acd80526262c..8f9caa86d8c8 100644 --- a/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs +++ b/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs @@ -37,7 +37,7 @@ impl TestHost { where F: FnOnce(&mut Config), { - let (prepare_worker_path, execute_worker_path) = testing::get_and_check_worker_paths(); + let (prepare_worker_path, execute_worker_path) = testing::build_workers_and_get_paths(true); let cache_dir = tempfile::tempdir().unwrap(); let mut config = Config::new( diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index ff4e290d35e8..274a2fc80397 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -451,6 +451,7 @@ fn kill_parent_node_in_emergency() { } } } + /// Functionality related to threads spawned by the workers. /// /// The motivation for this module is to coordinate worker threads without using async Rust. diff --git a/polkadot/node/core/pvf/src/testing.rs b/polkadot/node/core/pvf/src/testing.rs index a18909527d61..42fe8931e3f9 100644 --- a/polkadot/node/core/pvf/src/testing.rs +++ b/polkadot/node/core/pvf/src/testing.rs @@ -14,7 +14,7 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -//! Various things for testing other crates. +//! Various utilities for testing. 
pub use crate::{ host::{EXECUTE_BINARY_NAME, PREPARE_BINARY_NAME}, @@ -59,17 +59,21 @@ pub fn validate_candidate( /// /// NOTE: This should only be called in dev code (tests, benchmarks) as it relies on the relative /// paths of the built workers. -pub fn build_workers_and_get_paths() -> (PathBuf, PathBuf) { +pub fn build_workers_and_get_paths(is_bench: bool) -> (PathBuf, PathBuf) { // Only needs to be called once for the current process. static WORKER_PATHS: OnceLock> = OnceLock::new(); - fn build_workers() { - let build_args = vec![ + fn build_workers(is_bench: bool) { + let mut build_args = vec![ "build", "--package=polkadot", "--bin=polkadot-prepare-worker", "--bin=polkadot-execute-worker", ]; + if is_bench { + // Benches require --release. Regular tests are debug (no flag needed). + build_args.push("--release"); + } let mut cargo = std::process::Command::new("cargo"); let cmd = cargo // wasm runtime not needed @@ -113,7 +117,7 @@ pub fn build_workers_and_get_paths() -> (PathBuf, PathBuf) { } } - build_workers(); + build_workers(is_bench); Mutex::new((prepare_worker_path, execute_worker_path)) }); diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index b5394a260739..45ac22d8b3b5 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -53,7 +53,7 @@ impl TestHost { where F: FnOnce(&mut Config), { - let (prepare_worker_path, execute_worker_path) = build_workers_and_get_paths(); + let (prepare_worker_path, execute_worker_path) = build_workers_and_get_paths(false); let cache_dir = tempfile::tempdir().unwrap(); let mut config = Config::new( diff --git a/polkadot/node/core/pvf/tests/it/worker_common.rs b/polkadot/node/core/pvf/tests/it/worker_common.rs index 4b736b08ba60..0d33af7e096c 100644 --- a/polkadot/node/core/pvf/tests/it/worker_common.rs +++ b/polkadot/node/core/pvf/tests/it/worker_common.rs @@ -23,7 +23,7 @@ use std::{env, time::Duration}; // Test spawning a program 
that immediately exits with a failure code. #[tokio::test] async fn spawn_immediate_exit() { - let (prepare_worker_path, _) = build_workers_and_get_paths(); + let (prepare_worker_path, _) = build_workers_and_get_paths(false); // There's no explicit `exit` subcommand in the worker; it will panic on an unknown // subcommand anyway @@ -41,7 +41,7 @@ async fn spawn_immediate_exit() { #[tokio::test] async fn spawn_timeout() { - let (_, execute_worker_path) = build_workers_and_get_paths(); + let (_, execute_worker_path) = build_workers_and_get_paths(false); let result = spawn_with_program_path( "integration-test", @@ -57,7 +57,7 @@ async fn spawn_timeout() { #[tokio::test] async fn should_connect() { - let (prepare_worker_path, _) = build_workers_and_get_paths(); + let (prepare_worker_path, _) = build_workers_and_get_paths(false); let _ = spawn_with_program_path( "integration-test", From d9fcc7c17e3362e3a9c87cbbce3fa51dae718ccd Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Fri, 3 Nov 2023 11:49:03 +0100 Subject: [PATCH 27/47] cargo fmt --- .../node/core/pvf/execute-worker/src/lib.rs | 172 ++++++++---------- .../node/core/pvf/prepare-worker/src/lib.rs | 21 +-- 2 files changed, 89 insertions(+), 104 deletions(-) diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 92d02e8aa330..a613a3a07301 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -16,7 +16,14 @@ //! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary. 
-use nix::{sys::{resource::{Resource, Usage, UsageWho}, wait::WaitStatus, signal::Signal}, unistd::ForkResult}; +use nix::{ + sys::{ + resource::{Resource, Usage, UsageWho}, + signal::Signal, + wait::WaitStatus, + }, + unistd::ForkResult, +}; use os_pipe::PipeWriter; pub use polkadot_node_core_pvf_common::{ executor_intf::execute_artifact, worker_dir, SecurityStatus, @@ -40,11 +47,12 @@ use polkadot_node_core_pvf_common::{ use polkadot_parachain_primitives::primitives::ValidationResult; use polkadot_primitives::{executor_params::DEFAULT_NATIVE_STACK_MAX, ExecutorParams}; use std::{ - io::{self, Write, Read}, + io::{self, Read, Write}, os::unix::net::UnixStream, path::PathBuf, + process, sync::Arc, - time::Duration, process, + time::Duration, }; // Wasmtime powers the Substrate Executor. It compiles the wasm bytecode into native code. @@ -166,42 +174,37 @@ pub fn worker_entrypoint( }, }; - let (pipe_reader, pipe_writer) = os_pipe::pipe()?; let usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)?; - // SAFETY: new process is spawned within a single threaded process - let response = match unsafe { nix::unistd::fork() } { - Err(_errno) => Response::Panic(String::from("error forking")), - Ok(ForkResult::Child) => { - // Dropping the stream closes the underlying socket. We want to make sure - // that the sandboxed child can't get any kind of information from the - // outside world. The only IPC it should be able to do is sending its - // response over the pipe. 
- drop(stream); - - handle_child_process( - pipe_writer, - compiled_artifact_blob, - executor_params, - params, - execution_timeout, - ) - }, - // parent - Ok(ForkResult::Parent { child: _child }) => { - // the read end will wait until all write ends have been closed, - // this drop is necessary to avoid deadlock - drop(pipe_writer); - - handle_parent_process( - pipe_reader, - usage_before, - execution_timeout, - ) - }, - }; + // SAFETY: new process is spawned within a single threaded process + let response = match unsafe { nix::unistd::fork() } { + Err(_errno) => Response::Panic(String::from("error forking")), + Ok(ForkResult::Child) => { + // Dropping the stream closes the underlying socket. We want to make sure + // that the sandboxed child can't get any kind of information from the + // outside world. The only IPC it should be able to do is sending its + // response over the pipe. + drop(stream); + + handle_child_process( + pipe_writer, + compiled_artifact_blob, + executor_params, + params, + execution_timeout, + ) + }, + // parent + Ok(ForkResult::Parent { child: _child }) => { + // the read end will wait until all write ends have been closed, + // this drop is necessary to avoid deadlock + drop(pipe_writer); + + handle_parent_process(pipe_reader, usage_before, execution_timeout) + }, + }; gum::trace!( target: LOG_TARGET, @@ -240,8 +243,6 @@ fn validate_using_artifact( Response::Ok { result_descriptor, duration: Duration::from_secs(0) } } - - /// This is used to handle child process during pvf execute worker. /// It execute the artifact and pipes back the response to the parent process /// @@ -265,7 +266,7 @@ fn handle_child_process( compiled_artifact_blob: Vec, executor_params: ExecutorParams, params: Vec, - execution_timeout: Duration + execution_timeout: Duration, ) -> ! 
{ gum::debug!( target: LOG_TARGET, @@ -283,34 +284,25 @@ fn handle_child_process( send_child_response(&pipe_write, Response::Panic(err.to_string())); }); - // Conditional variable to notify us when a thread is done. - let condvar = thread::get_condvar(); - - let executor_params_2 = executor_params.clone(); - let execute_thread = thread::spawn_worker_thread_with_stack_size( - "execute thread", - move || { - validate_using_artifact( - &compiled_artifact_blob, - &executor_params_2, - &params, - ) - }, - Arc::clone(&condvar), - WaitOutcome::Finished, - EXECUTE_THREAD_STACK_SIZE, - ) - .unwrap_or_else(|err| { - send_child_response(&pipe_write, Response::Panic(err.to_string())) - }); + // Conditional variable to notify us when a thread is done. + let condvar = thread::get_condvar(); - // There's only one thread that can trigger the condvar, so ignore the condvar outcome and - // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. - let response = execute_thread - .join() - .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))); + let executor_params_2 = executor_params.clone(); + let execute_thread = thread::spawn_worker_thread_with_stack_size( + "execute thread", + move || validate_using_artifact(&compiled_artifact_blob, &executor_params_2, &params), + Arc::clone(&condvar), + WaitOutcome::Finished, + EXECUTE_THREAD_STACK_SIZE, + ) + .unwrap_or_else(|err| send_child_response(&pipe_write, Response::Panic(err.to_string()))); + + // There's only one thread that can trigger the condvar, so ignore the condvar outcome and + // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. + let response = execute_thread + .join() + .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))); - send_child_response(&pipe_write, response); } @@ -339,22 +331,17 @@ fn handle_parent_process( let mut received_data = Vec::new(); // Read from the child. 
- if let Err(err) = pipe_read - .read_to_end(&mut received_data) { + if let Err(err) = pipe_read.read_to_end(&mut received_data) { return Response::Panic(err.to_string()) } - - let status = nix::sys::wait::wait(); + + let status = nix::sys::wait::wait(); let usage_after: Usage; - + match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { - Ok(usage) => { - usage_after = usage - }, - Err(err) => { - return Response::Panic(err.to_string()) - } + Ok(usage) => usage_after = usage, + Err(err) => return Response::Panic(err.to_string()), }; // Using `getrusage` is needed to check whether `setrlimit` was triggered. @@ -370,30 +357,29 @@ fn handle_parent_process( match status { Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { match Response::decode(&mut received_data.as_slice()) { - Ok(Response::Ok { result_descriptor, duration: _ }) => Response::Ok { result_descriptor, duration: cpu_tv }, + Ok(Response::Ok { result_descriptor, duration: _ }) => + Response::Ok { result_descriptor, duration: cpu_tv }, Ok(response) => response, - Err(err) => Response::Panic(err.to_string()) + Err(err) => Response::Panic(err.to_string()), } }, - Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => { - Response::Panic("child exited with failure".to_string()) - }, - Ok(WaitStatus::Exited(_, exit_status)) => { - Response::Panic(format!("child exited with unexpected status {}", exit_status)) - }, - Ok(WaitStatus::Signaled(_, sig, _)) => { - Response::Panic(format!("child ended with unexpected signal {:?}, timeout {} cpu_tv {} after {} before {}", sig, timeout.as_secs(), cpu_tv.as_micros(), - usage_after.user_time().tv_sec() + usage_after.system_time().tv_sec(), get_total_cpu_usage(usage_before).as_micros())) - } - Ok(_) => { - Response::Panic("child ended unexpectedly".to_string()) - } - Err(err) => Response::Panic(err.to_string()) + Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => + Response::Panic("child exited with failure".to_string()), + Ok(WaitStatus::Exited(_, exit_status)) => + 
Response::Panic(format!("child exited with unexpected status {}", exit_status)), + Ok(WaitStatus::Signaled(_, sig, _)) => Response::Panic(format!( + "child ended with unexpected signal {:?}, timeout {} cpu_tv {} after {} before {}", + sig, + timeout.as_secs(), + cpu_tv.as_micros(), + usage_after.user_time().tv_sec() + usage_after.system_time().tv_sec(), + get_total_cpu_usage(usage_before).as_micros() + )), + Ok(_) => Response::Panic("child ended unexpectedly".to_string()), + Err(err) => Response::Panic(err.to_string()), } } - - /// Calculate the total CPU time from the given `usage` structure, returned from /// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user /// and system time. diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 56df1f8a4eb7..7a6b8ae81151 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -32,7 +32,10 @@ use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_ use libc; use nix::{ errno::Errno, - sys::{resource::{Resource, Usage, UsageWho}, wait::WaitStatus}, + sys::{ + resource::{Resource, Usage, UsageWho}, + wait::WaitStatus, + }, unistd::{ForkResult, Pid}, }; use os_pipe::{self, PipeWriter}; @@ -443,16 +446,12 @@ fn handle_parent_process( }, } }, - Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => { - Err(PrepareError::Panic("child exited with failure".to_string())) - }, - Ok(WaitStatus::Exited(_, exit_status)) => { - Err(PrepareError::Panic(format!("child exited with unexpected status {}", exit_status))) - }, - Ok(_) => { - Err(PrepareError::Panic("child ended unexpectedly".to_string())) - } - Err(err) => Err(PrepareError::Panic(err.to_string())) + Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => + Err(PrepareError::Panic("child exited with failure".to_string())), + Ok(WaitStatus::Exited(_, exit_status)) => + 
Err(PrepareError::Panic(format!("child exited with unexpected status {}", exit_status))), + Ok(_) => Err(PrepareError::Panic("child ended unexpectedly".to_string())), + Err(err) => Err(PrepareError::Panic(err.to_string())), } } From 7d86911eb1ff07d1f9f79efbe38aabafe8cc0933 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Fri, 3 Nov 2023 12:15:15 +0100 Subject: [PATCH 28/47] Fix some issues with prepare worker --- polkadot/node/core/pvf/Cargo.toml | 1 - polkadot/node/core/pvf/prepare-worker/src/lib.rs | 14 +++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/polkadot/node/core/pvf/Cargo.toml b/polkadot/node/core/pvf/Cargo.toml index 44c98f329eaa..430f7cd5e8ef 100644 --- a/polkadot/node/core/pvf/Cargo.toml +++ b/polkadot/node/core/pvf/Cargo.toml @@ -39,7 +39,6 @@ polkadot-node-core-pvf-execute-worker = { path = "execute-worker", optional = tr assert_matches = "1.4.0" criterion = { version = "0.4.0", default-features = false, features = ["cargo_bench_support", "async_tokio"] } hex-literal = "0.4.1" -sp-tracing = { path = "../../../../substrate/primitives/tracing" } polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] } # For benches and integration tests, depend on ourselves with the test-utils diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 7a6b8ae81151..97ee37bc52a3 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -408,7 +408,7 @@ fn handle_parent_process( // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - if cpu_tv.as_secs() >= timeout.as_secs() { + if cpu_tv >= timeout { return Err(PrepareError::TimedOut) } @@ -446,12 +446,12 @@ fn handle_parent_process( }, } }, - Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => - Err(PrepareError::Panic("child exited with failure".to_string())), - Ok(WaitStatus::Exited(_, exit_status)) => - 
Err(PrepareError::Panic(format!("child exited with unexpected status {}", exit_status))), - Ok(_) => Err(PrepareError::Panic("child ended unexpectedly".to_string())), - Err(err) => Err(PrepareError::Panic(err.to_string())), + Err(errno) => Err(err_from_errno("waitpid", errno)), + // An attacker can make the child process return any exit status it wants. So we can treat + // all unexpected cases the same way. + Ok(unexpected_wait_status) => Err(PrepareError::IoErr(format!( + "unexpected status from wait: {unexpected_wait_status:?}" + ))), } } From 3b855d8b558d834119107c146d9093f947229a26 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Fri, 3 Nov 2023 15:26:35 +0100 Subject: [PATCH 29/47] Fix some issues with execute worker/job; update errors --- .../node/core/candidate-validation/src/lib.rs | 15 ++- polkadot/node/core/pvf/common/src/error.rs | 11 ++ polkadot/node/core/pvf/common/src/execute.rs | 9 ++ .../node/core/pvf/execute-worker/src/lib.rs | 117 ++++++++++-------- .../node/core/pvf/prepare-worker/src/lib.rs | 35 ++++-- polkadot/node/core/pvf/src/error.rs | 39 +++--- polkadot/node/core/pvf/src/execute/queue.rs | 11 ++ .../node/core/pvf/src/execute/worker_intf.rs | 25 +++- 8 files changed, 171 insertions(+), 91 deletions(-) diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 93db7d11cee8..b3729106c24c 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -647,6 +647,13 @@ async fn validate_candidate_exhaustive( ))), Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(err))) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), + + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath)) => + Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError( + "ambiguous job death".to_string(), + ))), + 
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::UnexpectedJobStatus(err))) => + Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::PrepareError(e))) => { // In principle if preparation of the `WASM` fails, the current candidate can not be the // reason for that. So we can't say whether it is invalid or not. In addition, with @@ -739,7 +746,7 @@ trait ValidationBackend { // Allow limited retries for each kind of error. let mut num_internal_retries_left = 1; - let mut num_awd_retries_left = 1; + let mut num_death_retries_left = 1; let mut num_panic_retries_left = 1; loop { // Stop retrying if we exceeded the timeout. @@ -749,8 +756,10 @@ trait ValidationBackend { match validation_result { Err(ValidationError::InvalidCandidate( - WasmInvalidCandidate::AmbiguousWorkerDeath, - )) if num_awd_retries_left > 0 => num_awd_retries_left -= 1, + WasmInvalidCandidate::AmbiguousWorkerDeath | + WasmInvalidCandidate::AmbiguousJobDeath | + WasmInvalidCandidate::UnexpectedJobStatus, + )) if num_death_retries_left > 0 => num_death_retries_left -= 1, Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(_))) if num_panic_retries_left > 0 => num_panic_retries_left -= 1, diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 210c900d2304..9a5cafb7b732 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -128,7 +128,15 @@ pub enum InternalValidationError { /// An error occurred in the CPU time monitor thread. Should be totally unrelated to /// validation. CpuTimeMonitorThread(String), + /// Could not spawn the execution job thread. + CouldNotSpawnJobThread(String), + /// Some error occurred when interfacing with the kernel. + Kernel(String), + /// Some non-deterministic preparation error occurred. + /// + /// It is OK if attackers can trigger such errors. 
We assume that such attacks are filtered + /// during pre-checking. NonDeterministicPrepareError(PrepareError), } @@ -152,6 +160,9 @@ impl fmt::Display for InternalValidationError { ), CpuTimeMonitorThread(err) => write!(f, "validation: an error occurred in the CPU time monitor thread: {}", err), + CouldNotSpawnJobThread(err) => + write!(f, "validation: could not spawn execution job thread: {}", err), + Kernel(err) => write!(f, "validation: error interfacing with the kernel: {}", err), NonDeterministicPrepareError(err) => write!(f, "validation: prepare: {}", err), } } diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index b89ab089af1c..887352def07a 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -44,6 +44,15 @@ pub enum Response { TimedOut, /// An unexpected panic has occurred in the execution worker. Panic(String), + /// The job process has died. We must kill the worker just in case. + /// + /// We cannot treat this as an internal error because malicious code may have caused this. + JobDied, + /// The execute job returned an unexpected status. + /// + /// We cannot treat this as an internal error because malicious code may have caused this. + UnexpectedJobStatus(String), + /// Some internal error occurred. InternalError(InternalValidationError), } diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index a613a3a07301..73e19434f979 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -16,15 +16,6 @@ //! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary. 
-use nix::{ - sys::{ - resource::{Resource, Usage, UsageWho}, - signal::Signal, - wait::WaitStatus, - }, - unistd::ForkResult, -}; -use os_pipe::PipeWriter; pub use polkadot_node_core_pvf_common::{ executor_intf::execute_artifact, worker_dir, SecurityStatus, }; @@ -33,6 +24,15 @@ pub use polkadot_node_core_pvf_common::{ // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-execute-worker=trace`. const LOG_TARGET: &str = "parachain::pvf-execute-worker"; +use nix::{ + errno::Errno, + sys::{ + resource::{Resource, Usage, UsageWho}, + wait::WaitStatus, + }, + unistd::{ForkResult, Pid}, +}; +use os_pipe::PipeWriter; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, @@ -148,7 +148,7 @@ pub fn worker_entrypoint( worker_version, &security_status, |mut stream, worker_dir_path| { - let worker_pid = std::process::id(); + let worker_pid = process::id(); let artifact_path = worker_dir::execute_artifact(&worker_dir_path); let Handshake { executor_params } = recv_handshake(&mut stream)?; @@ -176,11 +176,18 @@ pub fn worker_entrypoint( let (pipe_reader, pipe_writer) = os_pipe::pipe()?; - let usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)?; + let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { + Ok(usage) => usage, + Err(errno) => { + let response = internal_error_from_errno("getrusage before", errno); + send_response(&mut stream, response)?; + continue + }, + }; // SAFETY: new process is spawned within a single threaded process let response = match unsafe { nix::unistd::fork() } { - Err(_errno) => Response::Panic(String::from("error forking")), + Err(errno) => internal_error_from_errno("fork", errno), Ok(ForkResult::Child) => { // Dropping the stream closes the underlying socket. 
We want to make sure // that the sandboxed child can't get any kind of information from the @@ -196,13 +203,12 @@ pub fn worker_entrypoint( execution_timeout, ) }, - // parent - Ok(ForkResult::Parent { child: _child }) => { + Ok(ForkResult::Parent { child }) => { // the read end will wait until all write ends have been closed, // this drop is necessary to avoid deadlock drop(pipe_writer); - handle_parent_process(pipe_reader, usage_before, execution_timeout) + handle_parent_process(pipe_reader, child, usage_before, execution_timeout) }, }; @@ -240,7 +246,7 @@ fn validate_using_artifact( }; // duration is set to 0 here because the process duration is calculated on the parent process - Response::Ok { result_descriptor, duration: Duration::from_secs(0) } + Response::Ok { result_descriptor, duration: Duration::ZERO } } /// This is used to handle child process during pvf execute worker. @@ -254,7 +260,7 @@ fn validate_using_artifact( /// /// - `executor_params`: Deterministically serialized execution environment semantics. /// -/// - `params`: +/// - `params`: Validation parameters. /// /// - `execution_timeout`: The timeout in `Duration`. /// @@ -270,7 +276,7 @@ fn handle_child_process( ) -> ! { gum::debug!( target: LOG_TARGET, - worker_job_pid = %std::process::id(), + worker_job_pid = %process::id(), "worker job: executing artifact", ); @@ -280,13 +286,15 @@ fn handle_child_process( execution_timeout.as_secs(), execution_timeout.as_secs(), ) - .unwrap_or_else(|err| { - send_child_response(&pipe_write, Response::Panic(err.to_string())); + .unwrap_or_else(|errno| { + send_child_response(&pipe_write, internal_error_from_errno("setrlimit", errno)); }); // Conditional variable to notify us when a thread is done. let condvar = thread::get_condvar(); + // TODO: We may not need this since there's only one thread here now. We do still need to + // control the stack size (see EXECUTE_THREAD_STACK_SIZE). Look into simplifying. 
let executor_params_2 = executor_params.clone(); let execute_thread = thread::spawn_worker_thread_with_stack_size( "execute thread", @@ -295,7 +303,14 @@ fn handle_child_process( WaitOutcome::Finished, EXECUTE_THREAD_STACK_SIZE, ) - .unwrap_or_else(|err| send_child_response(&pipe_write, Response::Panic(err.to_string()))); + .unwrap_or_else(|err| { + send_child_response( + &pipe_write, + Response::InternalError(InternalValidationError::CouldNotSpawnJobThread( + err.to_string(), + )), + ) + }); // There's only one thread that can trigger the condvar, so ignore the condvar outcome and // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. @@ -312,36 +327,33 @@ fn handle_child_process( /// /// - `pipe_read`: A `PipeReader` used to read data from the child process. /// +/// - `child`: The child pid. +/// /// - `usage_before`: Resource usage statistics before executing the child process. /// /// - `timeout`: The maximum allowed time for the child process to finish, in `Duration`. /// /// # Returns /// -/// - If no unexpected error occurr, this function return child response -/// -/// - If an unexpected error occurr, this function returns `Response::Panic` -/// -/// - If the child process timeout, it returns `Response::TimedOut`. +/// - The response, either `Ok` or some error state. fn handle_parent_process( mut pipe_read: os_pipe::PipeReader, + child: Pid, usage_before: Usage, timeout: Duration, ) -> Response { - let mut received_data = Vec::new(); - // Read from the child. - if let Err(err) = pipe_read.read_to_end(&mut received_data) { - return Response::Panic(err.to_string()) + let mut received_data = Vec::new(); + if let Err(_err) = pipe_read.read_to_end(&mut received_data) { + // Swallow the error, it's not really helpful as to why the child died. 
+ return Response::JobDied } - let status = nix::sys::wait::wait(); + let status = nix::sys::wait::waitpid(child, None); - let usage_after: Usage; - - match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { - Ok(usage) => usage_after = usage, - Err(err) => return Response::Panic(err.to_string()), + let usage_after = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { + Ok(usage) => usage, + Err(errno) => return internal_error_from_errno("getrusage after", errno), }; // Using `getrusage` is needed to check whether `setrlimit` was triggered. @@ -350,7 +362,7 @@ fn handle_parent_process( // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - if cpu_tv.as_secs() >= timeout.as_secs() { + if cpu_tv >= timeout { return Response::TimedOut } @@ -360,23 +372,17 @@ fn handle_parent_process( Ok(Response::Ok { result_descriptor, duration: _ }) => Response::Ok { result_descriptor, duration: cpu_tv }, Ok(response) => response, - Err(err) => Response::Panic(err.to_string()), + // This error happens when the job dies. + Err(_err) => Response::JobDied, } }, - Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => - Response::Panic("child exited with failure".to_string()), - Ok(WaitStatus::Exited(_, exit_status)) => - Response::Panic(format!("child exited with unexpected status {}", exit_status)), - Ok(WaitStatus::Signaled(_, sig, _)) => Response::Panic(format!( - "child ended with unexpected signal {:?}, timeout {} cpu_tv {} after {} before {}", - sig, - timeout.as_secs(), - cpu_tv.as_micros(), - usage_after.user_time().tv_sec() + usage_after.system_time().tv_sec(), - get_total_cpu_usage(usage_before).as_micros() + Err(errno) => internal_error_from_errno("waitpid", errno), + + // It is within an attacker's power to send an unexpected exit status. So we cannot treat + // this as an internal error (which would make us abstain), but must vote against. 
+ Ok(unexpected_wait_status) => Response::UnexpectedJobStatus(format!( + "unexpected status from wait: {unexpected_wait_status:?}" )), - Ok(_) => Response::Panic("child ended unexpectedly".to_string()), - Err(err) => Response::Panic(err.to_string()), } } @@ -412,3 +418,12 @@ fn send_child_response(mut pipe_write: &PipeWriter, response: Response) -> ! { process::exit(libc::EXIT_SUCCESS) } + +fn internal_error_from_errno(context: &'static str, errno: Errno) -> Response { + Response::InternalError(InternalValidationError::Kernel(format!( + "{}: {}: {}", + context, + errno, + io::Error::last_os_error() + ))) +} diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 97ee37bc52a3..0df610fccf2a 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -167,7 +167,7 @@ pub fn worker_entrypoint( let usage_before = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { Ok(usage) => usage, Err(errno) => { - let result = Err(err_from_errno("getrusage before", errno)); + let result = Err(error_from_errno("getrusage before", errno)); send_response(&mut stream, result)?; continue }, @@ -175,7 +175,7 @@ pub fn worker_entrypoint( // SAFETY: new process is spawned within a single threaded process let result = match unsafe { nix::unistd::fork() } { - Err(errno) => Err(err_from_errno("fork", errno)), + Err(errno) => Err(error_from_errno("fork", errno)), Ok(ForkResult::Child) => { // Dropping the stream closes the underlying socket. 
We want to make sure // that the sandboxed child can't get any kind of information from the @@ -191,7 +191,6 @@ pub fn worker_entrypoint( executor_params, ) }, - // parent Ok(ForkResult::Parent { child }) => { // the read end will wait until all write ends have been closed, // this drop is necessary to avoid deadlock @@ -207,6 +206,13 @@ pub fn worker_entrypoint( ) }, }; + + gum::trace!( + target: LOG_TARGET, + %worker_pid, + "worker: sending result to host: {:?}", + result + ); send_response(&mut stream, result)?; } }, @@ -273,7 +279,7 @@ fn handle_child_process( ) -> ! { gum::debug!( target: LOG_TARGET, - worker_job_pid = %std::process::id(), + worker_job_pid = %process::id(), "worker job: preparing artifact", ); @@ -283,8 +289,8 @@ fn handle_child_process( preparation_timeout.as_secs(), preparation_timeout.as_secs(), ) - .unwrap_or_else(|err| { - send_child_response(&pipe_write, Err(PrepareError::Panic(err.to_string()))) + .unwrap_or_else(|errno| { + send_child_response(&pipe_write, Err(error_from_errno("setrlimit", errno))) }); // Conditional variable to notify us when a thread is done. @@ -323,7 +329,7 @@ fn handle_child_process( WaitOutcome::Finished, ) .unwrap_or_else(|err| { - send_child_response(&pipe_write, Err(PrepareError::Panic(err.to_string()))) + send_child_response(&pipe_write, Err(PrepareError::IoErr(err.to_string()))) }); // There's only one thread that can trigger the condvar, so ignore the condvar outcome and @@ -367,6 +373,8 @@ fn handle_child_process( /// /// - `pipe_read`: A `PipeReader` used to read data from the child process. /// +/// - `child`: The child pid. +/// /// - `temp_artifact_dest`: The destination `PathBuf` to write the temporary artifact file. /// /// - `worker_pid`: The PID of the child process. @@ -391,16 +399,16 @@ fn handle_parent_process( usage_before: Usage, timeout: Duration, ) -> Result { - let mut received_data = Vec::new(); - // Read from the child. 
+ let mut received_data = Vec::new(); pipe_read .read_to_end(&mut received_data) // Swallow the error, it's not really helpful as to why the child died. .map_err(|_errno| PrepareError::JobDied)?; + let status = nix::sys::wait::waitpid(child, None); let usage_after = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) - .map_err(|errno| err_from_errno("getrusage after", errno))?; + .map_err(|errno| error_from_errno("getrusage after", errno))?; // Using `getrusage` is needed to check whether `setrlimit` was triggered. // As `getrusage` returns resource usage from all terminated child processes, @@ -412,7 +420,7 @@ fn handle_parent_process( return Err(PrepareError::TimedOut) } - return match status { + match status { Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { let result: Result = Result::decode(&mut received_data.as_slice()) @@ -446,7 +454,8 @@ fn handle_parent_process( }, } }, - Err(errno) => Err(err_from_errno("waitpid", errno)), + Err(errno) => Err(error_from_errno("waitpid", errno)), + // An attacker can make the child process return any exit status it wants. So we can treat // all unexpected cases the same way. Ok(unexpected_wait_status) => Err(PrepareError::IoErr(format!( @@ -488,6 +497,6 @@ fn send_child_response(mut pipe_write: &PipeWriter, response: Result PrepareError { +fn error_from_errno(context: &'static str, errno: Errno) -> PrepareError { PrepareError::Kernel(format!("{}: {}: {}", context, errno, io::Error::last_os_error())) } diff --git a/polkadot/node/core/pvf/src/error.rs b/polkadot/node/core/pvf/src/error.rs index 87ef0b54a040..5bfbf13d422d 100644 --- a/polkadot/node/core/pvf/src/error.rs +++ b/polkadot/node/core/pvf/src/error.rs @@ -35,25 +35,11 @@ pub enum InvalidCandidate { PrepareError(String), /// The failure is reported by the execution worker. The string contains the error message. WorkerReportedError(String), - /// The worker has died during validation of a candidate. 
That may fall in one of the following - /// categories, which we cannot distinguish programmatically: + /// The worker process (not the job) has died during validation of a candidate. /// - /// (a) Some sort of transient glitch caused the worker process to abort. An example would be - /// that the host machine ran out of free memory and the OOM killer started killing the - /// processes, and in order to save the parent it will "sacrifice child" first. - /// - /// (b) The candidate triggered a code path that has lead to the process death. For example, - /// the PVF found a way to consume unbounded amount of resources and then it either - /// exceeded an `rlimit` (if set) or, again, invited OOM killer. Another possibility is a - /// bug in wasmtime allowed the PVF to gain control over the execution worker. - /// - /// We attribute such an event to an *invalid candidate* in either case. - /// - /// The rationale for this is that a glitch may lead to unfair rejecting candidate by a single - /// validator. If the glitch is somewhat more persistent the validator will reject all - /// candidate thrown at it and hopefully the operator notices it by decreased reward - /// performance of the validator. On the other hand, if the worker died because of (b) we would - /// have better chances to stop the attack. + /// It's unlikely that this is caused by malicious code since workers now spawn separate job + /// processes, and those job processes are sandboxed. But, it is possible. We retry in this + /// case, and if the error persists, we assume it's caused by the candidate and vote against. AmbiguousWorkerDeath, /// PVF execution (compilation is not included) took more time than was allotted. HardTimeout, @@ -63,6 +49,23 @@ pub enum InvalidCandidate { /// then all validators would abstain, stalling finality on the chain. So we will first retry /// the candidate, and if the issue persists we are forced to vote invalid. 
Panic(String), + /// The job process (not the worker) has died for one of the following reasons: + /// + /// (a) A seccomp violation occurred, most likely due to an attempt by malicious code to + /// execute arbitrary code. Note that there is no foolproof way to detect this if the operator + /// has seccomp auditing disabled. + /// + /// (b) The host machine ran out of free memory and the OOM killer started killing the + /// processes, and in order to save the parent it will "sacrifice child" first. + /// + /// (c) Some other reason, perhaps transient or perhaps caused by malicious code. + /// + /// We cannot treat this as an internal error because malicious code may have caused this. + AmbiguousJobDeath, + /// The execute job returned an unexpected status. + /// + /// We cannot treat this as an internal error because malicious code may have caused this. + UnexpectedJobStatus(String), } impl From for ValidationError { diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index aca604f0de21..33f9528c9331 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -346,6 +346,7 @@ fn handle_job_finish( None, ), Outcome::InternalError { err } => (None, Err(ValidationError::InternalError(err)), None), + // Treated as definitely-invalid, because if we timed out, there's no time left for a retry. Outcome::HardTimeout => (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None), // "Maybe invalid" errors (will retry). 
@@ -356,6 +357,16 @@ fn handle_job_finish( ), Outcome::Panic { err } => (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::Panic(err))), None), + Outcome::JobDied => ( + None, + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath)), + None, + ), + Outcome::UnexpectedJobStatus { err } => ( + None, + Err(ValidationError::InvalidCandidate(InvalidCandidate::UnexpectedJobStatus(err))), + None, + ), }; queue.metrics.execute_finished(); diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index b282d0a51826..11c739ea30fc 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -88,12 +88,6 @@ pub enum Outcome { /// a trap. Errors related to the preparation process are not expected to be encountered by the /// execution workers. InvalidCandidate { err: String, idle_worker: IdleWorker }, - /// An internal error happened during the validation. Such an error is most likely related to - /// some transient glitch. - /// - /// Should only ever be used for errors independent of the candidate and PVF. Therefore it may - /// be a problem with the worker, so we terminate it. - InternalError { err: InternalValidationError }, /// The execution time exceeded the hard limit. The worker is terminated. HardTimeout, /// An I/O error happened during communication with the worker. This may mean that the worker @@ -101,6 +95,22 @@ pub enum Outcome { IoErr, /// An unexpected panic has occurred in the execution worker. Panic { err: String }, + /// The job process has died. We must kill the worker just in case. + /// + /// We cannot treat this as an internal error because malicious code may have caused this. + JobDied, + /// The execute job returned an unexpected status. We might be able to recover from this error + /// instead of killing the worker, but this should be very rare anyway. 
+ /// + /// We cannot treat this as an internal error because malicious code may have caused this. + UnexpectedJobStatus { err: String }, + + /// An internal error happened during the validation. Such an error is most likely related to + /// some transient glitch. + /// + /// Should only ever be used for errors independent of the candidate and PVF. Therefore it may + /// be a problem with the worker, so we terminate it. + InternalError { err: InternalValidationError }, } /// Given the idle token of a worker and parameters of work, communicates with the worker and @@ -232,6 +242,9 @@ pub async fn start_work( }, Response::TimedOut => Outcome::HardTimeout, Response::Panic(err) => Outcome::Panic { err }, + Response::JobDied => Outcome::JobDied, + Response::UnexpectedJobStatus(err) => Outcome::UnexpectedJobStatus{err}, + Response::InternalError(err) => Outcome::InternalError { err }, } }) From 0c04842485b68cd216c258029ca6a7f7cce704fd Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Fri, 3 Nov 2023 16:13:21 +0100 Subject: [PATCH 30/47] Fix some test failures --- .../node/core/candidate-validation/src/lib.rs | 2 +- .../node/core/pvf/execute-worker/src/lib.rs | 23 ++++++----- .../node/core/pvf/prepare-worker/src/lib.rs | 8 ++-- polkadot/node/core/pvf/tests/it/main.rs | 40 ++++++++++++++++++- 4 files changed, 58 insertions(+), 15 deletions(-) diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index b3729106c24c..8d2aaddc574f 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -758,7 +758,7 @@ trait ValidationBackend { Err(ValidationError::InvalidCandidate( WasmInvalidCandidate::AmbiguousWorkerDeath | WasmInvalidCandidate::AmbiguousJobDeath | - WasmInvalidCandidate::UnexpectedJobStatus, + WasmInvalidCandidate::UnexpectedJobStatus(_), )) if num_death_retries_left > 0 => num_death_retries_left -= 1, 
Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(_))) if num_panic_retries_left > 0 => diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 73e19434f979..e02ffae45bce 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -208,7 +208,7 @@ pub fn worker_entrypoint( // this drop is necessary to avoid deadlock drop(pipe_writer); - handle_parent_process(pipe_reader, child, usage_before, execution_timeout) + handle_parent_process(pipe_reader, child, usage_before, execution_timeout)? }, }; @@ -341,19 +341,19 @@ fn handle_parent_process( child: Pid, usage_before: Usage, timeout: Duration, -) -> Response { +) -> io::Result { // Read from the child. let mut received_data = Vec::new(); if let Err(_err) = pipe_read.read_to_end(&mut received_data) { // Swallow the error, it's not really helpful as to why the child died. - return Response::JobDied + return Ok(Response::JobDied) } let status = nix::sys::wait::waitpid(child, None); let usage_after = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { Ok(usage) => usage, - Err(errno) => return internal_error_from_errno("getrusage after", errno), + Err(errno) => return Ok(internal_error_from_errno("getrusage after", errno)), }; // Using `getrusage` is needed to check whether `setrlimit` was triggered. 
@@ -361,21 +361,23 @@ fn handle_parent_process( // it is necessary to subtract the usage before the current child process to isolate its cpu // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - if cpu_tv >= timeout { - return Response::TimedOut + return Ok(Response::TimedOut) } - match status { + let response = match status { Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { match Response::decode(&mut received_data.as_slice()) { Ok(Response::Ok { result_descriptor, duration: _ }) => Response::Ok { result_descriptor, duration: cpu_tv }, Ok(response) => response, - // This error happens when the job dies. - Err(_err) => Response::JobDied, + // There is either a bug or the job was hijacked. Should retry at any rate. + Err(err) => return Err(io::Error::new(io::ErrorKind::Other, err.to_string())), } }, + // The job gets SIGSYS on seccomp violations. We can also treat other termination signals as + // death. But also, receiving any signal is unexpected, so treat them all the same. + Ok(WaitStatus::Signaled(..)) => Response::JobDied, Err(errno) => internal_error_from_errno("waitpid", errno), // It is within an attacker's power to send an unexpected exit status. 
So we cannot treat @@ -383,7 +385,8 @@ fn handle_parent_process( Ok(unexpected_wait_status) => Response::UnexpectedJobStatus(format!( "unexpected status from wait: {unexpected_wait_status:?}" )), - } + }; + Ok(response) } /// Calculate the total CPU time from the given `usage` structure, returned from diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 0df610fccf2a..735732a840bf 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -415,7 +415,6 @@ fn handle_parent_process( // it is necessary to subtract the usage before the current child process to isolate its cpu // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - if cpu_tv >= timeout { return Err(PrepareError::TimedOut) } @@ -424,8 +423,8 @@ fn handle_parent_process( Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { let result: Result = Result::decode(&mut received_data.as_slice()) - // This error happens when the job dies. - .map_err(|_err| PrepareError::JobDied)?; + // There is either a bug or the job was hijacked. + .map_err(|err| PrepareError::IoErr(err.to_string()))?; match result { Err(err) => Err(err), Ok(response) => { @@ -454,6 +453,9 @@ fn handle_parent_process( }, } }, + // The job gets SIGSYS on seccomp violations. We can also treat other termination signals as + // death. But also, receiving any signal is unexpected, so treat them all the same. + Ok(WaitStatus::Signaled(..)) => Err(PrepareError::JobDied), Err(errno) => Err(error_from_errno("waitpid", errno)), // An attacker can make the child process return any exit status it wants. So we can treat diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 45ac22d8b3b5..d4b5cad910a4 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -299,7 +299,45 @@ rusty_fork_test! 
{ // What happens when the forked execute job dies in the middle of its job? #[test] fn forked_execute_job_killed_during_job() { - todo!() + polkadot_node_core_pvf_common::sp_tracing::try_init_simple(); + + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + // Prepare the artifact ahead of time. + let binary = halt::wasm_binary_unwrap(); + host.precheck_pvf(binary, Default::default()).await.unwrap(); + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.validate_candidate( + binary, + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ), + // Run a future that kills the job in the middle of the timeout. + async { + tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await; + kill_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, false); + } + ); + + // Note that we get a more specific error if the job died than if the whole worker died. 
+ assert_matches!( + result, + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath)) + ); + }) } } From 87e96d051ea58e744e5f154af0cea38ee83d9a56 Mon Sep 17 00:00:00 2001 From: Joao pedro Santos Date: Sat, 4 Nov 2023 09:37:52 -0300 Subject: [PATCH 31/47] add cpu monitor thread back to prepare and execute worker --- .../node/core/pvf/execute-worker/src/lib.rs | 235 ++++++++++-------- .../node/core/pvf/prepare-worker/src/lib.rs | 153 ++++++++---- 2 files changed, 235 insertions(+), 153 deletions(-) diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 92d02e8aa330..27542f8b66f2 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -16,7 +16,13 @@ //! Contains the logic for executing PVFs. Used by the polkadot-execute-worker binary. -use nix::{sys::{resource::{Resource, Usage, UsageWho}, wait::WaitStatus, signal::Signal}, unistd::ForkResult}; +use nix::{ + sys::{ + resource::{Resource, Usage, UsageWho}, + wait::WaitStatus, + }, + unistd::ForkResult, +}; use os_pipe::PipeWriter; pub use polkadot_node_core_pvf_common::{ executor_intf::execute_artifact, worker_dir, SecurityStatus, @@ -26,13 +32,14 @@ pub use polkadot_node_core_pvf_common::{ // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-execute-worker=trace`. 
const LOG_TARGET: &str = "parachain::pvf-execute-worker"; +use cpu_time::ProcessTime; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, execute::{Handshake, Response}, framed_recv_blocking, framed_send_blocking, worker::{ - stringify_panic_payload, + cpu_time_monitor_loop, stringify_panic_payload, thread::{self, WaitOutcome}, worker_event_loop, WorkerKind, }, @@ -40,11 +47,12 @@ use polkadot_node_core_pvf_common::{ use polkadot_parachain_primitives::primitives::ValidationResult; use polkadot_primitives::{executor_params::DEFAULT_NATIVE_STACK_MAX, ExecutorParams}; use std::{ - io::{self, Write, Read}, + io::{self, Read, Write}, os::unix::net::UnixStream, path::PathBuf, - sync::Arc, - time::Duration, process, + process, + sync::{mpsc::channel, Arc}, + time::Duration, }; // Wasmtime powers the Substrate Executor. It compiles the wasm bytecode into native code. @@ -166,42 +174,37 @@ pub fn worker_entrypoint( }, }; - let (pipe_reader, pipe_writer) = os_pipe::pipe()?; let usage_before = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN)?; - // SAFETY: new process is spawned within a single threaded process - let response = match unsafe { nix::unistd::fork() } { - Err(_errno) => Response::Panic(String::from("error forking")), - Ok(ForkResult::Child) => { - // Dropping the stream closes the underlying socket. We want to make sure - // that the sandboxed child can't get any kind of information from the - // outside world. The only IPC it should be able to do is sending its - // response over the pipe. 
- drop(stream); - - handle_child_process( - pipe_writer, - compiled_artifact_blob, - executor_params, - params, - execution_timeout, - ) - }, - // parent - Ok(ForkResult::Parent { child: _child }) => { - // the read end will wait until all write ends have been closed, - // this drop is necessary to avoid deadlock - drop(pipe_writer); - - handle_parent_process( - pipe_reader, - usage_before, - execution_timeout, - ) - }, - }; + // SAFETY: new process is spawned within a single threaded process + let response = match unsafe { nix::unistd::fork() } { + Err(_errno) => Response::Panic(String::from("error forking")), + Ok(ForkResult::Child) => { + // Dropping the stream closes the underlying socket. We want to make sure + // that the sandboxed child can't get any kind of information from the + // outside world. The only IPC it should be able to do is sending its + // response over the pipe. + drop(stream); + + handle_child_process( + pipe_writer, + compiled_artifact_blob, + executor_params, + params, + execution_timeout, + ) + }, + // parent + Ok(ForkResult::Parent { child: _child }) => { + // the read end will wait until all write ends have been closed, + // this drop is necessary to avoid deadlock + drop(pipe_writer); + + handle_parent_process(pipe_reader, usage_before, execution_timeout) + }, + }; gum::trace!( target: LOG_TARGET, @@ -240,8 +243,6 @@ fn validate_using_artifact( Response::Ok { result_descriptor, duration: Duration::from_secs(0) } } - - /// This is used to handle child process during pvf execute worker. /// It execute the artifact and pipes back the response to the parent process /// @@ -265,7 +266,7 @@ fn handle_child_process( compiled_artifact_blob: Vec, executor_params: ExecutorParams, params: Vec, - execution_timeout: Duration + execution_timeout: Duration, ) -> ! 
{ gum::debug!( target: LOG_TARGET, @@ -283,34 +284,54 @@ fn handle_child_process( send_child_response(&pipe_write, Response::Panic(err.to_string())); }); - // Conditional variable to notify us when a thread is done. - let condvar = thread::get_condvar(); - - let executor_params_2 = executor_params.clone(); - let execute_thread = thread::spawn_worker_thread_with_stack_size( - "execute thread", - move || { - validate_using_artifact( - &compiled_artifact_blob, - &executor_params_2, - ¶ms, - ) - }, - Arc::clone(&condvar), - WaitOutcome::Finished, - EXECUTE_THREAD_STACK_SIZE, - ) - .unwrap_or_else(|err| { - send_child_response(&pipe_write, Response::Panic(err.to_string())) - }); + // Conditional variable to notify us when a thread is done. + let condvar = thread::get_condvar(); + let cpu_time_start = ProcessTime::now(); + + // Spawn a new thread that runs the CPU time monitor. + let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>(); + let cpu_time_monitor_thread = thread::spawn_worker_thread( + "cpu time monitor thread", + move || cpu_time_monitor_loop(cpu_time_start, execution_timeout, cpu_time_monitor_rx), + Arc::clone(&condvar), + WaitOutcome::TimedOut, + ) + .unwrap_or_else(|err| send_child_response(&pipe_write, Response::Panic(err.to_string()))); + + let executor_params_2 = executor_params.clone(); + let execute_thread = thread::spawn_worker_thread_with_stack_size( + "execute thread", + move || validate_using_artifact(&compiled_artifact_blob, &executor_params_2, ¶ms), + Arc::clone(&condvar), + WaitOutcome::Finished, + EXECUTE_THREAD_STACK_SIZE, + ) + .unwrap_or_else(|err| send_child_response(&pipe_write, Response::Panic(err.to_string()))); + + let outcome = thread::wait_for_threads(condvar); - // There's only one thread that can trigger the condvar, so ignore the condvar outcome and - // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. 
- let response = execute_thread - .join() - .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))); + let response = match outcome { + WaitOutcome::Finished => { + let _ = cpu_time_monitor_tx.send(()); + execute_thread + .join() + .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))) + }, + // If the CPU thread is not selected, we signal it to end, the join handle is + // dropped and the thread will finish in the background. + WaitOutcome::TimedOut => match cpu_time_monitor_thread.join() { + Ok(Some(cpu_time_elapsed)) => Response::TimedOut, + Ok(None) => Response::InternalError(InternalValidationError::CpuTimeMonitorThread( + "error communicating over finished channel".into(), + )), + Err(e) => Response::InternalError(InternalValidationError::CpuTimeMonitorThread( + stringify_panic_payload(e), + )), + }, + WaitOutcome::Pending => + unreachable!("we run wait_while until the outcome is no longer pending; qed"), + }; - send_child_response(&pipe_write, response); } @@ -336,25 +357,21 @@ fn handle_parent_process( usage_before: Usage, timeout: Duration, ) -> Response { + let worker_pid = std::process::id(); let mut received_data = Vec::new(); // Read from the child. - if let Err(err) = pipe_read - .read_to_end(&mut received_data) { + if let Err(err) = pipe_read.read_to_end(&mut received_data) { return Response::Panic(err.to_string()) } - - let status = nix::sys::wait::wait(); + + let status = nix::sys::wait::wait(); let usage_after: Usage; - + match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { - Ok(usage) => { - usage_after = usage - }, - Err(err) => { - return Response::Panic(err.to_string()) - } + Ok(usage) => usage_after = usage, + Err(err) => return Response::Panic(err.to_string()), }; // Using `getrusage` is needed to check whether `setrlimit` was triggered. 
@@ -363,37 +380,57 @@ fn handle_parent_process( // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - if cpu_tv.as_secs() >= timeout.as_secs() { - return Response::TimedOut - } - match status { Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { match Response::decode(&mut received_data.as_slice()) { - Ok(Response::Ok { result_descriptor, duration: _ }) => Response::Ok { result_descriptor, duration: cpu_tv }, + Ok(Response::Ok { result_descriptor, duration: _ }) => { + if cpu_tv.as_secs() >= timeout.as_secs() { + // Log if we exceed the timeout and the other thread hasn't + // finished. + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "execute job took {}ms cpu time, exceeded execute timeout {}ms", + cpu_tv.as_millis(), + timeout.as_millis(), + ); + return Response::TimedOut + } + Response::Ok { result_descriptor, duration: cpu_tv } + }, + Ok(Response::TimedOut) => { + // Log if we exceed the timeout and the other thread hasn't + // finished. + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "execute job took {}ms cpu time, exceeded execute timeout {}ms", + cpu_tv.as_millis(), + timeout.as_millis(), + ); + return Response::TimedOut + }, Ok(response) => response, - Err(err) => Response::Panic(err.to_string()) + Err(err) => Response::Panic(err.to_string()), } }, - Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => { - Response::Panic("child exited with failure".to_string()) - }, - Ok(WaitStatus::Exited(_, exit_status)) => { - Response::Panic(format!("child exited with unexpected status {}", exit_status)) - }, - Ok(WaitStatus::Signaled(_, sig, _)) => { - Response::Panic(format!("child ended with unexpected signal {:?}, timeout {} cpu_tv {} after {} before {}", sig, timeout.as_secs(), cpu_tv.as_micros(), - usage_after.user_time().tv_sec() + usage_after.system_time().tv_sec(), get_total_cpu_usage(usage_before).as_micros())) - } - Ok(_) => { - Response::Panic("child ended unexpectedly".to_string()) - } - Err(err) => 
Response::Panic(err.to_string()) + Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => + Response::Panic("child exited with failure".to_string()), + Ok(WaitStatus::Exited(_, exit_status)) => + Response::Panic(format!("child exited with unexpected status {}", exit_status)), + Ok(WaitStatus::Signaled(_, sig, _)) => Response::Panic(format!( + "child ended with unexpected signal {:?}, timeout {} cpu_tv {} after {} before {}", + sig, + timeout.as_secs(), + cpu_tv.as_micros(), + usage_after.user_time().tv_sec() + usage_after.system_time().tv_sec(), + get_total_cpu_usage(usage_before).as_micros() + )), + Ok(_) => Response::Panic("child ended unexpectedly".to_string()), + Err(err) => Response::Panic(err.to_string()), } } - - /// Calculate the total CPU time from the given `usage` structure, returned from /// [`nix::sys::resource::getrusage`], and calculates the total CPU time spent, including both user /// and system time. diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 56df1f8a4eb7..2bf3c7ed8bbe 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -32,7 +32,10 @@ use crate::memory_stats::memory_tracker::{get_memory_tracker_loop_stats, memory_ use libc; use nix::{ errno::Errno, - sys::{resource::{Resource, Usage, UsageWho}, wait::WaitStatus}, + sys::{ + resource::{Resource, Usage, UsageWho}, + wait::WaitStatus, + }, unistd::{ForkResult, Pid}, }; use os_pipe::{self, PipeWriter}; @@ -44,11 +47,11 @@ use polkadot_node_core_pvf_common::{ prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ - stringify_panic_payload, + cpu_time_monitor_loop, stringify_panic_payload, thread::{self, spawn_worker_thread, WaitOutcome}, worker_event_loop, WorkerKind, }, - worker_dir, SecurityStatus, + worker_dir, ProcessTime, SecurityStatus, }; use polkadot_primitives::ExecutorParams; use std::{ @@ -57,9 +60,10 @@ use std::{ 
os::unix::net::UnixStream, path::PathBuf, process, - sync::Arc, + sync::{mpsc::channel, Arc}, time::Duration, }; +use std::fs::File; /// Contains the bytes for a successfully compiled artifact. #[derive(Encode, Decode)] @@ -274,16 +278,6 @@ fn handle_child_process( "worker job: preparing artifact", ); - // Set a hard CPU time limit for the child process. - nix::sys::resource::setrlimit( - Resource::RLIMIT_CPU, - preparation_timeout.as_secs(), - preparation_timeout.as_secs(), - ) - .unwrap_or_else(|err| { - send_child_response(&pipe_write, Err(PrepareError::Panic(err.to_string()))) - }); - // Conditional variable to notify us when a thread is done. let condvar = thread::get_condvar(); @@ -293,6 +287,20 @@ fn handle_child_process( #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] let memory_tracker_thread = std::thread::spawn(|| memory_tracker_loop(condvar_memory)); + let cpu_time_start = ProcessTime::now(); + + // Spawn a new thread that runs the CPU time monitor. + let (cpu_time_monitor_tx, cpu_time_monitor_rx) = channel::<()>(); + let cpu_time_monitor_thread = thread::spawn_worker_thread( + "cpu time monitor thread", + move || cpu_time_monitor_loop(cpu_time_start, preparation_timeout, cpu_time_monitor_rx), + Arc::clone(&condvar), + WaitOutcome::TimedOut, + ) + .unwrap_or_else(|err| { + send_child_response(&pipe_write, Err(PrepareError::Panic(err.to_string()))) + }); + let prepare_thread = spawn_worker_thread( "prepare worker", move || { @@ -323,39 +331,62 @@ fn handle_child_process( send_child_response(&pipe_write, Err(PrepareError::Panic(err.to_string()))) }); - // There's only one thread that can trigger the condvar, so ignore the condvar outcome and - // simply join. We don't have to be concerned with timeouts, setrlimit will kill the process. 
- let result = prepare_thread.join().unwrap_or_else(|err| { - send_child_response(&pipe_write, Err(PrepareError::Panic(stringify_panic_payload(err)))) - }); + let outcome = thread::wait_for_threads(condvar); - let response: Result = match result { - Ok(ok) => { - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - let (artifact, max_rss) = ok; - } else { - let artifact = ok; - } - } + let result = match outcome { + WaitOutcome::Finished => { + let _ = cpu_time_monitor_tx.send(()); - // Stop the memory stats worker and get its observed memory stats. - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, process::id()); + match prepare_thread.join().unwrap_or_else(|err| { + send_child_response( + &pipe_write, + Err(PrepareError::Panic(stringify_panic_payload(err))), + ) + }) { + Err(err) => { + Err(err) + }, + Ok(ok) => { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + let (artifact, max_rss) = ok; + } else { + let artifact = ok; + } + } + + // Stop the memory stats worker and get its observed memory stats. + #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, process::id()); + + let memory_stats = MemoryStats { + #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] + memory_tracker_stats, + #[cfg(target_os = "linux")] + max_rss: extract_max_rss_stat(max_rss, process::id()), + }; - let memory_stats = MemoryStats { - #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - memory_tracker_stats, - #[cfg(target_os = "linux")] - max_rss: extract_max_rss_stat(max_rss, process::id()), - }; + Ok(Response { artifact, memory_stats }) + }, + } + }, - Ok(Response { artifact, memory_stats }) + // If the CPU thread is not selected, we signal it to end, the join handle is + // dropped and the thread will finish in the background. 
+ WaitOutcome::TimedOut => { + match cpu_time_monitor_thread.join() { + Ok(Some(cpu_time_elapsed)) => Err(PrepareError::TimedOut), + Ok(None) => + Err(PrepareError::IoErr("error communicating over closed channel".into())), + // Errors in this thread are independent of the PVF. + Err(err) => Err(PrepareError::IoErr(stringify_panic_payload(err))), + } }, - Err(err) => Err(err), + WaitOutcome::Pending => + unreachable!("we run wait_while until the outcome is no longer pending; qed"), }; - send_child_response(&pipe_write, response); + send_child_response(&pipe_write, result); } /// Waits for child process to finish and handle child response from pipe. @@ -405,10 +436,6 @@ fn handle_parent_process( // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); - if cpu_tv.as_secs() >= timeout.as_secs() { - return Err(PrepareError::TimedOut) - } - return match status { Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { let result: Result = @@ -416,8 +443,30 @@ fn handle_parent_process( // This error happens when the job dies. .map_err(|_err| PrepareError::JobDied)?; match result { + Err(PrepareError::TimedOut) => { + // Log if we exceed the timeout and the other thread hasn't + // finished. + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "prepare job took {}ms cpu time, exceeded prepare timeout {}ms", + cpu_tv.as_millis(), + timeout.as_millis(), + ); + Err(PrepareError::TimedOut) + }, Err(err) => Err(err), Ok(response) => { + if cpu_tv >= timeout { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "prepare job took {}ms cpu time, exceeded prepare timeout {}ms", + cpu_tv.as_millis(), + timeout.as_millis(), + ); + return Err(PrepareError::TimedOut); + } // Write the serialized artifact into a temp file. 
// // PVF host only keeps artifacts statuses in its memory, @@ -443,16 +492,12 @@ fn handle_parent_process( }, } }, - Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => { - Err(PrepareError::Panic("child exited with failure".to_string())) - }, - Ok(WaitStatus::Exited(_, exit_status)) => { - Err(PrepareError::Panic(format!("child exited with unexpected status {}", exit_status))) - }, - Ok(_) => { - Err(PrepareError::Panic("child ended unexpectedly".to_string())) - } - Err(err) => Err(PrepareError::Panic(err.to_string())) + Ok(WaitStatus::Exited(_, libc::EXIT_FAILURE)) => + Err(PrepareError::Panic("child exited with failure".to_string())), + Ok(WaitStatus::Exited(_, exit_status)) => + Err(PrepareError::Panic(format!("child exited with unexpected status {}", exit_status))), + Ok(_) => Err(PrepareError::Panic("child ended unexpectedly".to_string())), + Err(err) => Err(PrepareError::Panic(err.to_string())), } } From 6bd7d20b0b40c73b5e66756428a11232b3762c0c Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Sun, 5 Nov 2023 12:45:56 +0100 Subject: [PATCH 32/47] cargo fmt --- .../node/core/pvf/execute-worker/src/lib.rs | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 58b1adb83241..d3baaf1de2f2 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -24,6 +24,7 @@ pub use polkadot_node_core_pvf_common::{ // separate spawned processes. Run with e.g. `RUST_LOG=parachain::pvf-execute-worker=trace`. 
const LOG_TARGET: &str = "parachain::pvf-execute-worker"; +use cpu_time::ProcessTime; use nix::{ errno::Errno, sys::{ @@ -33,7 +34,6 @@ use nix::{ unistd::{ForkResult, Pid}, }; use os_pipe::PipeWriter; -use cpu_time::ProcessTime; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, @@ -188,7 +188,7 @@ pub fn worker_entrypoint( // SAFETY: new process is spawned within a single threaded process let response = match unsafe { nix::unistd::fork() } { - Err(errno) => internal_error_from_errno("fork", errno), + Err(errno) => internal_error_from_errno("fork", errno), Ok(ForkResult::Child) => { // Dropping the stream closes the underlying socket. We want to make sure // that the sandboxed child can't get any kind of information from the @@ -372,7 +372,7 @@ fn handle_parent_process( usage_before: Usage, timeout: Duration, ) -> io::Result { - let worker_pid = std::process::id(); + let worker_pid = std::process::id(); // Read from the child. let mut received_data = Vec::new(); @@ -388,8 +388,8 @@ fn handle_parent_process( Err(errno) => return Ok(internal_error_from_errno("getrusage after", errno)), }; - // Using `getrusage` is needed to check whether child has timedout since we cannot rely on child. - // to report its own time. + // Using `getrusage` is needed to check whether child has timedout since we cannot rely on + // child. to report its own time. // As `getrusage` returns resource usage from all terminated child processes, // it is necessary to subtract the usage before the current child process to isolate its cpu // time @@ -425,21 +425,21 @@ fn handle_parent_process( ); return Ok(Response::TimedOut) }, - Ok(response) => Ok(response), - // There is either a bug or the job was hijacked. Should retry at any rate. - Err(err) => return Err(io::Error::new(io::ErrorKind::Other, err.to_string())), + Ok(response) => Ok(response), + // There is either a bug or the job was hijacked. Should retry at any rate. 
+ Err(err) => return Err(io::Error::new(io::ErrorKind::Other, err.to_string())), } }, - // The job gets SIGSYS on seccomp violations. We can also treat other termination signals as - // death. But also, receiving any signal is unexpected, so treat them all the same. - Ok(WaitStatus::Signaled(..)) => Ok(Response::JobDied), - Err(errno) => Ok(internal_error_from_errno("waitpid", errno)), - - // It is within an attacker's power to send an unexpected exit status. So we cannot treat - // this as an internal error (which would make us abstain), but must vote against. - Ok(unexpected_wait_status) => Ok(Response::UnexpectedJobStatus(format!( - "unexpected status from wait: {unexpected_wait_status:?}" - ))), + // The job gets SIGSYS on seccomp violations. We can also treat other termination signals as + // death. But also, receiving any signal is unexpected, so treat them all the same. + Ok(WaitStatus::Signaled(..)) => Ok(Response::JobDied), + Err(errno) => Ok(internal_error_from_errno("waitpid", errno)), + + // It is within an attacker's power to send an unexpected exit status. So we cannot treat + // this as an internal error (which would make us abstain), but must vote against. + Ok(unexpected_wait_status) => Ok(Response::UnexpectedJobStatus(format!( + "unexpected status from wait: {unexpected_wait_status:?}" + ))), } } From 1fb8c13c0e91e3953cd77ff81c7eade10eb4dde9 Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Sun, 5 Nov 2023 18:48:11 +0100 Subject: [PATCH 33/47] Clean up error handling a bit --- .../node/core/candidate-validation/src/lib.rs | 13 ++++----- polkadot/node/core/pvf/common/src/error.rs | 6 ++--- polkadot/node/core/pvf/common/src/execute.rs | 6 +---- .../node/core/pvf/execute-worker/src/lib.rs | 27 +++++++++++-------- .../node/core/pvf/prepare-worker/src/lib.rs | 16 ++++++----- polkadot/node/core/pvf/src/error.rs | 6 +---- polkadot/node/core/pvf/src/execute/queue.rs | 9 ++----- .../node/core/pvf/src/execute/worker_intf.rs | 10 ++----- polkadot/node/core/pvf/src/prepare/pool.rs | 4 +-- .../node/core/pvf/src/prepare/worker_intf.rs | 4 +-- polkadot/node/core/pvf/tests/it/main.rs | 2 +- 11 files changed, 44 insertions(+), 59 deletions(-) diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 8d2aaddc574f..781f1a19463c 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -648,12 +648,10 @@ async fn validate_candidate_exhaustive( Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(err))) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath)) => - Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError( - "ambiguous job death".to_string(), - ))), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::UnexpectedJobStatus(err))) => - Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath(err))) => + Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!( + "ambiguous job death: {err}" + )))), Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::PrepareError(e))) => { // In principle if preparation of the `WASM` fails, the current candidate can not be the // reason for that. 
So we can't say whether it is invalid or not. In addition, with @@ -757,8 +755,7 @@ trait ValidationBackend { match validation_result { Err(ValidationError::InvalidCandidate( WasmInvalidCandidate::AmbiguousWorkerDeath | - WasmInvalidCandidate::AmbiguousJobDeath | - WasmInvalidCandidate::UnexpectedJobStatus(_), + WasmInvalidCandidate::AmbiguousJobDeath(_), )) if num_death_retries_left > 0 => num_death_retries_left -= 1, Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(_))) if num_panic_retries_left > 0 => diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 2374b6577ccc..24376c3a0cbf 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -69,7 +69,7 @@ pub enum PrepareError { #[codec(index = 9)] ClearWorkerDir(String), /// The preparation job process died, due to OOM, a seccomp violation, or some other factor. - JobDied, + JobDied(String), #[codec(index = 10)] /// Some error occurred when interfacing with the kernel. #[codec(index = 11)] @@ -91,7 +91,7 @@ impl PrepareError { match self { Prevalidation(_) | Preparation(_) | Panic(_) | OutOfMemory => true, IoErr(_) | - JobDied | + JobDied(_) | CreateTmpFile(_) | RenameTmpFile { .. 
} | ClearWorkerDir(_) | @@ -114,7 +114,7 @@ impl fmt::Display for PrepareError { Panic(err) => write!(f, "panic: {}", err), TimedOut => write!(f, "prepare: timeout"), IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err), - JobDied => write!(f, "prepare: prepare job died"), + JobDied(err) => write!(f, "prepare: prepare job died: {}", err), CreateTmpFile(err) => write!(f, "prepare: error creating tmp file: {}", err), RenameTmpFile { err, src, dest } => write!(f, "prepare: error renaming tmp file ({:?} -> {:?}): {}", src, dest, err), diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index 887352def07a..9f3799bc842e 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -47,11 +47,7 @@ pub enum Response { /// The job process has died. We must kill the worker just in case. /// /// We cannot treat this as an internal error because malicious code may have caused this. - JobDied, - /// The execute job returned an unexpected status. - /// - /// We cannot treat this as an internal error because malicious code may have caused this. - UnexpectedJobStatus(String), + JobDied(String), /// Some internal error occurred. InternalError(InternalValidationError), diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index d3baaf1de2f2..2107108bee99 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -209,7 +209,13 @@ pub fn worker_entrypoint( // this drop is necessary to avoid deadlock drop(pipe_writer); - handle_parent_process(pipe_reader, child, usage_before, execution_timeout)? + handle_parent_process( + pipe_reader, + child, + worker_pid, + usage_before, + execution_timeout, + )? 
}, }; @@ -369,17 +375,13 @@ fn handle_child_process( fn handle_parent_process( mut pipe_read: os_pipe::PipeReader, child: Pid, + worker_pid: u32, usage_before: Usage, timeout: Duration, ) -> io::Result { - let worker_pid = std::process::id(); - // Read from the child. let mut received_data = Vec::new(); - if let Err(_err) = pipe_read.read_to_end(&mut received_data) { - // Swallow the error, it's not really helpful as to why the child died. - return Ok(Response::JobDied) - } + pipe_read.read_to_end(&mut received_data)?; let status = nix::sys::wait::waitpid(child, None); @@ -430,14 +432,17 @@ fn handle_parent_process( Err(err) => return Err(io::Error::new(io::ErrorKind::Other, err.to_string())), } }, - // The job gets SIGSYS on seccomp violations. We can also treat other termination signals as - // death. But also, receiving any signal is unexpected, so treat them all the same. - Ok(WaitStatus::Signaled(..)) => Ok(Response::JobDied), + // The job was killed by the given signal. + // + // The job gets SIGSYS on seccomp violations, but this signal may have been sent for some + // other reason, so we still need to check for seccomp violations elsewhere. + Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => + Ok(Response::JobDied(format!("received signal: {signal:?}"))), Err(errno) => Ok(internal_error_from_errno("waitpid", errno)), // It is within an attacker's power to send an unexpected exit status. So we cannot treat // this as an internal error (which would make us abstain), but must vote against. 
- Ok(unexpected_wait_status) => Ok(Response::UnexpectedJobStatus(format!( + Ok(unexpected_wait_status) => Ok(Response::JobDied(format!( "unexpected status from wait: {unexpected_wait_status:?}" ))), } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index f6c31217ed1f..e260c8c4cd0e 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -509,8 +509,7 @@ fn handle_parent_process( let mut received_data = Vec::new(); pipe_read .read_to_end(&mut received_data) - // Swallow the error, it's not really helpful as to why the child died. - .map_err(|_errno| PrepareError::JobDied)?; + .map_err(|err| PrepareError::IoErr(err.to_string()))?; let status = nix::sys::wait::waitpid(child, None); let usage_after = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) @@ -524,7 +523,7 @@ fn handle_parent_process( let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); return match status { - Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { + Ok(WaitStatus::Exited(_pid, libc::EXIT_SUCCESS)) => { let result: Result = Result::decode(&mut received_data.as_slice()) // There is either a bug or the job was hijacked. @@ -579,14 +578,17 @@ fn handle_parent_process( }, } }, - // The job gets SIGSYS on seccomp violations. We can also treat other termination signals as - // death. But also, receiving any signal is unexpected, so treat them all the same. - Ok(WaitStatus::Signaled(..)) => Err(PrepareError::JobDied), + // The job was killed by the given signal. + // + // The job gets SIGSYS on seccomp violations, but this signal may have been sent for some + // other reason, so we still need to check for seccomp violations elsewhere. 
+ Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => + Err(PrepareError::JobDied(format!("received signal: {signal:?}"))), Err(errno) => Err(error_from_errno("waitpid", errno)), // An attacker can make the child process return any exit status it wants. So we can treat // all unexpected cases the same way. - Ok(unexpected_wait_status) => Err(PrepareError::IoErr(format!( + Ok(unexpected_wait_status) => Err(PrepareError::JobDied(format!( "unexpected status from wait: {unexpected_wait_status:?}" ))), } diff --git a/polkadot/node/core/pvf/src/error.rs b/polkadot/node/core/pvf/src/error.rs index 5bfbf13d422d..12ed22d015ee 100644 --- a/polkadot/node/core/pvf/src/error.rs +++ b/polkadot/node/core/pvf/src/error.rs @@ -61,11 +61,7 @@ pub enum InvalidCandidate { /// (c) Some other reason, perhaps transient or perhaps caused by malicious code. /// /// We cannot treat this as an internal error because malicious code may have caused this. - AmbiguousJobDeath, - /// The execute job returned an unexpected status. - /// - /// We cannot treat this as an internal error because malicious code may have caused this. 
- UnexpectedJobStatus(String), + AmbiguousJobDeath(String), } impl From for ValidationError { diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index 33f9528c9331..5c713f84d796 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -357,14 +357,9 @@ fn handle_job_finish( ), Outcome::Panic { err } => (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::Panic(err))), None), - Outcome::JobDied => ( + Outcome::JobDied { err } => ( None, - Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath)), - None, - ), - Outcome::UnexpectedJobStatus { err } => ( - None, - Err(ValidationError::InvalidCandidate(InvalidCandidate::UnexpectedJobStatus(err))), + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))), None, ), }; diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 11c739ea30fc..666a4cf38c4d 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -98,12 +98,7 @@ pub enum Outcome { /// The job process has died. We must kill the worker just in case. /// /// We cannot treat this as an internal error because malicious code may have caused this. - JobDied, - /// The execute job returned an unexpected status. We might be able to recover from this error - /// instead of killing the worker, but this should be very rare anyway. - /// - /// We cannot treat this as an internal error because malicious code may have caused this. - UnexpectedJobStatus { err: String }, + JobDied { err: String }, /// An internal error happened during the validation. Such an error is most likely related to /// some transient glitch. 
@@ -242,8 +237,7 @@ pub async fn start_work( }, Response::TimedOut => Outcome::HardTimeout, Response::Panic(err) => Outcome::Panic { err }, - Response::JobDied => Outcome::JobDied, - Response::UnexpectedJobStatus(err) => Outcome::UnexpectedJobStatus{err}, + Response::JobDied(err) => Outcome::JobDied { err }, Response::InternalError(err) => Outcome::InternalError { err }, } diff --git a/polkadot/node/core/pvf/src/prepare/pool.rs b/polkadot/node/core/pvf/src/prepare/pool.rs index e65b9a2416fd..a4a44eb50951 100644 --- a/polkadot/node/core/pvf/src/prepare/pool.rs +++ b/polkadot/node/core/pvf/src/prepare/pool.rs @@ -388,14 +388,14 @@ fn handle_mux( Ok(()) }, // The worker might still be usable, but we kill it just in case. - Outcome::JobDied => { + Outcome::JobDied(err) => { if attempt_retire(metrics, spawned, worker) { reply( from_pool, FromPool::Concluded { worker, rip: true, - result: Err(PrepareError::JobDied), + result: Err(PrepareError::JobDied(err)), }, )?; } diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 5d4f142f273b..fc83ad3fb5ff 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -103,7 +103,7 @@ pub enum Outcome { /// The preparation job process died, due to OOM, a seccomp violation, or some other factor. /// /// The worker might still be usable, but we kill it just in case. - JobDied, + JobDied(String), } /// Given the idle token of a worker and parameters of work, communicates with the worker and @@ -225,7 +225,7 @@ async fn handle_response( Ok(result) => result, // Timed out on the child. This should already be logged by the child. 
Err(PrepareError::TimedOut) => return Outcome::TimedOut, - Err(PrepareError::JobDied) => return Outcome::JobDied, + Err(PrepareError::JobDied(err)) => return Outcome::JobDied(err), Err(PrepareError::OutOfMemory) => return Outcome::OutOfMemory, Err(_) => return Outcome::Concluded { worker, result }, }; diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 7b657e5e3529..b1b20a6094af 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -333,7 +333,7 @@ rusty_fork_test! { // Note that we get a more specific error if the job died than if the whole worker died. assert_matches!( result, - Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath)) + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))) if err == "asdf" ); }) } From 9c666b43e3ced0475e28944e414ec0eb4548b97b Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Sun, 5 Nov 2023 19:05:18 +0100 Subject: [PATCH 34/47] Fix some tests --- polkadot/node/core/pvf/execute-worker/src/lib.rs | 12 +----------- polkadot/node/core/pvf/tests/it/main.rs | 8 ++++++-- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 2107108bee99..6bfbfb73c71c 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -28,7 +28,7 @@ use cpu_time::ProcessTime; use nix::{ errno::Errno, sys::{ - resource::{Resource, Usage, UsageWho}, + resource::{Usage, UsageWho}, wait::WaitStatus, }, unistd::{ForkResult, Pid}, @@ -287,16 +287,6 @@ fn handle_child_process( "worker job: executing artifact", ); - // Set a hard CPU time limit for the child process. 
- nix::sys::resource::setrlimit( - Resource::RLIMIT_CPU, - execution_timeout.as_secs(), - execution_timeout.as_secs(), - ) - .unwrap_or_else(|errno| { - send_child_response(&pipe_write, internal_error_from_errno("setrlimit", errno)); - }); - // Conditional variable to notify us when a thread is done. let condvar = thread::get_condvar(); let cpu_time_start = ProcessTime::now(); diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index b1b20a6094af..58b47dffe5a9 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -290,7 +290,10 @@ rusty_fork_test! { ); // Note that we get a more specific error if the job died than if the whole worker died. - assert_matches!(result, Err(PrepareError::JobDied)); + assert_matches!( + result, + Err(PrepareError::JobDied(err)) if err == "received signal: SIGKILL" + ); }) } @@ -333,7 +336,8 @@ rusty_fork_test! { // Note that we get a more specific error if the job died than if the whole worker died. assert_matches!( result, - Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))) if err == "asdf" + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))) + if err == "received signal: SIGKILL" ); }) } From 8c7c5199029b9c77408b5752a5b46b9b0eb70e49 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Mon, 6 Nov 2023 19:17:37 +0100 Subject: [PATCH 35/47] Make error reporting more robust and secure - Created a `JobResponse` and `JobError` types for the job to self-report its status. Note that these reports must be treated as untrusted due to the job running untrusted code. Previously some errors from the job were being treated as internal! That's why it's important that the error handling here is robust and accurate. - Made `WorkerResponse::JobError(String)` hold any unexpected error from the job process. - Renamed some of the response types and variants to be more accurate. 
--- .../node/core/candidate-validation/src/lib.rs | 14 +-- .../core/candidate-validation/src/tests.rs | 10 +- polkadot/node/core/pvf/common/Cargo.toml | 2 +- polkadot/node/core/pvf/common/src/error.rs | 12 -- polkadot/node/core/pvf/common/src/execute.rs | 46 +++++-- .../node/core/pvf/execute-worker/src/lib.rs | 113 ++++++++---------- .../node/core/pvf/prepare-worker/src/lib.rs | 12 +- polkadot/node/core/pvf/src/error.rs | 20 ++-- polkadot/node/core/pvf/src/execute/queue.rs | 11 +- .../node/core/pvf/src/execute/worker_intf.rs | 36 +++--- 10 files changed, 146 insertions(+), 130 deletions(-) diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 781f1a19463c..c962c9fa24f6 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -639,13 +639,13 @@ async fn validate_candidate_exhaustive( }, Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout)) => Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::WorkerReportedError(e))) => + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::WorkerReportedInvalid(e))) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))), Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError( "ambiguous worker death".to_string(), ))), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(err))) => + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError(err))) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath(err))) => @@ -743,9 +743,9 @@ trait ValidationBackend { }; // Allow limited retries for each kind of error. 
- let mut num_internal_retries_left = 1; let mut num_death_retries_left = 1; - let mut num_panic_retries_left = 1; + let mut num_job_error_retries_left = 1; + let mut num_internal_retries_left = 1; loop { // Stop retrying if we exceeded the timeout. if total_time_start.elapsed() + retry_delay > exec_timeout { @@ -757,9 +757,9 @@ trait ValidationBackend { WasmInvalidCandidate::AmbiguousWorkerDeath | WasmInvalidCandidate::AmbiguousJobDeath(_), )) if num_death_retries_left > 0 => num_death_retries_left -= 1, - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic(_))) - if num_panic_retries_left > 0 => - num_panic_retries_left -= 1, + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError(_))) + if num_job_error_retries_left > 0 => + num_job_error_retries_left -= 1, Err(ValidationError::InternalError(_)) if num_internal_retries_left > 0 => num_internal_retries_left -= 1, _ => break, diff --git a/polkadot/node/core/candidate-validation/src/tests.rs b/polkadot/node/core/candidate-validation/src/tests.rs index af530a20c4e0..61de8c90e908 100644 --- a/polkadot/node/core/candidate-validation/src/tests.rs +++ b/polkadot/node/core/candidate-validation/src/tests.rs @@ -695,11 +695,13 @@ fn candidate_validation_retry_panic_errors() { let v = executor::block_on(validate_candidate_exhaustive( MockValidateCandidateBackend::with_hardcoded_result_list(vec![ - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic("foo".into()))), - // Throw an AWD error, we should still retry again. - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousWorkerDeath)), + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("foo".into()))), + // Throw an AJD error, we should still retry again. + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbiguousJobDeath( + "baz".into(), + ))), // Throw another panic error. 
- Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Panic("bar".into()))), + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::JobError("bar".into()))), ]), validation_data, validation_code, diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 7dc8d307026e..e3fda06963e3 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -12,6 +12,7 @@ cpu-time = "1.0.0" futures = "0.3.21" gum = { package = "tracing-gum", path = "../../../gum" } libc = "0.2.139" +thiserror = "1.0.31" parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } @@ -30,7 +31,6 @@ sp-tracing = { path = "../../../../../substrate/primitives/tracing" } [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.3.0" seccompiler = "0.4.0" -thiserror = "1.0.31" [dev-dependencies] assert_matches = "1.4.0" diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 24376c3a0cbf..53565b92eb37 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -144,18 +144,10 @@ pub enum InternalValidationError { // conversion to `Option`. path: Option, }, - /// An error occurred in the CPU time monitor thread. Should be totally unrelated to - /// validation. - CpuTimeMonitorThread(String), - /// Could not spawn the execution job thread. - CouldNotSpawnJobThread(String), /// Some error occurred when interfacing with the kernel. Kernel(String), /// Some non-deterministic preparation error occurred. - /// - /// It is OK if attackers can trigger such errors. We assume that such attacks are filtered - /// during pre-checking. 
NonDeterministicPrepareError(PrepareError), } @@ -177,10 +169,6 @@ impl fmt::Display for InternalValidationError { "validation: host could not clear the worker cache ({:?}) after a job: {}", path, err ), - CpuTimeMonitorThread(err) => - write!(f, "validation: an error occurred in the CPU time monitor thread: {}", err), - CouldNotSpawnJobThread(err) => - write!(f, "validation: could not spawn execution job thread: {}", err), Kernel(err) => write!(f, "validation: error interfacing with the kernel: {}", err), NonDeterministicPrepareError(err) => write!(f, "validation: prepare: {}", err), } diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index 9f3799bc842e..bb1eccfad2f2 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -28,9 +28,9 @@ pub struct Handshake { pub executor_params: ExecutorParams, } -/// The response from an execution job on the worker. +/// The response from the execution worker. #[derive(Debug, Encode, Decode)] -pub enum Response { +pub enum WorkerResponse { /// The job completed successfully. Ok { /// The result of parachain validation. @@ -41,19 +41,38 @@ pub enum Response { /// The candidate is invalid. InvalidCandidate(String), /// The job timed out. - TimedOut, - /// An unexpected panic has occurred in the execution worker. - Panic(String), + JobTimedOut, /// The job process has died. We must kill the worker just in case. /// - /// We cannot treat this as an internal error because malicious code may have caused this. + /// We cannot treat this as an internal error because malicious code may have killed the job. + /// We still retry it, because in the non-malicious case it is likely spurious. JobDied(String), + /// An unexpected error occurred in the job process, e.g. failing to spawn a thread, panic, + /// etc. + /// + /// Because malicious code can cause a job error, we must not treat it as an internal error. 
We + /// still retry it, because in the non-malicious case it is likely spurious. + JobError(String), /// Some internal error occurred. InternalError(InternalValidationError), } -impl Response { +/// The result of a job on the execution worker. +pub type JobResult = Result; + +/// The successful response from a job on the execution worker. +#[derive(Debug, Encode, Decode)] +pub enum JobResponse { + Ok { + /// The result of parachain validation. + result_descriptor: ValidationResult, + }, + /// The candidate is invalid. + InvalidCandidate(String), +} + +impl JobResponse { /// Creates an invalid response from a context `ctx` and a message `msg` (which can be empty). pub fn format_invalid(ctx: &'static str, msg: &str) -> Self { if msg.is_empty() { @@ -63,3 +82,16 @@ impl Response { } } } + +/// An unexpected error occurred in the execution job. +#[derive(thiserror::Error, Debug, Encode, Decode)] +pub enum JobError { + #[error("The job timed out")] + TimedOut, + #[error("An unexpected panic has occurred in the execution job: {0}")] + Panic(String), + #[error("Could not spawn the requested thread: {0}")] + CouldNotSpawnThread(String), + #[error("An error occurred in the CPU time monitor thread: {0}")] + CpuTimeMonitorThread(String), +} diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 6bfbfb73c71c..b9c66e860ab8 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -37,7 +37,7 @@ use os_pipe::PipeWriter; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, - execute::{Handshake, Response}, + execute::{Handshake, JobError, JobResponse, JobResult, WorkerResponse}, framed_recv_blocking, framed_send_blocking, worker::{ cpu_time_monitor_loop, stringify_panic_payload, @@ -115,7 +115,7 @@ fn recv_request(stream: &mut UnixStream) -> io::Result<(Vec, Duration)> { Ok((params, 
execution_timeout)) } -fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> { +fn send_response(stream: &mut UnixStream, response: WorkerResponse) -> io::Result<()> { framed_send_blocking(stream, &response.encode()) } @@ -167,7 +167,7 @@ pub fn worker_entrypoint( let compiled_artifact_blob = match std::fs::read(&artifact_path) { Ok(bytes) => bytes, Err(err) => { - let response = Response::InternalError( + let response = WorkerResponse::InternalError( InternalValidationError::CouldNotOpenFile(err.to_string()), ); send_response(&mut stream, response)?; @@ -235,25 +235,27 @@ fn validate_using_artifact( compiled_artifact_blob: &[u8], executor_params: &ExecutorParams, params: &[u8], -) -> Response { +) -> JobResponse { let descriptor_bytes = match unsafe { // SAFETY: this should be safe since the compiled artifact passed here comes from the // file created by the prepare workers. These files are obtained by calling // [`executor_intf::prepare`]. execute_artifact(compiled_artifact_blob, executor_params, params) } { - Err(err) => return Response::format_invalid("execute", &err), + Err(err) => return JobResponse::format_invalid("execute", &err), Ok(d) => d, }; let result_descriptor = match ValidationResult::decode(&mut &descriptor_bytes[..]) { Err(err) => - return Response::format_invalid("validation result decoding failed", &err.to_string()), + return JobResponse::format_invalid( + "validation result decoding failed", + &err.to_string(), + ), Ok(r) => r, }; - // duration is set to 0 here because the process duration is calculated on the parent process - Response::Ok { result_descriptor, duration: Duration::ZERO } + JobResponse::Ok { result_descriptor } } /// This is used to handle child process during pvf execute worker. @@ -273,7 +275,7 @@ fn validate_using_artifact( /// /// # Returns /// -/// - pipe back `Response` to the parent process. +/// - pipe back `JobResponse` to the parent process. 
fn handle_child_process( pipe_write: os_pipe::PipeWriter, compiled_artifact_blob: Vec, @@ -299,10 +301,10 @@ fn handle_child_process( Arc::clone(&condvar), WaitOutcome::TimedOut, ) - .unwrap_or_else(|err| send_child_response(&pipe_write, Response::Panic(err.to_string()))); + .unwrap_or_else(|err| { + send_child_response(&pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string()))) + }); - // TODO: We may not need this since there's only one thread here now. We do still need to - // control the stack size (see EXECUTE_THREAD_STACK_SIZE). Look into simplifying. let executor_params_2 = executor_params.clone(); let execute_thread = thread::spawn_worker_thread_with_stack_size( "execute thread", @@ -312,12 +314,7 @@ fn handle_child_process( EXECUTE_THREAD_STACK_SIZE, ) .unwrap_or_else(|err| { - send_child_response( - &pipe_write, - Response::InternalError(InternalValidationError::CouldNotSpawnJobThread( - err.to_string(), - )), - ) + send_child_response(&pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string()))) }); let outcome = thread::wait_for_threads(condvar); @@ -325,20 +322,16 @@ fn handle_child_process( let response = match outcome { WaitOutcome::Finished => { let _ = cpu_time_monitor_tx.send(()); - execute_thread - .join() - .unwrap_or_else(|e| Response::Panic(stringify_panic_payload(e))) + execute_thread.join().map_err(|e| JobError::Panic(stringify_panic_payload(e))) }, // If the CPU thread is not selected, we signal it to end, the join handle is // dropped and the thread will finish in the background. 
WaitOutcome::TimedOut => match cpu_time_monitor_thread.join() { - Ok(Some(_cpu_time_elapsed)) => Response::TimedOut, - Ok(None) => Response::InternalError(InternalValidationError::CpuTimeMonitorThread( + Ok(Some(_cpu_time_elapsed)) => Err(JobError::TimedOut), + Ok(None) => Err(JobError::CpuTimeMonitorThread( "error communicating over finished channel".into(), )), - Err(e) => Response::InternalError(InternalValidationError::CpuTimeMonitorThread( - stringify_panic_payload(e), - )), + Err(e) => Err(JobError::CpuTimeMonitorThread(stringify_panic_payload(e))), }, WaitOutcome::Pending => unreachable!("we run wait_while until the outcome is no longer pending; qed"), @@ -368,7 +361,7 @@ fn handle_parent_process( worker_pid: u32, usage_before: Usage, timeout: Duration, -) -> io::Result { +) -> io::Result { // Read from the child. let mut received_data = Vec::new(); pipe_read.read_to_end(&mut received_data)?; @@ -386,40 +379,40 @@ fn handle_parent_process( // it is necessary to subtract the usage before the current child process to isolate its cpu // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); + if cpu_tv >= timeout { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "execute job took {}ms cpu time, exceeded execute timeout {}ms", + cpu_tv.as_millis(), + timeout.as_millis(), + ); + return Ok(WorkerResponse::JobTimedOut) + } match status { Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { - match Response::decode(&mut received_data.as_slice()) { - Ok(Response::Ok { result_descriptor, duration: _ }) => { - if cpu_tv.as_secs() >= timeout.as_secs() { - // Log if we exceed the timeout and the other thread hasn't - // finished. 
- gum::warn!( - target: LOG_TARGET, - %worker_pid, - "execute job took {}ms cpu time, exceeded execute timeout {}ms", - cpu_tv.as_millis(), - timeout.as_millis(), - ); - return Ok(Response::TimedOut) - } - Ok(Response::Ok { result_descriptor, duration: cpu_tv }) - }, - Ok(Response::TimedOut) => { - // Log if we exceed the timeout and the other thread hasn't - // finished. + match JobResult::decode(&mut received_data.as_slice()) { + Ok(Ok(JobResponse::Ok { result_descriptor })) => + Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv }), + Ok(Ok(JobResponse::InvalidCandidate(err))) => + Ok(WorkerResponse::InvalidCandidate(err)), + Ok(Err(job_error)) => { gum::warn!( target: LOG_TARGET, %worker_pid, - "execute job took {}ms cpu time, exceeded execute timeout {}ms", - cpu_tv.as_millis(), - timeout.as_millis(), + "execute job error: {}", + job_error, ); - return Ok(Response::TimedOut) + if matches!(job_error, JobError::TimedOut) { + Ok(WorkerResponse::JobTimedOut) + } else { + Ok(WorkerResponse::JobError(job_error.to_string())) + } }, - Ok(response) => Ok(response), - // There is either a bug or the job was hijacked. Should retry at any rate. - Err(err) => return Err(io::Error::new(io::ErrorKind::Other, err.to_string())), + // Could not decode job response. There is either a bug or the job was hijacked. + // Should retry at any rate. + Err(err) => Err(io::Error::new(io::ErrorKind::Other, err.to_string())), } }, // The job was killed by the given signal. @@ -427,12 +420,12 @@ fn handle_parent_process( // The job gets SIGSYS on seccomp violations, but this signal may have been sent for some // other reason, so we still need to check for seccomp violations elsewhere. 
Ok(WaitStatus::Signaled(_pid, signal, _core_dump)) => - Ok(Response::JobDied(format!("received signal: {signal:?}"))), + Ok(WorkerResponse::JobDied(format!("received signal: {signal:?}"))), Err(errno) => Ok(internal_error_from_errno("waitpid", errno)), // It is within an attacker's power to send an unexpected exit status. So we cannot treat // this as an internal error (which would make us abstain), but must vote against. - Ok(unexpected_wait_status) => Ok(Response::JobDied(format!( + Ok(unexpected_wait_status) => Ok(WorkerResponse::JobDied(format!( "unexpected status from wait: {unexpected_wait_status:?}" ))), } @@ -462,17 +455,17 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { /// /// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. /// -/// - `response`: Child process response -fn send_child_response(mut pipe_write: &PipeWriter, response: Response) -> ! { +/// - `result`: Child process response, or error. +fn send_child_response(mut pipe_write: &PipeWriter, result: JobResult) -> ! 
{ pipe_write - .write_all(response.encode().as_slice()) + .write_all(result.encode().as_slice()) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); process::exit(libc::EXIT_SUCCESS) } -fn internal_error_from_errno(context: &'static str, errno: Errno) -> Response { - Response::InternalError(InternalValidationError::Kernel(format!( +fn internal_error_from_errno(context: &'static str, errno: Errno) -> WorkerResponse { + WorkerResponse::InternalError(InternalValidationError::Kernel(format!( "{}: {}: {}", context, errno, diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index e260c8c4cd0e..5acdc94a1e68 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -457,14 +457,10 @@ fn handle_child_process( // If the CPU thread is not selected, we signal it to end, the join handle is // dropped and the thread will finish in the background. - WaitOutcome::TimedOut => { - match cpu_time_monitor_thread.join() { - Ok(Some(_cpu_time_elapsed)) => Err(PrepareError::TimedOut), - Ok(None) => - Err(PrepareError::IoErr("error communicating over closed channel".into())), - // Errors in this thread are independent of the PVF. 
- Err(err) => Err(PrepareError::IoErr(stringify_panic_payload(err))), - } + WaitOutcome::TimedOut => match cpu_time_monitor_thread.join() { + Ok(Some(_cpu_time_elapsed)) => Err(PrepareError::TimedOut), + Ok(None) => Err(PrepareError::IoErr("error communicating over closed channel".into())), + Err(err) => Err(PrepareError::IoErr(stringify_panic_payload(err))), }, WaitOutcome::Pending => unreachable!("we run wait_while until the outcome is no longer pending; qed"), diff --git a/polkadot/node/core/pvf/src/error.rs b/polkadot/node/core/pvf/src/error.rs index 12ed22d015ee..7fdb8c56ec92 100644 --- a/polkadot/node/core/pvf/src/error.rs +++ b/polkadot/node/core/pvf/src/error.rs @@ -33,22 +33,17 @@ pub enum ValidationError { pub enum InvalidCandidate { /// PVF preparation ended up with a deterministic error. PrepareError(String), - /// The failure is reported by the execution worker. The string contains the error message. - WorkerReportedError(String), + /// The candidate is reported to be invalid by the execution worker. The string contains the + /// error message. + WorkerReportedInvalid(String), /// The worker process (not the job) has died during validation of a candidate. /// - /// It's unlikely that this is caused by malicious code since workers now spawn separate job + /// It's unlikely that this is caused by malicious code since workers spawn separate job /// processes, and those job processes are sandboxed. But, it is possible. We retry in this /// case, and if the error persists, we assume it's caused by the candidate and vote against. AmbiguousWorkerDeath, /// PVF execution (compilation is not included) took more time than was allotted. HardTimeout, - /// A panic occurred and we can't be sure whether the candidate is really invalid or some - /// internal glitch occurred. Whenever we are unsure, we can never treat an error as internal - /// as we would abstain from voting. 
This is bad because if the issue was due to the candidate, - /// then all validators would abstain, stalling finality on the chain. So we will first retry - /// the candidate, and if the issue persists we are forced to vote invalid. - Panic(String), /// The job process (not the worker) has died for one of the following reasons: /// /// (a) A seccomp violation occurred, most likely due to an attempt by malicious code to @@ -62,6 +57,13 @@ pub enum InvalidCandidate { /// /// We cannot treat this as an internal error because malicious code may have caused this. AmbiguousJobDeath(String), + /// An unexpected error occurred in the job process and we can't be sure whether the candidate + /// is really invalid or some internal glitch occurred. Whenever we are unsure, we can never + /// treat an error as internal as we would abstain from voting. This is bad because if the + /// issue was due to the candidate, then all validators would abstain, stalling finality on the + /// chain. So we will first retry the candidate, and if the issue persists we are forced to + /// vote invalid. + JobError(String), } impl From for ValidationError { diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index 5c713f84d796..257377df3f48 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -342,26 +342,27 @@ fn handle_job_finish( }, Outcome::InvalidCandidate { err, idle_worker } => ( Some(idle_worker), - Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedError(err))), + Err(ValidationError::InvalidCandidate(InvalidCandidate::WorkerReportedInvalid(err))), None, ), Outcome::InternalError { err } => (None, Err(ValidationError::InternalError(err)), None), - // Treated as definitely-invalid, because if we timed out, there's no time left for a retry. + // Either the worker or the job timed out. Kill the worker in either case. 
Treated as + // definitely-invalid, because if we timed out, there's no time left for a retry. Outcome::HardTimeout => (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)), None), // "Maybe invalid" errors (will retry). - Outcome::IoErr => ( + Outcome::WorkerIntfErr => ( None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)), None, ), - Outcome::Panic { err } => - (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::Panic(err))), None), Outcome::JobDied { err } => ( None, Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))), None, ), + Outcome::JobError { err } => + (None, Err(ValidationError::InvalidCandidate(InvalidCandidate::JobError(err))), None), }; queue.metrics.execute_finished(); diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 666a4cf38c4d..bf44ba017250 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -30,7 +30,7 @@ use futures_timer::Delay; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, - execute::{Handshake, Response}, + execute::{Handshake, WorkerResponse}, worker_dir, SecurityStatus, }; use polkadot_parachain_primitives::primitives::ValidationResult; @@ -92,13 +92,15 @@ pub enum Outcome { HardTimeout, /// An I/O error happened during communication with the worker. This may mean that the worker /// process already died. The token is not returned in any case. - IoErr, - /// An unexpected panic has occurred in the execution worker. - Panic { err: String }, + WorkerIntfErr, /// The job process has died. We must kill the worker just in case. /// /// We cannot treat this as an internal error because malicious code may have caused this. JobDied { err: String }, + /// An unexpected error occurred in the job process. 
+ /// + /// Because malicious code can cause a job error, we must not treat it as an internal error. + JobError { err: String }, /// An internal error happened during the validation. Such an error is most likely related to /// some transient glitch. @@ -142,7 +144,7 @@ pub async fn start_work( ?error, "failed to send an execute request", ); - return Outcome::IoErr + return Outcome::WorkerIntfErr } // We use a generous timeout here. This is in addition to the one in the child process, in @@ -178,7 +180,7 @@ pub async fn start_work( ); } - return Outcome::IoErr + return Outcome::WorkerIntfErr }, Ok(response) => { // Check if any syscall violations occurred during the job. For now this is @@ -194,7 +196,7 @@ pub async fn start_work( ); } - if let Response::Ok{duration, ..} = response { + if let WorkerResponse::Ok{duration, ..} = response { if duration > execution_timeout { // The job didn't complete within the timeout. gum::warn!( @@ -206,7 +208,7 @@ pub async fn start_work( ); // Return a timeout error. 
- return Outcome::HardTimeout; + return Outcome::HardTimeout } } @@ -221,25 +223,25 @@ pub async fn start_work( validation_code_hash = ?artifact.id.code_hash, "execution worker exceeded lenient timeout for execution, child worker likely stalled", ); - Response::TimedOut + WorkerResponse::JobTimedOut }, }; match response { - Response::Ok { result_descriptor, duration } => Outcome::Ok { + WorkerResponse::Ok { result_descriptor, duration } => Outcome::Ok { result_descriptor, duration, idle_worker: IdleWorker { stream, pid, worker_dir }, }, - Response::InvalidCandidate(err) => Outcome::InvalidCandidate { + WorkerResponse::InvalidCandidate(err) => Outcome::InvalidCandidate { err, idle_worker: IdleWorker { stream, pid, worker_dir }, }, - Response::TimedOut => Outcome::HardTimeout, - Response::Panic(err) => Outcome::Panic { err }, - Response::JobDied(err) => Outcome::JobDied { err }, + WorkerResponse::JobTimedOut => Outcome::HardTimeout, + WorkerResponse::JobDied(err) => Outcome::JobDied { err }, + WorkerResponse::JobError(err) => Outcome::JobError { err }, - Response::InternalError(err) => Outcome::InternalError { err }, + WorkerResponse::InternalError(err) => Outcome::InternalError { err }, } }) .await @@ -313,9 +315,9 @@ async fn send_request( framed_send(stream, &execution_timeout.encode()).await } -async fn recv_response(stream: &mut UnixStream) -> io::Result { +async fn recv_response(stream: &mut UnixStream) -> io::Result { let response_bytes = framed_recv(stream).await?; - Response::decode(&mut response_bytes.as_slice()).map_err(|e| { + WorkerResponse::decode(&mut response_bytes.as_slice()).map_err(|e| { io::Error::new( io::ErrorKind::Other, format!("execute pvf recv_response: decode error: {:?}", e), From af96dfa2d8ead31e197dee5bc6b190f79f266d8f Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Mon, 6 Nov 2023 20:43:41 +0100 Subject: [PATCH 36/47] Some fixes --- .../node/core/pvf/execute-worker/src/lib.rs | 8 ++++ .../node/core/pvf/prepare-worker/src/lib.rs | 45 +++++++++---------- polkadot/node/core/pvf/tests/it/main.rs | 12 ++--- 3 files changed, 36 insertions(+), 29 deletions(-) diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index b9c66e860ab8..1b084c99f87b 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -195,6 +195,8 @@ pub fn worker_entrypoint( // outside world. The only IPC it should be able to do is sending its // response over the pipe. drop(stream); + // Drop the read end so we don't have too many FDs open. + drop(pipe_reader); handle_child_process( pipe_writer, @@ -367,6 +369,12 @@ fn handle_parent_process( pipe_read.read_to_end(&mut received_data)?; let status = nix::sys::wait::waitpid(child, None); + gum::trace!( + target: LOG_TARGET, + %worker_pid, + "execute worker received wait status from job: {:?}", + status, + ); let usage_after = match nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) { Ok(usage) => usage, diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 5acdc94a1e68..6b0091f5d9e5 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -232,6 +232,8 @@ pub fn worker_entrypoint( // outside world. The only IPC it should be able to do is sending its // response over the pipe. drop(stream); + // Drop the read end so we don't have too many FDs open. 
+ drop(pipe_reader); handle_child_process( pvf, @@ -331,6 +333,8 @@ fn handle_child_process( gum::debug!( target: LOG_TARGET, %worker_job_pid, + ?prepare_job_kind, + ?preparation_timeout, "worker job: preparing artifact", ); @@ -508,6 +512,13 @@ fn handle_parent_process( .map_err(|err| PrepareError::IoErr(err.to_string()))?; let status = nix::sys::wait::waitpid(child, None); + gum::trace!( + target: LOG_TARGET, + %worker_pid, + "prepare worker received wait status from job: {:?}", + status, + ); + let usage_after = nix::sys::resource::getrusage(UsageWho::RUSAGE_CHILDREN) .map_err(|errno| error_from_errno("getrusage after", errno))?; @@ -517,38 +528,26 @@ fn handle_parent_process( // it is necessary to subtract the usage before the current child process to isolate its cpu // time let cpu_tv = get_total_cpu_usage(usage_after) - get_total_cpu_usage(usage_before); + if cpu_tv >= timeout { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "prepare job took {}ms cpu time, exceeded prepare timeout {}ms", + cpu_tv.as_millis(), + timeout.as_millis(), + ); + return Err(PrepareError::TimedOut) + } - return match status { + match status { Ok(WaitStatus::Exited(_pid, libc::EXIT_SUCCESS)) => { let result: Result = Result::decode(&mut received_data.as_slice()) // There is either a bug or the job was hijacked. .map_err(|err| PrepareError::IoErr(err.to_string()))?; match result { - Err(PrepareError::TimedOut) => { - // Log if we exceed the timeout and the other thread hasn't - // finished. 
- gum::warn!( - target: LOG_TARGET, - %worker_pid, - "prepare job took {}ms cpu time, exceeded prepare timeout {}ms", - cpu_tv.as_millis(), - timeout.as_millis(), - ); - Err(PrepareError::TimedOut) - }, Err(err) => Err(err), Ok(response) => { - if cpu_tv >= timeout { - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "prepare job took {}ms cpu time, exceeded prepare timeout {}ms", - cpu_tv.as_millis(), - timeout.as_millis(), - ); - return Err(PrepareError::TimedOut); - } // Write the serialized artifact into a temp file. // // PVF host only keeps artifacts statuses in its memory, diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 58b47dffe5a9..54a686a3e798 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -34,8 +34,8 @@ use tokio::sync::Mutex; mod adder; mod worker_common; -const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(3); -const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(3); +const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(6); +const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(6); struct TestHost { cache_dir: tempfile::TempDir, @@ -282,9 +282,9 @@ rusty_fork_test! { let (result, _) = futures::join!( // Choose a job that would normally take the entire timeout. host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), - // Run a future that kills the job in the middle of the timeout. + // Run a future that kills the job while it's running. async { - tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await; + tokio::time::sleep(Duration::from_secs(1)).await; kill_by_sid_and_name(sid, PREPARE_PROCESS_NAME, false); } ); @@ -326,9 +326,9 @@ rusty_fork_test! { }, Default::default(), ), - // Run a future that kills the job in the middle of the timeout. + // Run a future that kills the job while it's running. 
async { - tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await; + tokio::time::sleep(Duration::from_secs(1)).await; kill_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, false); } ); From 67736c44af5b82f02587943ab3a680a5a4648058 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Mon, 6 Nov 2023 21:26:15 +0100 Subject: [PATCH 37/47] Make sure `pre_encoded_payloads` tests the correct type --- polkadot/node/core/pvf/common/src/error.rs | 11 ----------- .../node/core/pvf/prepare-worker/src/lib.rs | 18 ++++++++++++++++-- polkadot/node/core/pvf/tests/it/main.rs | 2 +- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 53565b92eb37..384a0781d6dc 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -76,9 +76,6 @@ pub enum PrepareError { Kernel(String), } -/// Pre-encoded length-prefixed `PrepareResult::Err(PrepareError::OutOfMemory)` -pub const OOM_PAYLOAD: &[u8] = b"\x02\x00\x00\x00\x00\x00\x00\x00\x01\x08"; - impl PrepareError { /// Returns whether this is a deterministic error, i.e. one that should trigger reliably. Those /// errors depend on the PVF itself and the sc-executor/wasmtime logic. 
@@ -174,11 +171,3 @@ impl fmt::Display for InternalValidationError { } } } - -#[test] -fn pre_encoded_payloads() { - let oom_enc = PrepareResult::Err(PrepareError::OutOfMemory).encode(); - let mut oom_payload = oom_enc.len().to_le_bytes().to_vec(); - oom_payload.extend(oom_enc); - assert_eq!(oom_payload, OOM_PAYLOAD); -} diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 6b0091f5d9e5..58fc292eb456 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -40,7 +40,7 @@ use nix::{ use os_pipe::{self, PipeWriter}; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ - error::{PrepareError, PrepareResult, OOM_PAYLOAD}, + error::{PrepareError, PrepareResult}, executor_intf::create_runtime_from_artifact_bytes, framed_recv_blocking, framed_send_blocking, prepare::{MemoryStats, PrepareJobKind, PrepareStats}, @@ -614,7 +614,7 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { /// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. /// /// - `response`: Child process response -fn send_child_response(mut pipe_write: &PipeWriter, response: Result) -> ! { +fn send_child_response(mut pipe_write: &PipeWriter, response: JobResponse) -> ! { pipe_write .write_all(response.encode().as_slice()) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); @@ -625,3 +625,17 @@ fn send_child_response(mut pipe_write: &PipeWriter, response: Result PrepareError { PrepareError::Kernel(format!("{}: {}: {}", context, errno, io::Error::last_os_error())) } + +type JobResponse = Result; + +/// Pre-encoded length-prefixed `Result::Err(PrepareError::OutOfMemory)` +const OOM_PAYLOAD: &[u8] = b"\x02\x00\x00\x00\x00\x00\x00\x00\x01\x08"; + +#[test] +fn pre_encoded_payloads() { + // NOTE: This must match the type of `response` in `send_child_response`. 
+ let oom_enc: JobResponse = Result::Err(PrepareError::OutOfMemory).encode(); + let mut oom_payload = oom_enc.len().to_le_bytes().to_vec(); + oom_payload.extend(oom_enc); + assert_eq!(oom_payload, OOM_PAYLOAD); +} diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 54a686a3e798..9c299147471a 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -126,7 +126,7 @@ impl TestHost { } #[tokio::test] -async fn terminates_on_timeout() { +async fn execute_job_terminates_on_timeout() { let host = TestHost::new().await; let start = std::time::Instant::now(); From e25344c31e535052b25b5ccd422b232633f690a5 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 7 Nov 2023 08:28:16 +0100 Subject: [PATCH 38/47] Some fixes --- polkadot/node/core/pvf/execute-worker/src/lib.rs | 12 ++++++++---- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 13 +++++++++---- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 1b084c99f87b..bdea4f050e46 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -463,13 +463,17 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { /// /// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. /// -/// - `result`: Child process response, or error. -fn send_child_response(mut pipe_write: &PipeWriter, result: JobResult) -> ! { +/// - `response`: Child process response, or error. +fn send_child_response(mut pipe_write: &PipeWriter, response: JobResult) -> ! 
{ pipe_write - .write_all(result.encode().as_slice()) + .write_all(response.encode().as_slice()) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - process::exit(libc::EXIT_SUCCESS) + if response.is_ok() { + process::exit(libc::EXIT_SUCCESS) + } else { + process::exit(libc::EXIT_FAILURE) + } } fn internal_error_from_errno(context: &'static str, errno: Errno) -> WorkerResponse { diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 58fc292eb456..32fc20255ae2 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -619,7 +619,11 @@ fn send_child_response(mut pipe_write: &PipeWriter, response: JobResponse) -> ! .write_all(response.encode().as_slice()) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); - process::exit(libc::EXIT_SUCCESS) + if response.is_ok() { + process::exit(libc::EXIT_SUCCESS) + } else { + process::exit(libc::EXIT_FAILURE) + } } fn error_from_errno(context: &'static str, errno: Errno) -> PrepareError { @@ -634,8 +638,9 @@ const OOM_PAYLOAD: &[u8] = b"\x02\x00\x00\x00\x00\x00\x00\x00\x01\x08"; #[test] fn pre_encoded_payloads() { // NOTE: This must match the type of `response` in `send_child_response`. - let oom_enc: JobResponse = Result::Err(PrepareError::OutOfMemory).encode(); - let mut oom_payload = oom_enc.len().to_le_bytes().to_vec(); - oom_payload.extend(oom_enc); + let unencoded: JobResponse = Result::Err(PrepareError::OutOfMemory); + let oom_encoded = unencoded.encode(); + let mut oom_payload = oom_encoded.len().to_le_bytes().to_vec(); + oom_payload.extend(oom_encoded); assert_eq!(oom_payload, OOM_PAYLOAD); } From 203ce2572807ac6aa2ae6cf034834e44cbb53660 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 7 Nov 2023 10:45:57 +0100 Subject: [PATCH 39/47] Fix `prechecking_out_of_memory` test There were several simultaneous issues: - We were using the wrong exit calls in the allocation failure handler. 
This is some evil stuff that I don't understand. - We were ignoring responses from the child when the exit status was 1 - We were sending a response encoded with a length prefix, but not decoding with `framed_recv` --- .../node/core/pvf/execute-worker/src/lib.rs | 80 ++++++++++--------- .../node/core/pvf/prepare-worker/src/lib.rs | 63 +++++++++------ 2 files changed, 80 insertions(+), 63 deletions(-) diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index bdea4f050e46..2b7412efb670 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -33,7 +33,7 @@ use nix::{ }, unistd::{ForkResult, Pid}, }; -use os_pipe::PipeWriter; +use os_pipe::{self, PipeReader, PipeWriter}; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::InternalValidationError, @@ -265,7 +265,7 @@ fn validate_using_artifact( /// /// # Arguments /// -/// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. +/// - `pipe_write`: A `PipeWriter` structure, the writing end of a pipe. /// /// - `compiled_artifact_blob`: The artifact bytes from compiled by the prepare worker`. /// @@ -279,7 +279,7 @@ fn validate_using_artifact( /// /// - pipe back `JobResponse` to the parent process. 
fn handle_child_process( - pipe_write: os_pipe::PipeWriter, + mut pipe_write: PipeWriter, compiled_artifact_blob: Vec, executor_params: ExecutorParams, params: Vec, @@ -304,7 +304,7 @@ fn handle_child_process( WaitOutcome::TimedOut, ) .unwrap_or_else(|err| { - send_child_response(&pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string()))) + send_child_response(&mut pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string()))) }); let executor_params_2 = executor_params.clone(); @@ -316,7 +316,7 @@ fn handle_child_process( EXECUTE_THREAD_STACK_SIZE, ) .unwrap_or_else(|err| { - send_child_response(&pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string()))) + send_child_response(&mut pipe_write, Err(JobError::CouldNotSpawnThread(err.to_string()))) }); let outcome = thread::wait_for_threads(condvar); @@ -339,7 +339,7 @@ fn handle_child_process( unreachable!("we run wait_while until the outcome is no longer pending; qed"), }; - send_child_response(&pipe_write, response); + send_child_response(&mut pipe_write, response); } /// Waits for child process to finish and handle child response from pipe. @@ -358,15 +358,17 @@ fn handle_child_process( /// /// - The response, either `Ok` or some error state. fn handle_parent_process( - mut pipe_read: os_pipe::PipeReader, + mut pipe_read: PipeReader, child: Pid, worker_pid: u32, usage_before: Usage, timeout: Duration, ) -> io::Result { // Read from the child. - let mut received_data = Vec::new(); - pipe_read.read_to_end(&mut received_data)?; + let result = recv_child_response(&mut pipe_read) + // Could not decode job response. There is either a bug or the job was hijacked. + // Should retry at any rate. + .map_err(|err| io::Error::new(io::ErrorKind::Other, err.to_string()))?; let status = nix::sys::wait::waitpid(child, None); gum::trace!( @@ -382,7 +384,7 @@ fn handle_parent_process( }; // Using `getrusage` is needed to check whether child has timedout since we cannot rely on - // child. to report its own time. 
+ // child to report its own time. // As `getrusage` returns resource usage from all terminated child processes, // it is necessary to subtract the usage before the current child process to isolate its cpu // time @@ -399,29 +401,23 @@ fn handle_parent_process( } match status { - Ok(WaitStatus::Exited(_, libc::EXIT_SUCCESS)) => { - match JobResult::decode(&mut received_data.as_slice()) { - Ok(Ok(JobResponse::Ok { result_descriptor })) => - Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv }), - Ok(Ok(JobResponse::InvalidCandidate(err))) => - Ok(WorkerResponse::InvalidCandidate(err)), - Ok(Err(job_error)) => { - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "execute job error: {}", - job_error, - ); - if matches!(job_error, JobError::TimedOut) { - Ok(WorkerResponse::JobTimedOut) - } else { - Ok(WorkerResponse::JobError(job_error.to_string())) - } - }, - // Could not decode job response. There is either a bug or the job was hijacked. - // Should retry at any rate. - Err(err) => Err(io::Error::new(io::ErrorKind::Other, err.to_string())), - } + Ok(WaitStatus::Exited(_, _exit_status)) => match result { + Ok(JobResponse::Ok { result_descriptor }) => + Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv }), + Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)), + Err(job_error) => { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "execute job error: {}", + job_error, + ); + if matches!(job_error, JobError::TimedOut) { + Ok(WorkerResponse::JobTimedOut) + } else { + Ok(WorkerResponse::JobError(job_error.to_string())) + } + }, }, // The job was killed by the given signal. // @@ -457,16 +453,26 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { return Duration::from_micros(micros) } +/// Get a job response. 
+fn recv_child_response(pipe_read: &mut PipeReader) -> io::Result { + let response_bytes = framed_recv_blocking(pipe_read)?; + JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("execute pvf recv_child_response: decode error: {:?}", e), + ) + }) +} + /// Write response to the pipe and exit process after. /// /// # Arguments /// -/// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. +/// - `pipe_write`: A `PipeWriter` structure, the writing end of a pipe. /// /// - `response`: Child process response, or error. -fn send_child_response(mut pipe_write: &PipeWriter, response: JobResult) -> ! { - pipe_write - .write_all(response.encode().as_slice()) +fn send_child_response(pipe_write: &mut PipeWriter, response: JobResult) -> ! { + framed_send_blocking(pipe_write, response.encode().as_slice()) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); if response.is_ok() { diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 32fc20255ae2..aec8a0d72d2c 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -37,7 +37,7 @@ use nix::{ }, unistd::{ForkResult, Pid}, }; -use os_pipe::{self, PipeWriter}; +use os_pipe::{self, PipeReader, PipeWriter}; use parity_scale_codec::{Decode, Encode}; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, @@ -93,6 +93,7 @@ impl AsRef<[u8]> for CompiledArtifact { } } +/// Get a worker request. fn recv_request(stream: &mut UnixStream) -> io::Result { let pvf = framed_recv_blocking(stream)?; let pvf = PvfPrepData::decode(&mut &pvf[..]).map_err(|e| { @@ -104,6 +105,7 @@ fn recv_request(stream: &mut UnixStream) -> io::Result { Ok(pvf) } +/// Send a worker response. 
fn send_response(stream: &mut UnixStream, result: PrepareResult) -> io::Result<()> { framed_send_blocking(stream, &result.encode()) } @@ -124,7 +126,11 @@ fn start_memory_tracking(fd: RawFd, limit: Option) { // Syscalls never allocate or deallocate, so this is safe. libc::syscall(libc::SYS_write, fd, OOM_PAYLOAD.as_ptr(), OOM_PAYLOAD.len()); libc::syscall(libc::SYS_close, fd); - libc::syscall(libc::SYS_exit, 1); + // Make sure we exit from all threads. Copied from libc. + libc::syscall(libc::SYS_exit_group, 1); + loop { + libc::syscall(libc::SYS_exit, 1); + } } #[cfg(not(target_os = "linux"))] { @@ -135,7 +141,7 @@ fn start_memory_tracking(fd: RawFd, limit: Option) { // code is only run by a validator, it's a lesser evil. libc::write(fd, OOM_PAYLOAD.as_ptr().cast(), OOM_PAYLOAD.len()); libc::close(fd); - std::process::exit(1); + libc::_exit(1); } })), ); @@ -309,7 +315,7 @@ struct Response { /// /// - `pvf`: `PvfPrepData` structure, containing data to prepare the artifact /// -/// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. +/// - `pipe_write`: A `PipeWriter` structure, the writing end of a pipe. /// /// - `preparation_timeout`: The timeout in `Duration`. /// @@ -324,7 +330,7 @@ struct Response { /// - If success, pipe back `Response`. 
fn handle_child_process( pvf: PvfPrepData, - pipe_write: os_pipe::PipeWriter, + mut pipe_write: PipeWriter, preparation_timeout: Duration, prepare_job_kind: PrepareJobKind, executor_params: Arc, @@ -373,7 +379,7 @@ fn handle_child_process( WaitOutcome::TimedOut, ) .unwrap_or_else(|err| { - send_child_response(&pipe_write, Err(PrepareError::Panic(err.to_string()))) + send_child_response(&mut pipe_write, Err(PrepareError::Panic(err.to_string()))) }); let prepare_thread = spawn_worker_thread( @@ -403,7 +409,7 @@ fn handle_child_process( WaitOutcome::Finished, ) .unwrap_or_else(|err| { - send_child_response(&pipe_write, Err(PrepareError::IoErr(err.to_string()))) + send_child_response(&mut pipe_write, Err(PrepareError::IoErr(err.to_string()))) }); let outcome = thread::wait_for_threads(condvar); @@ -425,7 +431,7 @@ fn handle_child_process( match prepare_thread.join().unwrap_or_else(|err| { send_child_response( - &pipe_write, + &mut pipe_write, Err(PrepareError::Panic(stringify_panic_payload(err))), ) }) { @@ -470,7 +476,7 @@ fn handle_child_process( unreachable!("we run wait_while until the outcome is no longer pending; qed"), }; - send_child_response(&pipe_write, result); + send_child_response(&mut pipe_write, result); } /// Waits for child process to finish and handle child response from pipe. @@ -498,7 +504,7 @@ fn handle_child_process( /// /// - If the child process timeout, it returns `PrepareError::TimedOut`. fn handle_parent_process( - mut pipe_read: os_pipe::PipeReader, + mut pipe_read: PipeReader, child: Pid, temp_artifact_dest: PathBuf, worker_pid: u32, @@ -506,10 +512,8 @@ fn handle_parent_process( timeout: Duration, ) -> Result { // Read from the child. 
- let mut received_data = Vec::new(); - pipe_read - .read_to_end(&mut received_data) - .map_err(|err| PrepareError::IoErr(err.to_string()))?; + let result = + recv_child_response(&mut pipe_read).map_err(|err| PrepareError::IoErr(err.to_string()))?; let status = nix::sys::wait::waitpid(child, None); gum::trace!( @@ -540,11 +544,7 @@ fn handle_parent_process( } match status { - Ok(WaitStatus::Exited(_pid, libc::EXIT_SUCCESS)) => { - let result: Result = - Result::decode(&mut received_data.as_slice()) - // There is either a bug or the job was hijacked. - .map_err(|err| PrepareError::IoErr(err.to_string()))?; + Ok(WaitStatus::Exited(_pid, _exit_status)) => { match result { Err(err) => Err(err), Ok(response) => { @@ -607,16 +607,26 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { return Duration::from_micros(micros) } -/// Write response to the pipe and exit process after. +/// Get a job response. +fn recv_child_response(pipe_read: &mut PipeReader) -> io::Result { + let response_bytes = framed_recv_blocking(pipe_read)?; + JobResponse::decode(&mut response_bytes.as_slice()).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("prepare pvf recv_child_response: decode error: {:?}", e), + ) + }) +} + +/// Write a job response to the pipe and exit process after. /// /// # Arguments /// -/// - `pipe_write`: A `os_pipe::PipeWriter` structure, the writing end of a pipe. +/// - `pipe_write`: A `PipeWriter` structure, the writing end of a pipe. /// /// - `response`: Child process response -fn send_child_response(mut pipe_write: &PipeWriter, response: JobResponse) -> ! { - pipe_write - .write_all(response.encode().as_slice()) +fn send_child_response(pipe_write: &mut PipeWriter, response: JobResponse) -> ! 
{ + framed_send_blocking(pipe_write, response.encode().as_slice()) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); if response.is_ok() { @@ -638,8 +648,9 @@ const OOM_PAYLOAD: &[u8] = b"\x02\x00\x00\x00\x00\x00\x00\x00\x01\x08"; #[test] fn pre_encoded_payloads() { // NOTE: This must match the type of `response` in `send_child_response`. - let unencoded: JobResponse = Result::Err(PrepareError::OutOfMemory); - let oom_encoded = unencoded.encode(); + let oom_unencoded: JobResponse = Result::Err(PrepareError::OutOfMemory); + let oom_encoded = oom_unencoded.encode(); + // The payload is prefixed with its length in `framed_send`. let mut oom_payload = oom_encoded.len().to_le_bytes().to_vec(); oom_payload.extend(oom_encoded); assert_eq!(oom_payload, OOM_PAYLOAD); From de7d19d411e663fff3ba1651e6e5b145b93d69c2 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 7 Nov 2023 12:18:46 +0100 Subject: [PATCH 40/47] Add tests, fix some more issues --- polkadot/node/core/pvf/common/src/error.rs | 8 +- .../node/core/pvf/execute-worker/src/lib.rs | 63 ++-- .../node/core/pvf/prepare-worker/src/lib.rs | 36 ++- polkadot/node/core/pvf/tests/it/main.rs | 213 ++----------- polkadot/node/core/pvf/tests/it/process.rs | 284 ++++++++++++++++++ 5 files changed, 375 insertions(+), 229 deletions(-) create mode 100644 polkadot/node/core/pvf/tests/it/process.rs diff --git a/polkadot/node/core/pvf/common/src/error.rs b/polkadot/node/core/pvf/common/src/error.rs index 384a0781d6dc..34475c481f73 100644 --- a/polkadot/node/core/pvf/common/src/error.rs +++ b/polkadot/node/core/pvf/common/src/error.rs @@ -35,9 +35,9 @@ pub enum PrepareError { /// Instantiation of the WASM module instance failed. #[codec(index = 2)] RuntimeConstruction(String), - /// An unexpected panic has occurred in the preparation job. + /// An unexpected error has occurred in the preparation job. #[codec(index = 3)] - Panic(String), + JobError(String), /// Failed to prepare the PVF due to the time limit. 
#[codec(index = 4)] TimedOut, @@ -86,7 +86,7 @@ impl PrepareError { pub fn is_deterministic(&self) -> bool { use PrepareError::*; match self { - Prevalidation(_) | Preparation(_) | Panic(_) | OutOfMemory => true, + Prevalidation(_) | Preparation(_) | JobError(_) | OutOfMemory => true, IoErr(_) | JobDied(_) | CreateTmpFile(_) | @@ -108,7 +108,7 @@ impl fmt::Display for PrepareError { Prevalidation(err) => write!(f, "prevalidation: {}", err), Preparation(err) => write!(f, "preparation: {}", err), RuntimeConstruction(err) => write!(f, "runtime construction: {}", err), - Panic(err) => write!(f, "panic: {}", err), + JobError(err) => write!(f, "panic: {}", err), TimedOut => write!(f, "prepare: timeout"), IoErr(err) => write!(f, "prepare: io error while receiving response: {}", err), JobDied(err) => write!(f, "prepare: prepare job died: {}", err), diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 2b7412efb670..2a16f9c1f707 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -48,7 +48,7 @@ use polkadot_node_core_pvf_common::{ use polkadot_parachain_primitives::primitives::ValidationResult; use polkadot_primitives::{executor_params::DEFAULT_NATIVE_STACK_MAX, ExecutorParams}; use std::{ - io::{self, Read, Write}, + io::{self, Read}, os::unix::net::UnixStream, path::PathBuf, process, @@ -364,8 +364,10 @@ fn handle_parent_process( usage_before: Usage, timeout: Duration, ) -> io::Result { - // Read from the child. - let result = recv_child_response(&mut pipe_read) + // Read from the child. Don't decode unless the process exited normally, which we check later. + let mut received_data = Vec::new(); + pipe_read + .read_to_end(&mut received_data) // Could not decode job response. There is either a bug or the job was hijacked. // Should retry at any rate. 
.map_err(|err| io::Error::new(io::ErrorKind::Other, err.to_string()))?; @@ -401,23 +403,40 @@ fn handle_parent_process( } match status { - Ok(WaitStatus::Exited(_, _exit_status)) => match result { - Ok(JobResponse::Ok { result_descriptor }) => - Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv }), - Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)), - Err(job_error) => { - gum::warn!( - target: LOG_TARGET, - %worker_pid, - "execute job error: {}", - job_error, - ); - if matches!(job_error, JobError::TimedOut) { - Ok(WorkerResponse::JobTimedOut) - } else { - Ok(WorkerResponse::JobError(job_error.to_string())) - } - }, + Ok(WaitStatus::Exited(_, exit_status)) => { + let mut reader = io::BufReader::new(received_data.as_slice()); + let result = match recv_child_response(&mut reader) { + Ok(result) => result, + Err(err) => return Ok(WorkerResponse::JobError(err.to_string())), + }; + + match result { + Ok(JobResponse::Ok { result_descriptor }) => { + // The exit status should have been zero if no error occurred. + if exit_status != 0 { + return Ok(WorkerResponse::JobError(format!( + "unexpected exit status: {}", + exit_status + ))) + } + + Ok(WorkerResponse::Ok { result_descriptor, duration: cpu_tv }) + }, + Ok(JobResponse::InvalidCandidate(err)) => Ok(WorkerResponse::InvalidCandidate(err)), + Err(job_error) => { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "execute job error: {}", + job_error, + ); + if matches!(job_error, JobError::TimedOut) { + Ok(WorkerResponse::JobTimedOut) + } else { + Ok(WorkerResponse::JobError(job_error.to_string())) + } + }, + } }, // The job was killed by the given signal. // @@ -454,8 +473,8 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { } /// Get a job response. 
-fn recv_child_response(pipe_read: &mut PipeReader) -> io::Result { - let response_bytes = framed_recv_blocking(pipe_read)?; +fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result { + let response_bytes = framed_recv_blocking(received_data)?; JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| { io::Error::new( io::ErrorKind::Other, diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index aec8a0d72d2c..d5cd0d76d463 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -55,7 +55,7 @@ use polkadot_node_core_pvf_common::{ use polkadot_primitives::ExecutorParams; use std::{ fs, - io::{self, Read, Write}, + io::{self, Read}, os::{ fd::{AsRawFd, RawFd}, unix::net::UnixStream, @@ -126,7 +126,7 @@ fn start_memory_tracking(fd: RawFd, limit: Option) { // Syscalls never allocate or deallocate, so this is safe. libc::syscall(libc::SYS_write, fd, OOM_PAYLOAD.as_ptr(), OOM_PAYLOAD.len()); libc::syscall(libc::SYS_close, fd); - // Make sure we exit from all threads. Copied from libc. + // Make sure we exit from all threads. Copied from glibc. libc::syscall(libc::SYS_exit_group, 1); loop { libc::syscall(libc::SYS_exit, 1); @@ -379,7 +379,7 @@ fn handle_child_process( WaitOutcome::TimedOut, ) .unwrap_or_else(|err| { - send_child_response(&mut pipe_write, Err(PrepareError::Panic(err.to_string()))) + send_child_response(&mut pipe_write, Err(PrepareError::IoErr(err.to_string()))) }); let prepare_thread = spawn_worker_thread( @@ -432,7 +432,7 @@ fn handle_child_process( match prepare_thread.join().unwrap_or_else(|err| { send_child_response( &mut pipe_write, - Err(PrepareError::Panic(stringify_panic_payload(err))), + Err(PrepareError::JobError(stringify_panic_payload(err))), ) }) { Err(err) => Err(err), @@ -511,9 +511,11 @@ fn handle_parent_process( usage_before: Usage, timeout: Duration, ) -> Result { - // Read from the child. 
- let result = - recv_child_response(&mut pipe_read).map_err(|err| PrepareError::IoErr(err.to_string()))?; + // Read from the child. Don't decode unless the process exited normally, which we check later. + let mut received_data = Vec::new(); + pipe_read + .read_to_end(&mut received_data) + .map_err(|err| PrepareError::IoErr(err.to_string()))?; let status = nix::sys::wait::waitpid(child, None); gum::trace!( @@ -527,7 +529,7 @@ fn handle_parent_process( .map_err(|errno| error_from_errno("getrusage after", errno))?; // Using `getrusage` is needed to check whether child has timedout since we cannot rely on - // child. to report its own time. + // child to report its own time. // As `getrusage` returns resource usage from all terminated child processes, // it is necessary to subtract the usage before the current child process to isolate its cpu // time @@ -544,10 +546,22 @@ fn handle_parent_process( } match status { - Ok(WaitStatus::Exited(_pid, _exit_status)) => { + Ok(WaitStatus::Exited(_pid, exit_status)) => { + let mut reader = io::BufReader::new(received_data.as_slice()); + let result = recv_child_response(&mut reader) + .map_err(|err| PrepareError::JobError(err.to_string()))?; + match result { Err(err) => Err(err), Ok(response) => { + // The exit status should have been zero if no error occurred. + if exit_status != 0 { + return Err(PrepareError::JobError(format!( + "unexpected exit status: {}", + exit_status + ))) + } + // Write the serialized artifact into a temp file. // // PVF host only keeps artifacts statuses in its memory, @@ -608,8 +622,8 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { } /// Get a job response. 
-fn recv_child_response(pipe_read: &mut PipeReader) -> io::Result { - let response_bytes = framed_recv_blocking(pipe_read)?; +fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result { + let response_bytes = framed_recv_blocking(received_data)?; JobResponse::decode(&mut response_bytes.as_slice()).map_err(|e| { io::Error::new( io::ErrorKind::Other, diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 9c299147471a..ac0e76934012 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -25,13 +25,13 @@ use polkadot_node_core_pvf::{ }; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::{ExecutorParam, ExecutorParams}; -#[cfg(target_os = "linux")] -use rusty_fork::rusty_fork_test; use std::time::Duration; use tokio::sync::Mutex; mod adder; +#[cfg(target_os = "linux")] +mod process; mod worker_common; const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(6); @@ -125,6 +125,25 @@ impl TestHost { } } +#[tokio::test] +async fn prepare_job_terminates_on_timeout() { + let host = TestHost::new().await; + + let start = std::time::Instant::now(); + let result = host + .precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()) + .await; + + match result { + Err(PrepareError::TimedOut) => {}, + r => panic!("{:?}", r), + } + + let duration = std::time::Instant::now().duration_since(start); + assert!(duration >= TEST_PREPARATION_TIMEOUT); + assert!(duration < TEST_PREPARATION_TIMEOUT * JOB_TIMEOUT_WALL_CLOCK_FACTOR); +} + #[tokio::test] async fn execute_job_terminates_on_timeout() { let host = TestHost::new().await; @@ -153,196 +172,6 @@ async fn execute_job_terminates_on_timeout() { assert!(duration < TEST_EXECUTION_TIMEOUT * JOB_TIMEOUT_WALL_CLOCK_FACTOR); } -#[cfg(target_os = "linux")] -const PREPARE_PROCESS_NAME: &'static str = "polkadot-prepare-worker"; -#[cfg(target_os = 
"linux")] -const EXECUTE_PROCESS_NAME: &'static str = "polkadot-execute-worker"; - -#[cfg(target_os = "linux")] -fn kill_by_sid_and_name(sid: i32, exe_name: &'static str, is_direct_child: bool) { - use procfs::process; - - let all_processes: Vec = process::all_processes() - .expect("Can't read /proc") - .filter_map(|p| match p { - Ok(p) => Some(p), // happy path - Err(e) => match e { - // process vanished during iteration, ignore it - procfs::ProcError::NotFound(_) => None, - x => { - panic!("some unknown error: {}", x); - }, - }, - }) - .collect(); - - let mut found = 0; - for process in all_processes { - let stat = process.stat().unwrap(); - - if stat.session != sid || !process.exe().unwrap().to_str().unwrap().contains(exe_name) { - continue - } - // The workers are direct children of the current process, the worker job processes are not - // (they are children of the workers). - let process_is_direct_child = stat.ppid as u32 == std::process::id(); - if is_direct_child != process_is_direct_child { - continue - } - - assert_eq!(unsafe { libc::kill(process.pid(), 9) }, 0); - found += 1; - } - assert_eq!(found, 1); -} - -// Run these tests in their own processes with rusty-fork. They work by each creating a new session, -// then killing the worker process that matches the session ID and expected worker name. -#[cfg(target_os = "linux")] -rusty_fork_test! { - // What happens when the prepare worker dies in the middle of a job? - #[test] - fn prepare_worker_killed_during_job() { - let rt = tokio::runtime::Runtime::new().unwrap(); - rt.block_on(async { - let host = TestHost::new().await; - - // Create a new session and get the session ID. - let sid = unsafe { libc::setsid() }; - assert!(sid > 0); - - let (result, _) = futures::join!( - // Choose a job that would normally take the entire timeout. - host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), - // Run a future that kills the job in the middle of the timeout. 
- async { - tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await; - kill_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true); - } - ); - - assert_matches!(result, Err(PrepareError::IoErr(_))); - }) - } - - // What happens when the execute worker dies in the middle of a job? - #[test] - fn execute_worker_killed_during_job() { - let rt = tokio::runtime::Runtime::new().unwrap(); - rt.block_on(async { - let host = TestHost::new().await; - - // Create a new session and get the session ID. - let sid = unsafe { libc::setsid() }; - assert!(sid > 0); - - // Prepare the artifact ahead of time. - let binary = halt::wasm_binary_unwrap(); - host.precheck_pvf(binary, Default::default()).await.unwrap(); - - let (result, _) = futures::join!( - // Choose an job that would normally take the entire timeout. - host.validate_candidate( - binary, - ValidationParams { - block_data: BlockData(Vec::new()), - parent_head: Default::default(), - relay_parent_number: 1, - relay_parent_storage_root: Default::default(), - }, - Default::default(), - ), - // Run a future that kills the job in the middle of the timeout. - async { - tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await; - kill_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true); - } - ); - - assert_matches!( - result, - Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)) - ); - }) - } - - // What happens when the forked prepare job dies in the middle of its job? - #[test] - fn forked_prepare_job_killed_during_job() { - polkadot_node_core_pvf_common::sp_tracing::try_init_simple(); - - let rt = tokio::runtime::Runtime::new().unwrap(); - rt.block_on(async { - let host = TestHost::new().await; - - // Create a new session and get the session ID. - let sid = unsafe { libc::setsid() }; - assert!(sid > 0); - - let (result, _) = futures::join!( - // Choose a job that would normally take the entire timeout. 
- host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), - // Run a future that kills the job while it's running. - async { - tokio::time::sleep(Duration::from_secs(1)).await; - kill_by_sid_and_name(sid, PREPARE_PROCESS_NAME, false); - } - ); - - // Note that we get a more specific error if the job died than if the whole worker died. - assert_matches!( - result, - Err(PrepareError::JobDied(err)) if err == "received signal: SIGKILL" - ); - }) - } - - // What happens when the forked execute job dies in the middle of its job? - #[test] - fn forked_execute_job_killed_during_job() { - polkadot_node_core_pvf_common::sp_tracing::try_init_simple(); - - let rt = tokio::runtime::Runtime::new().unwrap(); - rt.block_on(async { - let host = TestHost::new().await; - - // Create a new session and get the session ID. - let sid = unsafe { libc::setsid() }; - assert!(sid > 0); - - // Prepare the artifact ahead of time. - let binary = halt::wasm_binary_unwrap(); - host.precheck_pvf(binary, Default::default()).await.unwrap(); - - let (result, _) = futures::join!( - // Choose a job that would normally take the entire timeout. - host.validate_candidate( - binary, - ValidationParams { - block_data: BlockData(Vec::new()), - parent_head: Default::default(), - relay_parent_number: 1, - relay_parent_storage_root: Default::default(), - }, - Default::default(), - ), - // Run a future that kills the job while it's running. - async { - tokio::time::sleep(Duration::from_secs(1)).await; - kill_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, false); - } - ); - - // Note that we get a more specific error if the job died than if the whole worker died. 
- assert_matches!( - result, - Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))) - if err == "received signal: SIGKILL" - ); - }) - } -} - #[cfg(feature = "ci-only-tests")] #[tokio::test] async fn ensure_parallel_execution() { diff --git a/polkadot/node/core/pvf/tests/it/process.rs b/polkadot/node/core/pvf/tests/it/process.rs new file mode 100644 index 000000000000..9a9b8f547295 --- /dev/null +++ b/polkadot/node/core/pvf/tests/it/process.rs @@ -0,0 +1,284 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Test unexpected behaviors of the spawned processes. 
+ +use super::TestHost; +use assert_matches::assert_matches; +use polkadot_node_core_pvf::{InvalidCandidate, PrepareError, ValidationError}; +use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams}; +use rusty_fork::rusty_fork_test; +use std::time::Duration; + +const PREPARE_PROCESS_NAME: &'static str = "polkadot-prepare-worker"; +const EXECUTE_PROCESS_NAME: &'static str = "polkadot-execute-worker"; + +const SIGNAL_KILL: i32 = 9; +const SIGNAL_STOP: i32 = 19; + +fn send_signal_by_sid_and_name( + sid: i32, + exe_name: &'static str, + is_direct_child: bool, + signal: i32, +) { + use procfs::process; + + let all_processes: Vec = process::all_processes() + .expect("Can't read /proc") + .filter_map(|p| match p { + Ok(p) => Some(p), // happy path + Err(e) => match e { + // process vanished during iteration, ignore it + procfs::ProcError::NotFound(_) => None, + x => { + panic!("some unknown error: {}", x); + }, + }, + }) + .collect(); + + let mut found = 0; + for process in all_processes { + let stat = process.stat().unwrap(); + + if stat.session != sid || !process.exe().unwrap().to_str().unwrap().contains(exe_name) { + continue + } + // The workers are direct children of the current process, the worker job processes are not + // (they are children of the workers). + let process_is_direct_child = stat.ppid as u32 == std::process::id(); + if is_direct_child != process_is_direct_child { + continue + } + + assert_eq!(unsafe { libc::kill(process.pid(), signal) }, 0); + found += 1; + } + assert_eq!(found, 1); +} + +// Run these tests in their own processes with rusty-fork. They work by each creating a new session, +// then killing the worker process that matches the session ID and expected worker name. +rusty_fork_test! { + // What happens when the prepare worker (not the job) times out? 
+ #[test] + fn prepare_worker_timeout() { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), + // Send a stop signal to pause the worker. + async { + tokio::time::sleep(Duration::from_secs(1)).await; + send_signal_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true, SIGNAL_STOP); + } + ); + + assert_matches!(result, Err(PrepareError::TimedOut)); + }) + } + + // What happens when the execute worker (not the job) times out? + #[test] + fn execute_worker_timeout() { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + // Prepare the artifact ahead of time. + let binary = halt::wasm_binary_unwrap(); + host.precheck_pvf(binary, Default::default()).await.unwrap(); + + let (result, _) = futures::join!( + // Choose an job that would normally take the entire timeout. + host.validate_candidate( + binary, + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ), + // Send a stop signal to pause the worker. + async { + tokio::time::sleep(Duration::from_secs(1)).await; + send_signal_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true, SIGNAL_STOP); + } + ); + + assert_matches!( + result, + Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)) + ); + }) + } + + // What happens when the prepare worker dies in the middle of a job? 
+ #[test] + fn prepare_worker_killed_during_job() { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), + // Run a future that kills the job while it's running. + async { + tokio::time::sleep(Duration::from_secs(1)).await; + send_signal_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true, SIGNAL_KILL); + } + ); + + assert_matches!(result, Err(PrepareError::IoErr(_))); + }) + } + + // What happens when the execute worker dies in the middle of a job? + #[test] + fn execute_worker_killed_during_job() { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + // Prepare the artifact ahead of time. + let binary = halt::wasm_binary_unwrap(); + host.precheck_pvf(binary, Default::default()).await.unwrap(); + + let (result, _) = futures::join!( + // Choose an job that would normally take the entire timeout. + host.validate_candidate( + binary, + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ), + // Run a future that kills the job while it's running. + async { + tokio::time::sleep(Duration::from_secs(1)).await; + send_signal_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true, SIGNAL_KILL); + } + ); + + assert_matches!( + result, + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)) + ); + }) + } + + // What happens when the forked prepare job dies in the middle of its job? 
+ #[test] + fn forked_prepare_job_killed_during_job() { + polkadot_node_core_pvf_common::sp_tracing::try_init_simple(); + + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), + // Run a future that kills the job while it's running. + async { + tokio::time::sleep(Duration::from_secs(1)).await; + send_signal_by_sid_and_name(sid, PREPARE_PROCESS_NAME, false, SIGNAL_KILL); + } + ); + + // Note that we get a more specific error if the job died than if the whole worker died. + assert_matches!( + result, + Err(PrepareError::JobDied(err)) if err == "received signal: SIGKILL" + ); + }) + } + + // What happens when the forked execute job dies in the middle of its job? + #[test] + fn forked_execute_job_killed_during_job() { + polkadot_node_core_pvf_common::sp_tracing::try_init_simple(); + + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + // Prepare the artifact ahead of time. + let binary = halt::wasm_binary_unwrap(); + host.precheck_pvf(binary, Default::default()).await.unwrap(); + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.validate_candidate( + binary, + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ), + // Run a future that kills the job while it's running. 
+ async { + tokio::time::sleep(Duration::from_secs(1)).await; + send_signal_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, false, SIGNAL_KILL); + } + ); + + // Note that we get a more specific error if the job died than if the whole worker died. + assert_matches!( + result, + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousJobDeath(err))) + if err == "received signal: SIGKILL" + ); + }) + } +} From cec647701432c1608932be32ce34c8d63ade176c Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 7 Nov 2023 13:07:29 +0100 Subject: [PATCH 41/47] Fix test --- polkadot/node/core/candidate-validation/src/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/core/candidate-validation/src/tests.rs b/polkadot/node/core/candidate-validation/src/tests.rs index 61de8c90e908..cab823e1e637 100644 --- a/polkadot/node/core/candidate-validation/src/tests.rs +++ b/polkadot/node/core/candidate-validation/src/tests.rs @@ -1218,7 +1218,7 @@ fn precheck_properly_classifies_outcomes() { inner(Err(PrepareError::Prevalidation("foo".to_owned())), PreCheckOutcome::Invalid); inner(Err(PrepareError::Preparation("bar".to_owned())), PreCheckOutcome::Invalid); - inner(Err(PrepareError::Panic("baz".to_owned())), PreCheckOutcome::Invalid); + inner(Err(PrepareError::JobError("baz".to_owned())), PreCheckOutcome::Invalid); inner(Err(PrepareError::TimedOut), PreCheckOutcome::Failed); inner(Err(PrepareError::IoErr("fizz".to_owned())), PreCheckOutcome::Failed); From 89cfd2ebd8753518c68e13189bde1da2e688e167 Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Tue, 7 Nov 2023 13:17:12 +0100 Subject: [PATCH 42/47] Make naming of some types more clear --- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index d5cd0d76d463..ad72e11fe7bf 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -302,7 +302,7 @@ fn runtime_construction_check( } #[derive(Encode, Decode)] -struct Response { +struct JobResponse { artifact: CompiledArtifact, memory_stats: MemoryStats, } @@ -327,7 +327,7 @@ struct Response { /// /// - If any error occur, pipe response back with `PrepareError`. /// -/// - If success, pipe back `Response`. +/// - If success, pipe back `JobResponse`. fn handle_child_process( pvf: PvfPrepData, mut pipe_write: PipeWriter, @@ -460,7 +460,7 @@ fn handle_child_process( peak_tracked_alloc: if peak_alloc > 0 { peak_alloc as u64 } else { 0u64 }, }; - Ok(Response { artifact, memory_stats }) + Ok(JobResponse { artifact, memory_stats }) }, } }, @@ -622,9 +622,9 @@ fn get_total_cpu_usage(rusage: Usage) -> Duration { } /// Get a job response. -fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result { +fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result { let response_bytes = framed_recv_blocking(received_data)?; - JobResponse::decode(&mut response_bytes.as_slice()).map_err(|e| { + JobResult::decode(&mut response_bytes.as_slice()).map_err(|e| { io::Error::new( io::ErrorKind::Other, format!("prepare pvf recv_child_response: decode error: {:?}", e), @@ -639,7 +639,7 @@ fn recv_child_response(received_data: &mut io::BufReader<&[u8]>) -> io::Result ! { +fn send_child_response(pipe_write: &mut PipeWriter, response: JobResult) -> ! 
{ framed_send_blocking(pipe_write, response.encode().as_slice()) .unwrap_or_else(|_| process::exit(libc::EXIT_FAILURE)); @@ -654,7 +654,7 @@ fn error_from_errno(context: &'static str, errno: Errno) -> PrepareError { PrepareError::Kernel(format!("{}: {}: {}", context, errno, io::Error::last_os_error())) } -type JobResponse = Result; +type JobResult = Result; /// Pre-encoded length-prefixed `Result::Err(PrepareError::OutOfMemory)` const OOM_PAYLOAD: &[u8] = b"\x02\x00\x00\x00\x00\x00\x00\x00\x01\x08"; @@ -662,7 +662,7 @@ const OOM_PAYLOAD: &[u8] = b"\x02\x00\x00\x00\x00\x00\x00\x00\x01\x08"; #[test] fn pre_encoded_payloads() { // NOTE: This must match the type of `response` in `send_child_response`. - let oom_unencoded: JobResponse = Result::Err(PrepareError::OutOfMemory); + let oom_unencoded: JobResult = Result::Err(PrepareError::OutOfMemory); let oom_encoded = oom_unencoded.encode(); // The payload is prefixed with its length in `framed_send`. let mut oom_payload = oom_encoded.len().to_le_bytes().to_vec(); From 58b814cf17293370e82d466fc1df6f0f5b24ad9e Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Tue, 7 Nov 2023 16:37:24 +0100 Subject: [PATCH 43/47] Update execute worker syscalls --- polkadot/scripts/list-syscalls/execute-worker-syscalls | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/polkadot/scripts/list-syscalls/execute-worker-syscalls b/polkadot/scripts/list-syscalls/execute-worker-syscalls index 4a7a66181299..349af783cf1a 100644 --- a/polkadot/scripts/list-syscalls/execute-worker-syscalls +++ b/polkadot/scripts/list-syscalls/execute-worker-syscalls @@ -16,6 +16,7 @@ 16 (ioctl) 19 (readv) 20 (writev) +22 (pipe) 24 (sched_yield) 25 (mremap) 28 (madvise) @@ -25,7 +26,9 @@ 45 (recvfrom) 46 (sendmsg) 56 (clone) +57 (fork) 60 (exit) +61 (wait4) 62 (kill) 72 (fcntl) 79 (getcwd) @@ -36,6 +39,7 @@ 89 (readlink) 96 (gettimeofday) 97 (getrlimit) +98 (getrusage) 99 (sysinfo) 102 (getuid) 110 (getppid) @@ -47,6 +51,7 @@ 158 (arch_prctl) 165 (mount) 166 (umount2) +186 (gettid) 200 (tkill) 202 (futex) 204 (sched_getaffinity) @@ -60,6 +65,7 @@ 263 (unlinkat) 272 (unshare) 273 (set_robust_list) +293 (pipe2) 302 (prlimit64) 318 (getrandom) 319 (memfd_create) From a28cf2ecaefa659139c2ee20121189ddcce08eaa Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 7 Nov 2023 17:11:03 +0100 Subject: [PATCH 44/47] Update prepare worker syscalls --- polkadot/scripts/list-syscalls/prepare-worker-syscalls | 1 - 1 file changed, 1 deletion(-) diff --git a/polkadot/scripts/list-syscalls/prepare-worker-syscalls b/polkadot/scripts/list-syscalls/prepare-worker-syscalls index 1f8c8527bcc1..05281b61591a 100644 --- a/polkadot/scripts/list-syscalls/prepare-worker-syscalls +++ b/polkadot/scripts/list-syscalls/prepare-worker-syscalls @@ -49,7 +49,6 @@ 144 (sched_setscheduler) 157 (prctl) 158 (arch_prctl) -160 (setrlimit) 165 (mount) 166 (umount2) 186 (gettid) From b03c6954531f0caa9a66bfe129eb4d27b49a4a4b Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Thu, 9 Nov 2023 19:45:46 +0100 Subject: [PATCH 45/47] Address review comments --- polkadot/node/core/pvf/common/src/execute.rs | 4 +++- polkadot/node/core/pvf/common/src/worker/mod.rs | 11 ++++++++--- polkadot/node/core/pvf/execute-worker/src/lib.rs | 6 +++--- polkadot/node/core/pvf/prepare-worker/src/lib.rs | 6 +++--- polkadot/node/core/pvf/src/prepare/pool.rs | 2 +- polkadot/node/core/pvf/src/prepare/worker_intf.rs | 4 ++-- 6 files changed, 20 insertions(+), 13 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/execute.rs b/polkadot/node/core/pvf/common/src/execute.rs index bb1eccfad2f2..89e7c8e471a5 100644 --- a/polkadot/node/core/pvf/common/src/execute.rs +++ b/polkadot/node/core/pvf/common/src/execute.rs @@ -83,7 +83,9 @@ impl JobResponse { } } -/// An unexpected error occurred in the execution job. +/// An unexpected error occurred in the execution job process. Because this comes from the job, +/// which executes untrusted code, this error must likewise be treated as untrusted. That is, we +/// cannot raise an internal error based on this. #[derive(thiserror::Error, Debug, Encode, Decode)] pub enum JobError { #[error("The job timed out")] diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 274a2fc80397..f8a97880d0a8 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -205,9 +205,14 @@ impl fmt::Display for WorkerKind { } } -// The worker version must be passed in so that we accurately get the version of the worker, and not -// the version that this crate was compiled with. -pub fn worker_event_loop( +// NOTE: The worker version must be passed in so that we accurately get the version of the worker, +// and not the version that this crate was compiled with. +// +// NOTE: This must not spawn any threads due to safety requirements in `event_loop`. 
+// +/// Initializes the worker process, then runs the given event loop, which spawns a new job process +/// to securely handle each incoming request. +pub fn run_worker( worker_kind: WorkerKind, socket_path: PathBuf, #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf, diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 2a16f9c1f707..61d4c5538ec6 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -40,9 +40,9 @@ use polkadot_node_core_pvf_common::{ execute::{Handshake, JobError, JobResponse, JobResult, WorkerResponse}, framed_recv_blocking, framed_send_blocking, worker::{ - cpu_time_monitor_loop, stringify_panic_payload, + cpu_time_monitor_loop, run_worker, stringify_panic_payload, thread::{self, WaitOutcome}, - worker_event_loop, WorkerKind, + WorkerKind, }, }; use polkadot_parachain_primitives::primitives::ValidationResult; @@ -141,7 +141,7 @@ pub fn worker_entrypoint( worker_version: Option<&str>, security_status: SecurityStatus, ) { - worker_event_loop( + run_worker( WorkerKind::Execute, socket_path, worker_dir_path, diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index ad72e11fe7bf..6bdb244564b5 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -46,9 +46,9 @@ use polkadot_node_core_pvf_common::{ prepare::{MemoryStats, PrepareJobKind, PrepareStats}, pvf::PvfPrepData, worker::{ - cpu_time_monitor_loop, stringify_panic_payload, + cpu_time_monitor_loop, run_worker, stringify_panic_payload, thread::{self, spawn_worker_thread, WaitOutcome}, - worker_event_loop, WorkerKind, + WorkerKind, }, worker_dir, ProcessTime, SecurityStatus, }; @@ -195,7 +195,7 @@ pub fn worker_entrypoint( worker_version: Option<&str>, security_status: SecurityStatus, ) { - worker_event_loop( + 
run_worker( WorkerKind::Prepare, socket_path, worker_dir_path, diff --git a/polkadot/node/core/pvf/src/prepare/pool.rs b/polkadot/node/core/pvf/src/prepare/pool.rs index a4a44eb50951..8e02f540d321 100644 --- a/polkadot/node/core/pvf/src/prepare/pool.rs +++ b/polkadot/node/core/pvf/src/prepare/pool.rs @@ -334,7 +334,7 @@ fn handle_mux( handle_concluded_no_rip(from_pool, spawned, worker, idle, result), // Return `Concluded`, but do not kill the worker since the error was on the host // side. - Outcome::CreateTmpFile { worker: idle, err } => handle_concluded_no_rip( + Outcome::CreateTmpFileErr { worker: idle, err } => handle_concluded_no_rip( from_pool, spawned, worker, diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index fc83ad3fb5ff..a22fa74b2fe1 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -76,7 +76,7 @@ pub enum Outcome { /// killed by the system. Unreachable, /// The temporary file for the artifact could not be created at the given cache path. - CreateTmpFile { worker: IdleWorker, err: String }, + CreateTmpFileErr { worker: IdleWorker, err: String }, /// The response from the worker is received, but the tmp file cannot be renamed (moved) to the /// final destination location. RenameTmpFile { @@ -305,7 +305,7 @@ where "failed to create a temp file for the artifact: {:?}", err, ); - return Outcome::CreateTmpFile { + return Outcome::CreateTmpFileErr { worker: IdleWorker { stream, pid, worker_dir }, err: format!("{:?}", err), } From 8070074af983ac7cc84c5a69eeb868a3a9fa1c76 Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Fri, 10 Nov 2023 15:06:29 +0100 Subject: [PATCH 46/47] Add tests for num_threads of child processes --- .../node/core/pvf/common/src/worker/mod.rs | 3 +- .../node/core/pvf/execute-worker/src/lib.rs | 3 +- .../node/core/pvf/prepare-worker/src/lib.rs | 3 +- polkadot/node/core/pvf/tests/it/process.rs | 121 ++++++++++++++++-- 4 files changed, 116 insertions(+), 14 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index f8a97880d0a8..44478d4780af 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -208,7 +208,8 @@ impl fmt::Display for WorkerKind { // NOTE: The worker version must be passed in so that we accurately get the version of the worker, // and not the version that this crate was compiled with. // -// NOTE: This must not spawn any threads due to safety requirements in `event_loop`. +// NOTE: This must not spawn any threads due to safety requirements in `event_loop` and to avoid +// errors in [`security::unshare_user_namespace_and_change_root`]. // /// Initializes the worker process, then runs the given event loop, which spawns a new job process /// to securely handle each incoming request. diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index 61d4c5538ec6..9ec811686b89 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -186,7 +186,8 @@ pub fn worker_entrypoint( }, }; - // SAFETY: new process is spawned within a single threaded process + // SAFETY: new process is spawned within a single threaded process. This invariant + // is enforced by tests. 
let response = match unsafe { nix::unistd::fork() } { Err(errno) => internal_error_from_errno("fork", errno), Ok(ForkResult::Child) => { diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index 6bdb244564b5..151b54efc2d1 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -229,7 +229,8 @@ pub fn worker_entrypoint( }, }; - // SAFETY: new process is spawned within a single threaded process + // SAFETY: new process is spawned within a single threaded process. This invariant + // is enforced by tests. let result = match unsafe { nix::unistd::fork() } { Err(errno) => Err(error_from_errno("fork", errno)), Ok(ForkResult::Child) => { diff --git a/polkadot/node/core/pvf/tests/it/process.rs b/polkadot/node/core/pvf/tests/it/process.rs index 9a9b8f547295..725d060ab916 100644 --- a/polkadot/node/core/pvf/tests/it/process.rs +++ b/polkadot/node/core/pvf/tests/it/process.rs @@ -14,12 +14,14 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -//! Test unexpected behaviors of the spawned processes. +//! Test unexpected behaviors of the spawned processes. We test both worker processes (directly +//! spawned by the host) and job processes (spawned by the workers to securely perform PVF jobs). 
use super::TestHost; use assert_matches::assert_matches; use polkadot_node_core_pvf::{InvalidCandidate, PrepareError, ValidationError}; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams}; +use procfs::process; use rusty_fork::rusty_fork_test; use std::time::Duration; @@ -35,8 +37,19 @@ fn send_signal_by_sid_and_name( is_direct_child: bool, signal: i32, ) { - use procfs::process; + let process = find_process_by_sid_and_name(sid, exe_name, is_direct_child); + assert_eq!(unsafe { libc::kill(process.pid(), signal) }, 0); +} +fn get_num_threads_by_sid_and_name(sid: i32, exe_name: &'static str, is_direct_child: bool) -> i64 { + let process = find_process_by_sid_and_name(sid, exe_name, is_direct_child); + process.stat().unwrap().num_threads +} +fn find_process_by_sid_and_name( + sid: i32, + exe_name: &'static str, + is_direct_child: bool, +) -> process::Process { let all_processes: Vec = process::all_processes() .expect("Can't read /proc") .filter_map(|p| match p { @@ -51,7 +64,7 @@ fn send_signal_by_sid_and_name( }) .collect(); - let mut found = 0; + let mut found = None; for process in all_processes { let stat = process.stat().unwrap(); @@ -65,14 +78,17 @@ fn send_signal_by_sid_and_name( continue } - assert_eq!(unsafe { libc::kill(process.pid(), signal) }, 0); - found += 1; + if found.is_some() { + panic!("Found more than one process") + } + found = Some(process); } - assert_eq!(found, 1); + found.expect("Should have found the expected process") } // Run these tests in their own processes with rusty-fork. They work by each creating a new session, -// then killing the worker process that matches the session ID and expected worker name. +// then doing something with the child process that matches the session ID and expected process +// name. rusty_fork_test! { // What happens when the prepare worker (not the job) times out? #[test] @@ -209,8 +225,6 @@ rusty_fork_test! { // What happens when the forked prepare job dies in the middle of its job? 
#[test] fn forked_prepare_job_killed_during_job() { - polkadot_node_core_pvf_common::sp_tracing::try_init_simple(); - let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(async { let host = TestHost::new().await; @@ -240,8 +254,6 @@ rusty_fork_test! { // What happens when the forked execute job dies in the middle of its job? #[test] fn forked_execute_job_killed_during_job() { - polkadot_node_core_pvf_common::sp_tracing::try_init_simple(); - let rt = tokio::runtime::Runtime::new().unwrap(); rt.block_on(async { let host = TestHost::new().await; @@ -281,4 +293,91 @@ rusty_fork_test! { ); }) } + + // Ensure that the spawned prepare worker is single-threaded. + // + // See `run_worker` for why we need this invariant. + #[test] + fn ensure_prepare_processes_have_correct_num_threads() { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + let _ = futures::join!( + // Choose a job that would normally take the entire timeout. + host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), + // Run a future that tests the thread count while the worker is running. + async { + tokio::time::sleep(Duration::from_secs(1)).await; + assert_eq!( + get_num_threads_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true), + 1 + ); + // Child job should have four threads: main thread, prepare thread, CPU time + // monitor, and memory tracking. + assert_eq!( + get_num_threads_by_sid_and_name(sid, PREPARE_PROCESS_NAME, false), + 4 + ); + + // End the test. + send_signal_by_sid_and_name(sid, PREPARE_PROCESS_NAME, true, SIGNAL_KILL); + } + ); + }) + } + + // Ensure that the spawned execute worker is single-threaded. + // + // See `run_worker` for why we need this invariant.
+ #[test] + fn ensure_execute_processes_have_correct_num_threads() { + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + // Prepare the artifact ahead of time. + let binary = halt::wasm_binary_unwrap(); + host.precheck_pvf(binary, Default::default()).await.unwrap(); + + let _ = futures::join!( + // Choose a job that would normally take the entire timeout. + host.validate_candidate( + binary, + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ), + // Run a future that tests the thread count while the worker is running. + async { + tokio::time::sleep(Duration::from_secs(1)).await; + assert_eq!( + get_num_threads_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true), + 1 + ); + // Child job should have three threads: main thread, execute thread, and CPU + // time monitor. + assert_eq!( + get_num_threads_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, false), + 3 + ); + + // End the test. + send_signal_by_sid_and_name(sid, EXECUTE_PROCESS_NAME, true, SIGNAL_KILL); + } + ); + }) + } } From 05555e7f85382f750a47e3d59c8ee8e3944a1e41 Mon Sep 17 00:00:00 2001 From: "Marcin S." 
Date: Fri, 10 Nov 2023 15:37:18 +0100 Subject: [PATCH 47/47] Update impl guide --- .../src/node/utility/pvf-host-and-workers.md | 47 +++++++++++++++---- 1 file changed, 37 insertions(+), 10 deletions(-) diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index 0cefeb1f77ca..d5cad369de72 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -1,7 +1,11 @@ # PVF Host and Workers The PVF host is responsible for handling requests to prepare and execute PVF -code blobs, which it sends to PVF workers running in their own child processes. +code blobs, which it sends to PVF **workers** running in their own child +processes. + +While the workers are generally long-living, they also spawn one-off secure +**job processes** that perform the jobs. See "Job Processes" section below. This system has two high-levels goals that we will touch on here: *determinism* and *security*. @@ -36,8 +40,11 @@ execution request: not successful. 2. **Artifact missing:** The prepared artifact might have been deleted due to operator error or some bug in the system. -3. **Panic:** The worker thread panicked for some indeterminate reason, which - may or may not be independent of the candidate or PVF. +3. **Job errors:** For example, the worker thread panicked for some + indeterminate reason, which may or may not be independent of the candidate or + PVF. +4. **Internal errors:** See "Internal Errors" section. In this case, after the + retry we abstain from voting. ### Preparation timeouts @@ -62,10 +69,16 @@ more than the CPU time. ### Internal errors +An internal, or local, error is one that we treat as independent of the PVF +and/or candidate, i.e. local to the running machine. 
If this happens, then we +will first retry the job and if the error persists, then we simply do not vote. +This prevents slashes, since otherwise our vote may not agree with that of the +other validators. + In general, for errors not raising a dispute we have to be very careful. This is -only sound, if we either: +only sound, if either: -1. Ruled out that error in pre-checking. If something is not checked in +1. We ruled out that error in pre-checking. If something is not checked in pre-checking, even if independent of the candidate and PVF, we must raise a dispute. 2. We are 100% confident that it is a hardware/local issue: Like corrupted file, @@ -75,11 +88,11 @@ Reasoning: Otherwise it would be possible to register a PVF where candidates can not be checked, but we don't get a dispute - so nobody gets punished. Second, we end up with a finality stall that is not going to resolve! -There are some error conditions where we can't be sure whether the candidate is -really invalid or some internal glitch occurred, e.g. panics. Whenever we are -unsure, we can never treat an error as internal as we would abstain from voting. -So we will first retry the candidate, and if the issue persists we are forced to -vote invalid. +Note that any error from the job process we cannot treat as internal. The job +runs untrusted code and an attacker can therefore return arbitrary errors. If +they were to return errors that we treat as internal, they could make us abstain +from voting. Since we are unsure if such errors are legitimate, we will first +retry the candidate, and if the issue persists we are forced to vote invalid. ## Security @@ -119,6 +132,20 @@ So what are we actually worried about? Things that come to mind: 6. **Intercepting and manipulating packages** - Effect very similar to the above, hard to do without also being able to do 4 or 5. +### Job Processes + +As mentioned above, our architecture includes long-living **worker processes** +and one-off **job processes**.
This separation is important so that the handling +of untrusted code can be limited to the job processes. A hijacked job process +can therefore not interfere with other jobs running in separate processes. + +Furthermore, if an unexpected execution error occurred in the worker and not the +job, we generally can be confident that it has nothing to do with the candidate, +so we can abstain from voting. On the other hand, a hijacked job can send back +erroneous responses for candidates, so we know that we should not abstain from +voting on such errors from jobs. Otherwise, an attacker could trigger a finality +stall. (See "Internal Errors" section above.) + ### Restricting file-system access A basic security mechanism is to make sure that any process directly interfacing