From 79cc6ffe2830aeef8d2542ab82d117fb8a0bc1ae Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 2 Dec 2022 06:54:25 -0500 Subject: [PATCH 1/5] Let the PVF host kill the worker on timeout --- node/core/pvf/src/execute/worker.rs | 4 +++- node/core/pvf/src/host.rs | 3 ++- node/core/pvf/src/prepare/pool.rs | 5 ++++- node/core/pvf/src/prepare/worker.rs | 10 +++++++--- node/core/pvf/src/worker_common.rs | 13 +++++++------ 5 files changed, 23 insertions(+), 12 deletions(-) diff --git a/node/core/pvf/src/execute/worker.rs b/node/core/pvf/src/execute/worker.rs index 46226a159c26..105accf18e2b 100644 --- a/node/core/pvf/src/execute/worker.rs +++ b/node/core/pvf/src/execute/worker.rs @@ -74,6 +74,8 @@ pub enum Outcome { /// Given the idle token of a worker and parameters of work, communicates with the worker and /// returns the outcome. +/// +/// NOTE: Returning the `HardTimeout` or `IoErr` errors will trigger the child process being killed. pub async fn start_work( worker: IdleWorker, artifact: ArtifactPathId, @@ -148,7 +150,7 @@ pub async fn start_work( target: LOG_TARGET, worker_pid = %pid, validation_code_hash = ?artifact.id.code_hash, - "execution worker exceeded alloted time for execution", + "execution worker exceeded allotted time for execution", ); // TODO: This case is not really a hard timeout as the timeout here in the host is // lenient. Should fix this as part of diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index 483419409448..1c643f662c70 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -390,7 +390,8 @@ async fn run( from_prepare_queue = from_prepare_queue_rx.next() => { let from_queue = break_if_fatal!(from_prepare_queue.ok_or(Fatal)); - // Note that preparation always succeeds. + // Note that unless the worker dies, the preparation outcome is always reported as + // concluded. // // That's because the error conditions are written into the artifact and will be // reported at the time of the execution. It potentially, but not necessarily, can diff --git a/node/core/pvf/src/prepare/pool.rs b/node/core/pvf/src/prepare/pool.rs index 9ba64be97555..6ddf77d7ecae 100644 --- a/node/core/pvf/src/prepare/pool.rs +++ b/node/core/pvf/src/prepare/pool.rs @@ -299,7 +299,8 @@ fn handle_mux( let data = match spawned.get_mut(worker) { None => { // Perhaps the worker was killed meanwhile and the result is no longer - // relevant. + // relevant. We already send `Rip` when purging if we detect that the + // worker is dead. return Ok(()) }, Some(data) => data, @@ -321,6 +322,7 @@ fn handle_mux( Ok(()) }, + // Sending `rip: true` triggers killing the worker. Outcome::DidNotMakeIt => { if attempt_retire(metrics, spawned, worker) { reply( @@ -335,6 +337,7 @@ fn handle_mux( Ok(()) }, + // Sending `rip: true` triggers killing the worker. Outcome::TimedOut => { if attempt_retire(metrics, spawned, worker) { reply( diff --git a/node/core/pvf/src/prepare/worker.rs b/node/core/pvf/src/prepare/worker.rs index 4e0c411e45de..65a6979b7cf2 100644 --- a/node/core/pvf/src/prepare/worker.rs +++ b/node/core/pvf/src/prepare/worker.rs @@ -78,6 +78,9 @@ enum Selected { /// Given the idle token of a worker and parameters of work, communicates with the worker and /// returns the outcome. +/// +/// NOTE: Returning the `TimedOut` or `DidNotMakeIt` errors will trigger the child process being +/// killed. pub async fn start_work( worker: IdleWorker, code: Arc>, @@ -149,6 +152,7 @@ pub async fn start_work( }, }; + // NOTE: A `TimedOut` or `DidNotMakeIt` error triggers the child process being killed. match selected { // Timed out on the child. This should already be logged by the child. Selected::Done(Err(PrepareError::TimedOut)) => Outcome::TimedOut, @@ -162,6 +166,9 @@ pub async fn start_work( } /// Handles the case where we successfully received response bytes on the host from the child. +/// +/// NOTE: Here we know the artifact exists, but is still located in a temporary file. If we error, the temporary +/// artifact will be cleared by `with_tmp_file`. async fn handle_response_bytes( response_bytes: Vec, pid: u32, @@ -201,9 +208,6 @@ async fn handle_response_bytes( ); // Return a timeout error. - // - // NOTE: The artifact exists, but is located in a temporary file which - // will be cleared by `with_tmp_file`. return Selected::Deadline } diff --git a/node/core/pvf/src/worker_common.rs b/node/core/pvf/src/worker_common.rs index 55c91a64424d..f36bb549422c 100644 --- a/node/core/pvf/src/worker_common.rs +++ b/node/core/pvf/src/worker_common.rs @@ -233,7 +233,11 @@ pub async fn cpu_time_monitor_loop( timeout.as_millis(), ); - // Send back a TimedOut error on timeout. + // Send back a `TimedOut` error. + // + // NOTE: This will cause the worker, whether preparation or execution, to be killed by + // the host. We do not kill the process here because it would interfere with the proper + // handling of this error. let encoded_result = match job_kind { JobKind::Prepare => { let result: Result<(), PrepareError> = Err(PrepareError::TimedOut); @@ -244,8 +248,8 @@ pub async fn cpu_time_monitor_loop( result.encode() }, }; - // If we error there is nothing else we can do here, and we are killing the process, - // anyway. The receiving side will just have to time out. + // If we error here there is nothing we can do apart from log it. The receiving side + // will just have to time out. if let Err(err) = framed_send(&mut stream, encoded_result.as_slice()).await { gum::warn!( target: LOG_TARGET, @@ -254,9 +258,6 @@ pub async fn cpu_time_monitor_loop( err ); } - - // Kill the process. - std::process::exit(1); } // Sleep for the remaining CPU time, plus a bit to account for overhead. Note that the sleep From f03599961b3aed6b8785491d19c7e796ccfdef1d Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 2 Dec 2022 07:10:32 -0500 Subject: [PATCH 2/5] Fix comment --- node/core/pvf/src/prepare/worker.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/node/core/pvf/src/prepare/worker.rs b/node/core/pvf/src/prepare/worker.rs index 65a6979b7cf2..91361eacaf26 100644 --- a/node/core/pvf/src/prepare/worker.rs +++ b/node/core/pvf/src/prepare/worker.rs @@ -167,8 +167,8 @@ pub async fn start_work( /// Handles the case where we successfully received response bytes on the host from the child. /// -/// NOTE: Here we know the artifact exists, but is still located in a temporary file. If we error, the temporary -/// artifact will be cleared by `with_tmp_file`. +/// NOTE: Here we know the artifact exists, but is still located in a temporary file which will be +/// cleared by `with_tmp_file`. async fn handle_response_bytes( response_bytes: Vec, pid: u32, From 4b85076697bd8c3405d9fe1707d9c599cffefbb9 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 2 Dec 2022 10:24:41 -0500 Subject: [PATCH 3/5] Fix inaccurate comments; add missing return statement --- node/core/pvf/src/prepare/pool.rs | 4 ++-- node/core/pvf/src/worker_common.rs | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/node/core/pvf/src/prepare/pool.rs b/node/core/pvf/src/prepare/pool.rs index 6ddf77d7ecae..306588eb429a 100644 --- a/node/core/pvf/src/prepare/pool.rs +++ b/node/core/pvf/src/prepare/pool.rs @@ -294,6 +294,8 @@ fn handle_mux( Ok(()) }, PoolEvent::StartWork(worker, outcome) => { + // If we receive any outcome other than `Concluded`, we attempt to kill the worker + // process. match outcome { Outcome::Concluded { worker: idle, result } => { let data = match spawned.get_mut(worker) { @@ -322,7 +324,6 @@ fn handle_mux( Ok(()) }, - // Sending `rip: true` triggers killing the worker. Outcome::DidNotMakeIt => { if attempt_retire(metrics, spawned, worker) { reply( @@ -337,7 +338,6 @@ fn handle_mux( Ok(()) }, - // Sending `rip: true` triggers killing the worker. Outcome::TimedOut => { if attempt_retire(metrics, spawned, worker) { reply( diff --git a/node/core/pvf/src/worker_common.rs b/node/core/pvf/src/worker_common.rs index f36bb549422c..f5df023c6ee0 100644 --- a/node/core/pvf/src/worker_common.rs +++ b/node/core/pvf/src/worker_common.rs @@ -258,6 +258,8 @@ pub async fn cpu_time_monitor_loop( err ); } + + return } // Sleep for the remaining CPU time, plus a bit to account for overhead. Note that the sleep From d0d0c18451aa8d0ce073285a8eb997ef17b4c4b4 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Fri, 2 Dec 2022 10:35:28 -0500 Subject: [PATCH 4/5] Fix a comment --- node/core/pvf/src/host.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index 1c643f662c70..0f2e2b839a80 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -390,8 +390,7 @@ async fn run( from_prepare_queue = from_prepare_queue_rx.next() => { let from_queue = break_if_fatal!(from_prepare_queue.ok_or(Fatal)); - // Note that unless the worker dies, the preparation outcome is always reported as - // concluded. + // Note that the preparation outcome is always reported as concluded. // // That's because the error conditions are written into the artifact and will be // reported at the time of the execution. It potentially, but not necessarily, can From 1ee86959367101dfb44406338c13328e9dff1405 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Mon, 5 Dec 2022 13:58:50 -0500 Subject: [PATCH 5/5] Fix comment --- node/core/pvf/src/worker_common.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/node/core/pvf/src/worker_common.rs b/node/core/pvf/src/worker_common.rs index f5df023c6ee0..f9eaf42dcf67 100644 --- a/node/core/pvf/src/worker_common.rs +++ b/node/core/pvf/src/worker_common.rs @@ -199,10 +199,8 @@ where } /// Loop that runs in the CPU time monitor thread on prepare and execute jobs. Continuously wakes up -/// from sleeping and then either sleeps for the remaining CPU time, or kills the process if we -/// exceed the CPU timeout. -/// -/// NOTE: Killed processes are detected and cleaned up in `purge_dead`. +/// from sleeping and then either sleeps for the remaining CPU time, or sends back a timeout error +/// if we exceed the CPU timeout. /// /// NOTE: If the job completes and this thread is still sleeping, it will continue sleeping in the /// background. When it wakes, it will see that the flag has been set and return.