Skip to content
This repository was archived by the owner on Nov 15, 2023. It is now read-only.

Commit d57049f

Browse files
authored
Let the PVF host kill the worker on timeout (#6381)
* Let the PVF host kill the worker on timeout * Fix comment * Fix inaccurate comments; add missing return statement * Fix a comment * Fix comment
1 parent 9cf336a commit d57049f

File tree

5 files changed

+25
-15
lines changed

5 files changed

+25
-15
lines changed

node/core/pvf/src/execute/worker.rs

+3-1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,8 @@ pub enum Outcome {
7474

7575
/// Given the idle token of a worker and parameters of work, communicates with the worker and
7676
/// returns the outcome.
77+
///
78+
/// NOTE: Returning the `HardTimeout` or `IoErr` errors will trigger the child process being killed.
7779
pub async fn start_work(
7880
worker: IdleWorker,
7981
artifact: ArtifactPathId,
@@ -148,7 +150,7 @@ pub async fn start_work(
148150
target: LOG_TARGET,
149151
worker_pid = %pid,
150152
validation_code_hash = ?artifact.id.code_hash,
151-
"execution worker exceeded alloted time for execution",
153+
"execution worker exceeded allotted time for execution",
152154
);
153155
// TODO: This case is not really a hard timeout as the timeout here in the host is
154156
// lenient. Should fix this as part of

node/core/pvf/src/host.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -390,7 +390,7 @@ async fn run(
390390
from_prepare_queue = from_prepare_queue_rx.next() => {
391391
let from_queue = break_if_fatal!(from_prepare_queue.ok_or(Fatal));
392392

393-
// Note that preparation always succeeds.
393+
// Note that the preparation outcome is always reported as concluded.
394394
//
395395
// That's because the error conditions are written into the artifact and will be
396396
// reported at the time of the execution. It potentially, but not necessarily, can

node/core/pvf/src/prepare/pool.rs

+4-1
Original file line numberDiff line numberDiff line change
@@ -294,12 +294,15 @@ fn handle_mux(
294294
Ok(())
295295
},
296296
PoolEvent::StartWork(worker, outcome) => {
297+
// If we receive any outcome other than `Concluded`, we attempt to kill the worker
298+
// process.
297299
match outcome {
298300
Outcome::Concluded { worker: idle, result } => {
299301
let data = match spawned.get_mut(worker) {
300302
None => {
301303
// Perhaps the worker was killed meanwhile and the result is no longer
302-
// relevant.
304+
// relevant. We already send `Rip` when purging if we detect that the
305+
// worker is dead.
303306
return Ok(())
304307
},
305308
Some(data) => data,

node/core/pvf/src/prepare/worker.rs

+7-3
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ enum Selected {
7878

7979
/// Given the idle token of a worker and parameters of work, communicates with the worker and
8080
/// returns the outcome.
81+
///
82+
/// NOTE: Returning the `TimedOut` or `DidNotMakeIt` errors will trigger the child process being
83+
/// killed.
8184
pub async fn start_work(
8285
worker: IdleWorker,
8386
code: Arc<Vec<u8>>,
@@ -149,6 +152,7 @@ pub async fn start_work(
149152
},
150153
};
151154

155+
// NOTE: A `TimedOut` or `DidNotMakeIt` error triggers the child process being killed.
152156
match selected {
153157
// Timed out on the child. This should already be logged by the child.
154158
Selected::Done(Err(PrepareError::TimedOut)) => Outcome::TimedOut,
@@ -162,6 +166,9 @@ pub async fn start_work(
162166
}
163167

164168
/// Handles the case where we successfully received response bytes on the host from the child.
169+
///
170+
/// NOTE: Here we know the artifact exists, but is still located in a temporary file which will be
171+
/// cleared by `with_tmp_file`.
165172
async fn handle_response_bytes(
166173
response_bytes: Vec<u8>,
167174
pid: u32,
@@ -201,9 +208,6 @@ async fn handle_response_bytes(
201208
);
202209

203210
// Return a timeout error.
204-
//
205-
// NOTE: The artifact exists, but is located in a temporary file which
206-
// will be cleared by `with_tmp_file`.
207211
return Selected::Deadline
208212
}
209213

node/core/pvf/src/worker_common.rs

+10-9
Original file line numberDiff line numberDiff line change
@@ -199,10 +199,8 @@ where
199199
}
200200

201201
/// Loop that runs in the CPU time monitor thread on prepare and execute jobs. Continuously wakes up
202-
/// from sleeping and then either sleeps for the remaining CPU time, or kills the process if we
203-
/// exceed the CPU timeout.
204-
///
205-
/// NOTE: Killed processes are detected and cleaned up in `purge_dead`.
202+
/// from sleeping and then either sleeps for the remaining CPU time, or sends back a timeout error
203+
/// if we exceed the CPU timeout.
206204
///
207205
/// NOTE: If the job completes and this thread is still sleeping, it will continue sleeping in the
208206
/// background. When it wakes, it will see that the flag has been set and return.
@@ -233,7 +231,11 @@ pub async fn cpu_time_monitor_loop(
233231
timeout.as_millis(),
234232
);
235233

236-
// Send back a TimedOut error on timeout.
234+
// Send back a `TimedOut` error.
235+
//
236+
// NOTE: This will cause the worker, whether preparation or execution, to be killed by
237+
// the host. We do not kill the process here because it would interfere with the proper
238+
// handling of this error.
237239
let encoded_result = match job_kind {
238240
JobKind::Prepare => {
239241
let result: Result<(), PrepareError> = Err(PrepareError::TimedOut);
@@ -244,8 +246,8 @@ pub async fn cpu_time_monitor_loop(
244246
result.encode()
245247
},
246248
};
247-
// If we error there is nothing else we can do here, and we are killing the process,
248-
// anyway. The receiving side will just have to time out.
249+
// If we error here there is nothing we can do apart from log it. The receiving side
250+
// will just have to time out.
249251
if let Err(err) = framed_send(&mut stream, encoded_result.as_slice()).await {
250252
gum::warn!(
251253
target: LOG_TARGET,
@@ -255,8 +257,7 @@ pub async fn cpu_time_monitor_loop(
255257
);
256258
}
257259

258-
// Kill the process.
259-
std::process::exit(1);
260+
return
260261
}
261262

262263
// Sleep for the remaining CPU time, plus a bit to account for overhead. Note that the sleep

0 commit comments

Comments
 (0)