Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

Log PVF retries #6504

Merged
merged 1 commit into from
Jan 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -604,13 +604,16 @@ async fn validate_candidate_exhaustive(

#[async_trait]
trait ValidationBackend {
/// Tries executing a PVF a single time (no retries).
///
/// This is the single-attempt primitive: `validate_candidate_with_retry` calls it once and, if
/// it decides a retry is warranted, calls it a second time with the same `pvf` and `timeout`.
///
/// * `pvf` - the PVF to execute.
/// * `timeout` - passed through unchanged by `validate_candidate_with_retry`; presumably the
///   limit on execution time for this attempt (NOTE(review): confirm against implementors).
/// * `encoded_params` - the already-encoded validation parameters; the retry wrapper passes
///   `params.encode()` here.
///
/// Returns the `WasmValidationResult` on success, or a `ValidationError` if this attempt
/// failed.
async fn validate_candidate(
&mut self,
pvf: Pvf,
timeout: Duration,
encoded_params: Vec<u8>,
) -> Result<WasmValidationResult, ValidationError>;

/// Tries executing a PVF. Will retry once if an error is encountered that may have been
/// transient.
async fn validate_candidate_with_retry(
&mut self,
raw_validation_code: Vec<u8>,
Expand All @@ -620,7 +623,7 @@ trait ValidationBackend {
// Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap.
let pvf = Pvf::from_code(raw_validation_code);

let validation_result =
let mut validation_result =
self.validate_candidate(pvf.clone(), timeout, params.encode()).await;

// If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the
Expand All @@ -630,12 +633,19 @@ trait ValidationBackend {
{
// Wait a brief delay before retrying.
futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await;

gum::debug!(
target: LOG_TARGET,
?pvf,
"Re-trying failed candidate validation due to AmbiguousWorkerDeath."
);

// Encode the params again when re-trying. We expect the retry case to be relatively
// rare, and we want to avoid unconditionally cloning data.
self.validate_candidate(pvf, timeout, params.encode()).await
} else {
validation_result
validation_result = self.validate_candidate(pvf, timeout, params.encode()).await;
}

validation_result
}

/// Pre-checks the given `pvf`.
///
/// Returns a `Duration` on success or a `PrepareError` on failure.
/// NOTE(review): the meaning of the returned `Duration` is not visible from this declaration;
/// presumably it is the time the preparation took. Confirm against the implementors.
async fn precheck_pvf(&mut self, pvf: Pvf) -> Result<Duration, PrepareError>;
Expand Down
22 changes: 21 additions & 1 deletion node/core/pvf/src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,16 @@ async fn handle_execute_pvf(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
target: LOG_TARGET,
?pvf,
?artifact_id,
?last_time_failed,
%num_failures,
%error,
"handle_execute_pvf: Re-trying failed PVF preparation."
);

// If we are allowed to retry the failed prepare job, change the state to
// Preparing and re-queue this job.
*state = ArtifactState::Preparing {
Expand Down Expand Up @@ -585,6 +595,16 @@ async fn handle_heads_up(
},
ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => {
if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) {
gum::debug!(
target: LOG_TARGET,
?active_pvf,
?artifact_id,
?last_time_failed,
%num_failures,
%error,
"handle_heads_up: Re-trying failed PVF preparation."
);

// If we are allowed to retry the failed prepare job, change the state to
// Preparing and re-queue this job.
*state = ArtifactState::Preparing {
Expand Down Expand Up @@ -1393,7 +1413,7 @@ mod tests {
}

// Test that multiple execution requests don't trigger preparation retries if the first one
// failed due to reproducible error (e.g. Prevalidation).
// failed due to a reproducible error (e.g. Prevalidation).
#[async_std::test]
async fn test_execute_prepare_no_retry() {
let mut test = Builder::default().build();
Expand Down