From a73b44f2230b6e948c91021437fb2f337e0b74bf Mon Sep 17 00:00:00 2001 From: Marcin S Date: Wed, 4 Jan 2023 15:45:17 -0500 Subject: [PATCH] Log PVF retries --- node/core/candidate-validation/src/lib.rs | 18 ++++++++++++++---- node/core/pvf/src/host.rs | 22 +++++++++++++++++++++- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/node/core/candidate-validation/src/lib.rs b/node/core/candidate-validation/src/lib.rs index 70fc24eacade..743a053f2ec7 100644 --- a/node/core/candidate-validation/src/lib.rs +++ b/node/core/candidate-validation/src/lib.rs @@ -604,6 +604,7 @@ async fn validate_candidate_exhaustive( #[async_trait] trait ValidationBackend { + /// Tries executing a PVF a single time (no retries). async fn validate_candidate( &mut self, pvf: Pvf, @@ -611,6 +612,8 @@ trait ValidationBackend { encoded_params: Vec, ) -> Result; + /// Tries executing a PVF. Will retry once if an error is encountered that may have been + /// transient. async fn validate_candidate_with_retry( &mut self, raw_validation_code: Vec, @@ -620,7 +623,7 @@ trait ValidationBackend { // Construct the PVF a single time, since it is an expensive operation. Cloning it is cheap. let pvf = Pvf::from_code(raw_validation_code); - let validation_result = + let mut validation_result = self.validate_candidate(pvf.clone(), timeout, params.encode()).await; // If we get an AmbiguousWorkerDeath error, retry once after a brief delay, on the @@ -630,12 +633,19 @@ trait ValidationBackend { { // Wait a brief delay before retrying. futures_timer::Delay::new(PVF_EXECUTION_RETRY_DELAY).await; + + gum::debug!( + target: LOG_TARGET, + ?pvf, + "Re-trying failed candidate validation due to AmbiguousWorkerDeath." + ); + // Encode the params again when re-trying. We expect the retry case to be relatively // rare, and we want to avoid unconditionally cloning data. - self.validate_candidate(pvf, timeout, params.encode()).await - } else { - validation_result + validation_result = self.validate_candidate(pvf, timeout, params.encode()).await; } + + validation_result } async fn precheck_pvf(&mut self, pvf: Pvf) -> Result; diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs index d7823ac44c77..96aed4eae7a8 100644 --- a/node/core/pvf/src/host.rs +++ b/node/core/pvf/src/host.rs @@ -525,6 +525,16 @@ async fn handle_execute_pvf( }, ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => { if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) { + gum::debug!( + target: LOG_TARGET, + ?pvf, + ?artifact_id, + ?last_time_failed, + %num_failures, + %error, + "handle_execute_pvf: Re-trying failed PVF preparation." + ); + // If we are allowed to retry the failed prepare job, change the state to // Preparing and re-queue this job. *state = ArtifactState::Preparing { @@ -585,6 +595,16 @@ async fn handle_heads_up( }, ArtifactState::FailedToProcess { last_time_failed, num_failures, error } => { if can_retry_prepare_after_failure(*last_time_failed, *num_failures, error) { + gum::debug!( + target: LOG_TARGET, + ?active_pvf, + ?artifact_id, + ?last_time_failed, + %num_failures, + %error, + "handle_heads_up: Re-trying failed PVF preparation." + ); + // If we are allowed to retry the failed prepare job, change the state to // Preparing and re-queue this job. *state = ArtifactState::Preparing { @@ -1393,7 +1413,7 @@ mod tests { } // Test that multiple execution requests don't trigger preparation retries if the first one - // failed due to reproducible error (e.g. Prevalidation). + // failed due to a reproducible error (e.g. Prevalidation). #[async_std::test] async fn test_execute_prepare_no_retry() { let mut test = Builder::default().build();