From 665f5eda9a22b35ac8176c51daff42ed4ec47245 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Thu, 18 Apr 2024 14:39:48 +0300 Subject: [PATCH 1/3] Fix next_retry busy waiting on first retry The `next_retry_time` gets populated when a request receives an error timeout or any other error, after thatn next_retry would check all requests in the queue returns the smallest one, which then gets used to move the main loop by creating a Delay ``` futures_timer::Delay::new(instant.saturating_duration_since(Instant::now())).await, ``` However when we retry a task for the first time we still keep in the queue an mark it as in flight so its next_retry_time would be the oldest and it would be small than `now`, so the Delay will always triggers, so that would make the main loop essentially busy wait untill we received a response for the retry request. Fix this by excluding the tasks that are already in-flight. Signed-off-by: Alexandru Gheorghe --- .../statement-distribution/src/v2/requests.rs | 4 +-- .../src/v2/tests/requests.rs | 26 ++++++++++++++++++- 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs index fe270c8a58e8..a74aaef08fa5 100644 --- a/polkadot/node/network/statement-distribution/src/v2/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs @@ -288,7 +288,7 @@ impl RequestManager { /// Returns an instant at which the next request to be retried will be ready. pub fn next_retry_time(&mut self) -> Option { let mut next = None; - for (_id, request) in &self.requests { + for (_id, request) in self.requests.iter().filter(|(_, request)| !request.in_flight) { if let Some(next_retry_time) = request.next_retry_time { if next.map_or(true, |next| next_retry_time < next) { next = Some(next_retry_time); @@ -553,7 +553,6 @@ impl UnhandledResponse { let UnhandledResponse { response: TaggedResponse { identifier, requested_peer, props, response }, } = self; - // handle races if the candidate is no longer known. // this could happen if we requested the candidate under two // different identifiers at the same time, and received a valid @@ -578,7 +577,6 @@ impl UnhandledResponse { Ok(i) => i, Err(_) => unreachable!("requested candidates always have a priority entry; qed"), }; - // Set the next retry time before clearing the `in_flight` flag. entry.next_retry_time = Some(Instant::now() + REQUEST_RETRY_DELAY); entry.in_flight = false; diff --git a/polkadot/node/network/statement-distribution/src/v2/tests/requests.rs b/polkadot/node/network/statement-distribution/src/v2/tests/requests.rs index dc2c8f55290b..8cf139802148 100644 --- a/polkadot/node/network/statement-distribution/src/v2/tests/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/tests/requests.rs @@ -2606,7 +2606,31 @@ fn should_delay_before_retrying_dropped_requests() { // Sleep for the given amount of time. This should reset the delay for the first candidate. futures_timer::Delay::new(REQUEST_RETRY_DELAY).await; - // We re-try the first request. + // We re-try the first request the second time drop it again. + assert_matches!( + overseer.recv().await, + AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendRequests(mut requests, IfDisconnected::ImmediateError)) => { + assert_eq!(requests.len(), 1); + assert_matches!( + requests.pop().unwrap(), + Requests::AttestedCandidateV2(outgoing) => { + assert_eq!(outgoing.peer, Recipient::Peer(peer_c)); + assert_eq!(outgoing.payload.candidate_hash, candidate_hash_1); + assert_eq!(outgoing.payload.mask, mask); + } + ); + } + ); + + assert_matches!( + overseer_recv_with_timeout(&mut overseer, Duration::from_millis(100)).await, + None + ); + + // Sleep for the given amount of time. This should reset the delay for the first candidate. + futures_timer::Delay::new(REQUEST_RETRY_DELAY).await; + + // We re-try the first request, for the third time, so let's answer to it. { let statements = vec![ state From 1b7e282b9bfe11660e5a2b27cea0ec87405cdfa1 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Thu, 18 Apr 2024 15:37:23 +0300 Subject: [PATCH 2/3] Update polkadot/node/network/statement-distribution/src/v2/requests.rs Co-authored-by: Andrei Sandu <54316454+sandreim@users.noreply.github.com> --- polkadot/node/network/statement-distribution/src/v2/requests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs index a74aaef08fa5..108c67a60999 100644 --- a/polkadot/node/network/statement-distribution/src/v2/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs @@ -288,7 +288,7 @@ impl RequestManager { /// Returns an instant at which the next request to be retried will be ready. pub fn next_retry_time(&mut self) -> Option { let mut next = None; - for (_id, request) in self.requests.iter().filter(|(_, request)| !request.in_flight) { + for (_id, request) in self.requests.iter().filter(|(_id, request)| !request.in_flight) { if let Some(next_retry_time) = request.next_retry_time { if next.map_or(true, |next| next_retry_time < next) { next = Some(next_retry_time); From a64bf774d9ed71c26dc7915abd11546e0fb7797f Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Thu, 18 Apr 2024 15:39:26 +0300 Subject: [PATCH 3/3] Address review feedback Signed-off-by: Alexandru Gheorghe --- polkadot/node/network/statement-distribution/src/v2/requests.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/polkadot/node/network/statement-distribution/src/v2/requests.rs b/polkadot/node/network/statement-distribution/src/v2/requests.rs index 108c67a60999..1ed18ffd42a9 100644 --- a/polkadot/node/network/statement-distribution/src/v2/requests.rs +++ b/polkadot/node/network/statement-distribution/src/v2/requests.rs @@ -553,6 +553,7 @@ impl UnhandledResponse { let UnhandledResponse { response: TaggedResponse { identifier, requested_peer, props, response }, } = self; + // handle races if the candidate is no longer known. // this could happen if we requested the candidate under two // different identifiers at the same time, and received a valid @@ -577,6 +578,7 @@ impl UnhandledResponse { Ok(i) => i, Err(_) => unreachable!("requested candidates always have a priority entry; qed"), }; + // Set the next retry time before clearing the `in_flight` flag. entry.next_retry_time = Some(Instant::now() + REQUEST_RETRY_DELAY); entry.in_flight = false;