From af3fa4a19edd023e8aa4c4412b0fbfad0364919d Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Tue, 16 Jul 2024 12:51:09 +0300 Subject: [PATCH] [1 / 5] Optimize logic for gossiping assignments (#4848) This is part of the work to further optimize the approval subsystems, if you want to understand the full context start with reading https://github.com/paritytech/polkadot-sdk/pull/4849#issue-2364261568, however that's not necessary, as this change is self-contained and nodes would benefit from it regardless of subsequent changes landing or not. While testing with 1000 validators I found out that the logic for determining the validators an assignment should be gossiped to is taking a lot of time, because it always iterated through all the peers, to determine which are X and Y neighbours and to which we should randomly gossip(4 samples). This could be actually optimised, so we don't have to iterate through all peers for each new assignment, by fetching the list of X and Y peer ids from the topology first and then stopping the loop once we took the 4 random samples. With this improvements we reduce the total CPU time spent in approval-distribution with 15% on networks with 500 validators and 20% on networks with 1000 validators. ## Test coverage: `propagates_assignments_along_unshared_dimension` and `propagates_locally_generated_assignment_to_both_dimensions` cover already logic and they passed, confirm that there is no breaking change. Additionally, the approval voting benchmark measure the traffic sent to other peers, so I confirmed that for various network size there is no difference in the size of the traffic sent to other peers. --------- Signed-off-by: Alexandru Gheorghe --- .../network/approval-distribution/src/lib.rs | 30 +++++++++++++------ .../approval-distribution/src/tests.rs | 12 +++++--- .../network/protocol/src/grid_topology.rs | 22 ++++++++++++++ prdoc/pr_4848.prdoc | 14 +++++++++ 4 files changed, 65 insertions(+), 13 deletions(-) create mode 100644 prdoc/pr_4848.prdoc diff --git a/polkadot/node/network/approval-distribution/src/lib.rs b/polkadot/node/network/approval-distribution/src/lib.rs index 369d82b45b094..d48fb08a311c7 100644 --- a/polkadot/node/network/approval-distribution/src/lib.rs +++ b/polkadot/node/network/approval-distribution/src/lib.rs @@ -1431,6 +1431,21 @@ impl State { let required_routing = topology.map_or(RequiredRouting::PendingTopology, |t| { t.local_grid_neighbors().required_routing_by_index(validator_index, local) }); + // Peers that we will send the assignment to. + let mut peers = HashSet::new(); + + let peers_to_route_to = topology + .as_ref() + .map(|t| t.peers_to_route(required_routing)) + .unwrap_or_default(); + + for peer in peers_to_route_to { + if !entry.known_by.contains_key(&peer) { + continue + } + + peers.insert(peer); + } // All the peers that know the relay chain block. let peers_to_filter = entry.known_by(); @@ -1456,20 +1471,13 @@ impl State { let n_peers_total = self.peer_views.len(); let source_peer = source.peer_id(); - // Peers that we will send the assignment to. - let mut peers = Vec::new(); - // Filter destination peers for peer in peers_to_filter.into_iter() { if Some(peer) == source_peer { continue } - if let Some(true) = topology - .as_ref() - .map(|t| t.local_grid_neighbors().route_to_peer(required_routing, &peer)) - { - peers.push(peer); + if peers.contains(&peer) { continue } @@ -1485,7 +1493,11 @@ impl State { if route_random { approval_entry.routing_info_mut().mark_randomly_sent(peer); - peers.push(peer); + peers.insert(peer); + } + + if approval_entry.routing_info().random_routing.is_complete() { + break } } diff --git a/polkadot/node/network/approval-distribution/src/tests.rs b/polkadot/node/network/approval-distribution/src/tests.rs index 5ad034464767e..2d08807f97b60 100644 --- a/polkadot/node/network/approval-distribution/src/tests.rs +++ b/polkadot/node/network/approval-distribution/src/tests.rs @@ -2404,7 +2404,7 @@ fn propagates_locally_generated_assignment_to_both_dimensions() { let assignments = vec![(cert.clone(), candidate_index)]; let approvals = vec![approval.clone()]; - let assignment_sent_peers = assert_matches!( + let mut assignment_sent_peers = assert_matches!( overseer_recv(overseer).await, AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendValidationMessage( sent_peers, @@ -2428,12 +2428,14 @@ fn propagates_locally_generated_assignment_to_both_dimensions() { assert_matches!( overseer_recv(overseer).await, AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendValidationMessage( - sent_peers, + mut sent_peers, Versioned::V1(protocol_v1::ValidationProtocol::ApprovalDistribution( protocol_v1::ApprovalDistributionMessage::Approvals(sent_approvals) )) )) => { // Random sampling is reused from the assignment. + sent_peers.sort(); + assignment_sent_peers.sort(); assert_eq!(sent_peers, assignment_sent_peers); assert_eq!(sent_approvals, approvals); } @@ -2678,7 +2680,7 @@ fn propagates_to_required_after_connect() { let assignments = vec![(cert.clone(), candidate_index)]; let approvals = vec![approval.clone()]; - let assignment_sent_peers = assert_matches!( + let mut assignment_sent_peers = assert_matches!( overseer_recv(overseer).await, AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendValidationMessage( sent_peers, @@ -2702,12 +2704,14 @@ fn propagates_to_required_after_connect() { assert_matches!( overseer_recv(overseer).await, AllMessages::NetworkBridgeTx(NetworkBridgeTxMessage::SendValidationMessage( - sent_peers, + mut sent_peers, Versioned::V1(protocol_v1::ValidationProtocol::ApprovalDistribution( protocol_v1::ApprovalDistributionMessage::Approvals(sent_approvals) )) )) => { // Random sampling is reused from the assignment. + sent_peers.sort(); + assignment_sent_peers.sort(); assert_eq!(sent_peers, assignment_sent_peers); assert_eq!(sent_approvals, approvals); } diff --git a/polkadot/node/network/protocol/src/grid_topology.rs b/polkadot/node/network/protocol/src/grid_topology.rs index a14d24610722b..4dd7d29fc25cd 100644 --- a/polkadot/node/network/protocol/src/grid_topology.rs +++ b/polkadot/node/network/protocol/src/grid_topology.rs @@ -313,6 +313,23 @@ impl SessionGridTopologyEntry { self.topology.is_validator(peer) } + /// Returns the list of peers to route based on the required routing. + pub fn peers_to_route(&self, required_routing: RequiredRouting) -> Vec { + match required_routing { + RequiredRouting::All => self.topology.peer_ids.iter().copied().collect(), + RequiredRouting::GridX => self.local_neighbors.peers_x.iter().copied().collect(), + RequiredRouting::GridY => self.local_neighbors.peers_y.iter().copied().collect(), + RequiredRouting::GridXY => self + .local_neighbors + .peers_x + .iter() + .chain(self.local_neighbors.peers_y.iter()) + .copied() + .collect(), + RequiredRouting::None | RequiredRouting::PendingTopology => Vec::new(), + } + } + /// Updates the known peer ids for the passed authorities ids. pub fn update_authority_ids( &mut self, @@ -524,6 +541,11 @@ impl RandomRouting { pub fn inc_sent(&mut self) { self.sent += 1 } + + /// Returns `true` if we already took all the necessary samples. + pub fn is_complete(&self) -> bool { + self.sent >= self.target + } } /// Routing mode diff --git a/prdoc/pr_4848.prdoc b/prdoc/pr_4848.prdoc new file mode 100644 index 0000000000000..cbc0c8322d772 --- /dev/null +++ b/prdoc/pr_4848.prdoc @@ -0,0 +1,14 @@ +title: Optimize logic for gossiping assignments + +doc: + - audience: Node Dev + description: | + Optimize the logic for gossiping assignments by obtaining the list of peer ids + from the topology instead of iterating through all connected validators, this + gives us a 15% to 20% reduction in cpu usage. + +crates: +- name: polkadot-approval-distribution + bump: minor +- name: polkadot-node-network-protocol + bump: minor \ No newline at end of file