fix(net): Limit the number of leftover nonces in the self-connection nonce set #6534

Merged · 6 commits · Apr 18, 2023
63 changes: 48 additions & 15 deletions zebra-network/src/constants.rs
@@ -149,14 +149,48 @@ pub const MAX_RECENT_PEER_AGE: Duration32 = Duration32::from_days(3);
/// Using a prime number makes sure that heartbeats don't synchronise with crawls.
pub const HEARTBEAT_INTERVAL: Duration = Duration::from_secs(59);

/// The minimum time between successive calls to
/// The minimum time between outbound peer connections, implemented by
/// [`CandidateSet::next`][crate::peer_set::CandidateSet::next].
///
/// ## Security
///
/// Zebra resists distributed denial of service attacks by making sure that new peer connections
/// are initiated at least [`MIN_PEER_CONNECTION_INTERVAL`] apart.
pub const MIN_PEER_CONNECTION_INTERVAL: Duration = Duration::from_millis(25);
/// Zebra resists distributed denial of service attacks by making sure that new outbound peer
/// connections are only initiated after this minimum time has elapsed.
///
/// It also enforces a minimum per-peer reconnection interval, and filters failed outbound peers.
pub const MIN_OUTBOUND_PEER_CONNECTION_INTERVAL: Duration = Duration::from_millis(50);

/// The minimum time between _successful_ inbound peer connections, implemented by
/// `peer_set::initialize::accept_inbound_connections`.
///
/// To support multiple peers connecting simultaneously, this is less than the
/// [`HANDSHAKE_TIMEOUT`].
///
/// ## Security
///
/// Zebra resists distributed denial of service attacks by limiting the inbound connection rate.
/// After a _successful_ inbound connection, new inbound peer connections are only accepted,
/// and our side of the handshake initiated, after this minimum time has elapsed.
///
/// The inbound interval is much longer than the outbound interval, because Zebra does not
/// control the selection or reconnections of inbound peers.
pub const MIN_INBOUND_PEER_CONNECTION_INTERVAL: Duration = Duration::from_secs(1);

/// The minimum time between _failed_ inbound peer connections, implemented by
/// `peer_set::initialize::accept_inbound_connections`.
///
/// This is a tradeoff between:
/// - the memory, CPU, and network usage of each new connection attempt, and
/// - denying service to honest peers due to an attack which makes many inbound connections.
///
/// Attacks that reach this limit should be managed using a firewall or intrusion prevention system.
///
/// ## Security
///
/// Zebra resists distributed denial of service attacks by limiting the inbound connection rate.
/// After a _failed_ inbound connection, new inbound peer connections are only accepted,
/// and our side of the handshake initiated, after this minimum time has elapsed.
pub const MIN_INBOUND_PEER_FAILED_CONNECTION_INTERVAL: Duration = Duration::from_millis(10);

/// The minimum time between successive calls to
/// [`CandidateSet::update`][crate::peer_set::CandidateSet::update].
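
For context, here is a minimal sketch of how the two inbound intervals above might gate an accept loop, in the spirit of `peer_set::initialize::accept_inbound_connections`. The loop body and error handling are simplified assumptions, not Zebra's actual code:

    use std::time::Duration;
    use tokio::{net::TcpListener, time::sleep};

    const MIN_INBOUND_PEER_CONNECTION_INTERVAL: Duration = Duration::from_secs(1);
    const MIN_INBOUND_PEER_FAILED_CONNECTION_INTERVAL: Duration = Duration::from_millis(10);

    async fn accept_inbound_connections(listener: TcpListener) {
        loop {
            match listener.accept().await {
                Ok((_stream, _addr)) => {
                    // A real implementation would spawn the handshake here.
                    //
                    // After a successful accept, rate-limit new inbound
                    // connections with the longer interval, because we don't
                    // control the selection or reconnections of inbound peers.
                    sleep(MIN_INBOUND_PEER_CONNECTION_INTERVAL).await;
                }
                Err(_error) => {
                    // After a failure, use the shorter interval: long enough to
                    // bound resource usage, short enough not to deny service to
                    // honest peers during an attack.
                    sleep(MIN_INBOUND_PEER_FAILED_CONNECTION_INTERVAL).await;
                }
            }
        }
    }
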
@@ -324,9 +358,6 @@ pub mod magics {

#[cfg(test)]
mod tests {

use std::convert::TryFrom;

use zebra_chain::parameters::POST_BLOSSOM_POW_TARGET_SPACING;

use super::*;
@@ -363,18 +394,20 @@ mod tests {
"The EWMA decay time should be higher than the request timeout, so timed out peers are penalised by the EWMA.");

assert!(
u32::try_from(MAX_ADDRS_IN_ADDRESS_BOOK).expect("fits in u32")
* MIN_PEER_CONNECTION_INTERVAL
< MIN_PEER_RECONNECTION_DELAY,
"each peer should get at least one connection attempt in each connection interval",
MIN_PEER_RECONNECTION_DELAY.as_secs() as f32
/ (u32::try_from(MAX_ADDRS_IN_ADDRESS_BOOK).expect("fits in u32")
* MIN_OUTBOUND_PEER_CONNECTION_INTERVAL)
.as_secs() as f32
>= 0.5,
"most peers should get a connection attempt in each connection interval",
);

assert!(
MIN_PEER_RECONNECTION_DELAY.as_secs()
MIN_PEER_RECONNECTION_DELAY.as_secs() as f32
/ (u32::try_from(MAX_ADDRS_IN_ADDRESS_BOOK).expect("fits in u32")
* MIN_PEER_CONNECTION_INTERVAL)
.as_secs()
<= 2,
* MIN_OUTBOUND_PEER_CONNECTION_INTERVAL)
.as_secs() as f32
<= 2.0,
"each peer should only have a few connection attempts in each connection interval",
);
}
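
As a worked example of what the new ratio assertions allow, here is a sketch with illustrative values; the real `MIN_PEER_RECONNECTION_DELAY` and `MAX_ADDRS_IN_ADDRESS_BOOK` values may differ:

    #[test]
    fn illustrative_ratio_example() {
        // Hypothetical values, for illustration only:
        // MIN_PEER_RECONNECTION_DELAY = 75 s, MAX_ADDRS_IN_ADDRESS_BOOK = 3000.
        let reconnection_delay_secs: f32 = 75.0;
        let one_pass_secs: f32 = 3000.0 * 0.050; // 3000 addresses * 50 ms = 150 s

        // ratio = 75 / 150 = 0.5: most peers get a connection attempt in each
        // interval (>= 0.5), and no peer gets more than a few (<= 2.0).
        let ratio = reconnection_delay_secs / one_pass_secs;
        assert!((0.5..=2.0).contains(&ratio));
    }
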
51 changes: 45 additions & 6 deletions zebra-network/src/peer/handshake.rs
@@ -2,7 +2,6 @@

use std::{
cmp::min,
collections::HashSet,
fmt,
future::Future,
net::{IpAddr, Ipv4Addr, SocketAddr},
@@ -14,6 +13,7 @@ use std::{

use chrono::{TimeZone, Utc};
use futures::{channel::oneshot, future, pin_mut, FutureExt, SinkExt, StreamExt};
use indexmap::IndexSet;
use tokio::{
io::{AsyncRead, AsyncWrite},
sync::broadcast,
@@ -48,6 +48,9 @@ use crate::{
BoxError, Config, VersionMessage,
};

#[cfg(test)]
mod tests;

/// A [`Service`] that handshakes with a remote peer and constructs a
/// client/server pair.
///
@@ -71,7 +74,7 @@
address_book_updater: tokio::sync::mpsc::Sender<MetaAddrChange>,
inv_collector: broadcast::Sender<InventoryChange>,
minimum_peer_version: MinimumPeerVersion<C>,
nonces: Arc<futures::lock::Mutex<HashSet<Nonce>>>,
nonces: Arc<futures::lock::Mutex<IndexSet<Nonce>>>,

parent_span: Span,
}
@@ -515,7 +518,7 @@
let (tx, _rx) = tokio::sync::mpsc::channel(1);
tx
});
let nonces = Arc::new(futures::lock::Mutex::new(HashSet::new()));
let nonces = Arc::new(futures::lock::Mutex::new(IndexSet::new()));
let user_agent = self.user_agent.unwrap_or_default();
let our_services = self.our_services.unwrap_or_else(PeerServices::empty);
let relay = self.relay.unwrap_or(false);
@@ -572,7 +575,7 @@ pub async fn negotiate_version<PeerTransport>(
peer_conn: &mut Framed<PeerTransport, Codec>,
connected_addr: &ConnectedAddr,
config: Config,
nonces: Arc<futures::lock::Mutex<HashSet<Nonce>>>,
nonces: Arc<futures::lock::Mutex<IndexSet<Nonce>>>,
user_agent: String,
our_services: PeerServices,
relay: bool,
@@ -583,12 +586,43 @@
{
// Create a random nonce for this connection
let local_nonce = Nonce::default();

// Insert the nonce for this handshake into the shared nonce set.
// Each connection has its own connection state, and handshakes execute concurrently.
//
// # Correctness
//
// It is ok to wait for the lock here, because handshakes have a short
// timeout, and the async mutex will be released when the task times
// out.
nonces.lock().await.insert(local_nonce);
//
// Duplicate nonces don't matter here, because 64-bit random collisions are very rare.
// If they happen, we're probably replacing a leftover nonce from a failed connection,
// which wasn't cleaned up when it closed.
{
let mut locked_nonces = nonces.lock().await;
locked_nonces.insert(local_nonce);

// # Security
//
// Limit the amount of memory used for nonces.
// Nonces can be left in the set if the connection fails or times out between
// the nonce being inserted, and it being removed.
//
// Zebra has strict connection limits, so we limit the number of nonces to
// the configured connection limit.
// This is a tradeoff between:
// - avoiding memory denial of service attacks which make large numbers of connections,
// for example, 100 failed inbound connections take at least 1 second.
// - memory usage: 16 bytes per `Nonce`, 3.2 kB for 200 nonces
// - collision probability: ~2^32 random 64-bit nonces give a ~50% collision probability,
//   so we use a much lower limit
// <https://en.wikipedia.org/wiki/Birthday_problem#Probability_of_a_shared_birthday_(collision)>
while locked_nonces.len() > config.peerset_total_connection_limit() {
locked_nonces.shift_remove_index(0);
}

std::mem::drop(locked_nonces);
};
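// (Illustrative aside, not part of this diff: by the birthday approximation,
// the collision probability for n random 64-bit nonces is about
// n * (n - 1) / 2^65. For n = 200 leftover nonces that is roughly 1.1e-15,
// so a limit near the connection limit keeps collisions negligible.)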

// Don't leak our exact clock skew to our peers. On the other hand,
// we can't deviate too much, or zcashd will get confused.
@@ -684,10 +718,15 @@
// We must wait for the lock before we continue with the connection, to avoid
// self-connection. If the connection times out, the async lock will be
// released.
//
// # Security
//
// Connections that get a `Version` message will remove their nonces here,
// but connections that fail before this point can leave their nonces in the nonce set.
let nonce_reuse = {
let mut locked_nonces = nonces.lock().await;
let nonce_reuse = locked_nonces.contains(&remote.nonce);
// Regardless of whether we observed nonce reuse, clean up the nonce set.
// Regardless of whether we observed nonce reuse, remove our own nonce from the nonce set.
locked_nonces.remove(&local_nonce);
nonce_reuse
};
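
The trimming loop above works because `IndexSet` preserves insertion order, so index 0 is always the oldest leftover nonce. A minimal standalone sketch of that FIFO behaviour (the limit of 4 is arbitrary; the real code uses `config.peerset_total_connection_limit()`):

    use indexmap::IndexSet;

    fn main() {
        let limit = 4; // arbitrary illustrative limit
        let mut nonces: IndexSet<u64> = IndexSet::new();

        for nonce in [11, 22, 33, 44, 55, 66] {
            nonces.insert(nonce);

            // Trim the oldest entries first, as in negotiate_version.
            while nonces.len() > limit {
                nonces.shift_remove_index(0);
            }
        }

        // The set keeps the most recently inserted nonces, in insertion order.
        let kept: Vec<u64> = nonces.iter().copied().collect();
        assert_eq!(kept, vec![33, 44, 55, 66]);
    }
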
15 changes: 15 additions & 0 deletions zebra-network/src/peer/handshake/tests.rs
@@ -0,0 +1,15 @@
//! Implements methods for testing [`Handshake`]

use super::*;

impl<S, C> Handshake<S, C>
where
S: Service<Request, Response = Response, Error = BoxError> + Clone + Send + 'static,
S::Future: Send,
C: ChainTip + Clone + Send + 'static,
{
/// Returns the number of connection nonces stored in this [`Handshake`].
pub async fn nonce_count(&self) -> usize {
self.nonces.lock().await.len()
}
}
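
A hypothetical use of this helper: a test can drive many early-failing handshakes and assert that leftover nonces never exceed the connection limit. This is a sketch only; `test_handshake_and_config` and `run_failing_handshake` are assumed test helpers, not real Zebra functions:

    #[tokio::test]
    async fn leftover_nonces_are_bounded() {
        // Assumed helper: builds a Handshake and its Config for testing.
        let (handshake, config) = test_handshake_and_config();
        let limit = config.peerset_total_connection_limit();

        for _ in 0..(2 * limit) {
            // Each early-failing handshake can leave its nonce in the set.
            let _ = run_failing_handshake(&handshake).await;
        }

        // The trimming in negotiate_version keeps the set within the limit.
        assert!(handshake.nonce_count().await <= limit);
    }
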
7 changes: 5 additions & 2 deletions zebra-network/src/peer_set/candidate_set.rs
@@ -1,3 +1,5 @@
//! Candidate peer selection for outbound connections using the [`CandidateSet`].

use std::{cmp::min, sync::Arc};

use chrono::Utc;
@@ -361,7 +363,8 @@ where
///
/// Zebra resists distributed denial of service attacks by making sure that
/// new peer connections are initiated at least
/// [`MIN_PEER_CONNECTION_INTERVAL`][constants::MIN_PEER_CONNECTION_INTERVAL] apart.
/// [`MIN_OUTBOUND_PEER_CONNECTION_INTERVAL`][constants::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL]
/// apart.
///
/// [`Responded`]: crate::PeerAddrState::Responded
pub async fn next(&mut self) -> Option<MetaAddr> {
@@ -397,7 +400,7 @@

// Security: rate-limit new outbound peer connections
sleep_until(self.min_next_handshake).await;
self.min_next_handshake = Instant::now() + constants::MIN_PEER_CONNECTION_INTERVAL;
self.min_next_handshake = Instant::now() + constants::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL;

Some(next_peer)
}
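
The two lines above are the whole outbound rate limit. A self-contained sketch of the same pattern, assuming a tokio runtime; `select_peer` is a placeholder for the real candidate selection logic:

    use tokio::time::{sleep_until, Duration, Instant};

    struct CandidateSet {
        min_next_handshake: Instant,
    }

    impl CandidateSet {
        const MIN_OUTBOUND_PEER_CONNECTION_INTERVAL: Duration = Duration::from_millis(50);

        async fn next(&mut self) -> Option<u64> {
            let next_peer = self.select_peer()?;

            // Security: rate-limit new outbound peer connections by waiting
            // out the minimum interval before returning the next candidate.
            sleep_until(self.min_next_handshake).await;
            self.min_next_handshake =
                Instant::now() + Self::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL;

            Some(next_peer)
        }

        fn select_peer(&mut self) -> Option<u64> {
            Some(42) // placeholder for real candidate selection
        }
    }
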
15 changes: 9 additions & 6 deletions zebra-network/src/peer_set/candidate_set/tests/prop.rs
@@ -1,3 +1,5 @@
//! Randomised property tests for candidate peer selection.

use std::{
env,
net::SocketAddr,
@@ -13,7 +15,7 @@ use tracing::Span;
use zebra_chain::{parameters::Network::*, serialization::DateTime32};

use crate::{
constants::MIN_PEER_CONNECTION_INTERVAL,
constants::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL,
meta_addr::{MetaAddr, MetaAddrChange},
AddressBook, BoxError, Request, Response,
};
@@ -75,7 +77,7 @@ proptest! {
//
// Check that it takes less than the peer set candidate delay,
// and hope that is enough time for test machines with high CPU load.
let less_than_min_interval = MIN_PEER_CONNECTION_INTERVAL - Duration::from_millis(1);
let less_than_min_interval = MIN_OUTBOUND_PEER_CONNECTION_INTERVAL - Duration::from_millis(1);
assert_eq!(runtime.block_on(timeout(less_than_min_interval, candidate_set.next())), Ok(None));
}
}
@@ -112,7 +114,7 @@
// Check rate limiting for initial peers
check_candidates_rate_limiting(&mut candidate_set, initial_candidates).await;
// Sleep more than the rate limiting delay
sleep(MAX_TEST_CANDIDATES * MIN_PEER_CONNECTION_INTERVAL).await;
sleep(MAX_TEST_CANDIDATES * MIN_OUTBOUND_PEER_CONNECTION_INTERVAL).await;
// Check that the next peers are still respecting the rate limiting, without causing a
// burst of reconnections
check_candidates_rate_limiting(&mut candidate_set, extra_candidates).await;
@@ -121,7 +123,7 @@
// Allow enough time for the maximum number of candidates,
// plus some extra time for test machines with high CPU load.
// (The max delay asserts usually fail before hitting this timeout.)
let max_rate_limit_sleep = 3 * MAX_TEST_CANDIDATES * MIN_PEER_CONNECTION_INTERVAL;
let max_rate_limit_sleep = 3 * MAX_TEST_CANDIDATES * MIN_OUTBOUND_PEER_CONNECTION_INTERVAL;
let max_extra_delay = (2 * MAX_TEST_CANDIDATES + 1) * MAX_SLEEP_EXTRA_DELAY;
assert!(runtime.block_on(timeout(max_rate_limit_sleep + max_extra_delay, checks)).is_ok());
}
@@ -161,7 +163,8 @@ where
"rate-limited candidates should not be delayed too long: now: {now:?} max: {maximum_reconnect_instant:?}. Hint: is the test machine overloaded?",
);

minimum_reconnect_instant = now + MIN_PEER_CONNECTION_INTERVAL;
maximum_reconnect_instant = now + MIN_PEER_CONNECTION_INTERVAL + MAX_SLEEP_EXTRA_DELAY;
minimum_reconnect_instant = now + MIN_OUTBOUND_PEER_CONNECTION_INTERVAL;
maximum_reconnect_instant =
now + MIN_OUTBOUND_PEER_CONNECTION_INTERVAL + MAX_SLEEP_EXTRA_DELAY;
}
}