From 623b68e10fb5e592ef6ea32b9c442d0312f9c8f4 Mon Sep 17 00:00:00 2001 From: Alexandru Vasile <60601340+lexnv@users.noreply.github.com> Date: Mon, 22 Jul 2024 11:34:07 +0300 Subject: [PATCH] litep2p: Improve metric reasons for RequestResponse errors (#5077) This PR improves the metrics reported by litep2p on request-response errors. Discovered while investigating: - https://github.com/paritytech/polkadot-sdk/issues/4985 We are experiencing many requests that are `Refused` by litep2p in comparison with libp2p. The metric roughly approximates the sum of other reasons from libp2p. This PR aims to provide more insights. ``` {__name__="substrate_sub_libp2p_requests_out_failure_total", chain="ksmcc3", instance="localhost:9615", job="substrate_node", protocol="/b0a8d493285c2df73290dfb7e61f870f17b41801197a149ca93654499ea3dafe/sync/2", reason="Remote has closed the substream before answering, thereby signaling that it considers the request as valid, but refused to answer it."} Last *: 3365 Min: 3363 Max: 3365 Mean: 3365 {__name__="substrate_sub_libp2p_requests_out_failure_total", chain="ksmcc3", instance="localhost:9615", job="substrate_node", protocol="/b0a8d493285c2df73290dfb7e61f870f17b41801197a149ca93654499ea3dafe/beefy/justifications/1", reason="Remote has closed the substream before answering, thereby signaling that it considers the request as valid, but refused to answer it."} Last *: 3461 Min: 3461 Max: 3461 Mean: 3461 ``` Part of: - https://github.com/paritytech/polkadot-sdk/issues/4681 cc @paritytech/networking --------- Signed-off-by: Alexandru Vasile --- .../src/litep2p/shim/request_response/mod.rs | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/substrate/client/network/src/litep2p/shim/request_response/mod.rs b/substrate/client/network/src/litep2p/shim/request_response/mod.rs index 82d01c01236f..a77acb464144 100644 --- a/substrate/client/network/src/litep2p/shim/request_response/mod.rs +++ b/substrate/client/network/src/litep2p/shim/request_response/mod.rs @@ -24,7 +24,7 @@ use crate::{ peer_store::PeerStoreProvider, request_responses::{IncomingRequest, OutgoingResponse}, service::{metrics::Metrics, traits::RequestResponseConfig as RequestResponseConfigT}, - IfDisconnected, ProtocolName, RequestFailure, + IfDisconnected, OutboundFailure, ProtocolName, RequestFailure, }; use futures::{channel::oneshot, future::BoxFuture, stream::FuturesUnordered, StreamExt}; @@ -369,10 +369,12 @@ impl RequestResponseProtocol { return }; - let error = match error { - RequestResponseError::NotConnected => Some(RequestFailure::NotConnected), - RequestResponseError::Rejected | RequestResponseError::Timeout => - Some(RequestFailure::Refused), + let status = match error { + RequestResponseError::NotConnected => + Some((RequestFailure::NotConnected, "not-connected")), + RequestResponseError::Rejected => Some((RequestFailure::Refused, "rejected")), + RequestResponseError::Timeout => + Some((RequestFailure::Network(OutboundFailure::Timeout), "timeout")), RequestResponseError::Canceled => { log::debug!( target: LOG_TARGET, @@ -387,7 +389,7 @@ impl RequestResponseProtocol { "{}: tried to send too large request to {peer:?} ({request_id:?})", self.protocol, ); - Some(RequestFailure::Refused) + Some((RequestFailure::Refused, "payload-too-large")) }, RequestResponseError::UnsupportedProtocol => match fallback_request { Some((request, protocol)) => match self.request_tx.get(&protocol) { @@ -426,15 +428,15 @@ impl RequestResponseProtocol { peer, ); - Some(RequestFailure::Refused) + Some((RequestFailure::Refused, "invalid-fallback-protocol")) }, }, - None => Some(RequestFailure::Refused), + None => Some((RequestFailure::Refused, "unsupported-protocol")), }, }; - if let Some(error) = error { - self.metrics.register_outbound_request_failure(error.to_string().as_ref()); + if let Some((error, reason)) = status { + self.metrics.register_outbound_request_failure(reason); let _ = tx.send(Err(error)); } }