Skip to content

Commit

Permalink
Auto merge of rust-lang#11881 - ehuss:http-retry, r=epage
Browse files Browse the repository at this point in the history
Add delays to network retries.

This adds a delay to network retries to help guard against short-lived transient errors.

The overview of how this works is: Requests that fail are placed into a queue with a timestamp of when they should be retried. The download code then checks for any requests that need to be retried, and re-injects them back into curl.

Care must be taken in the download loops to track which requests are currently in flight *plus* which requests are waiting to be retried.

This also adds jitter to the retry times so that multiple failed requests don't all flood the server when they are retried. This is a very primitive form of jitter which suffers from clumping, but I think it should be fine.

The max number of retries is raised to 3 in order to bring the total retry time to around 10 seconds. This is intended to address Cloudfront's default negative TTL of 10 seconds. The retry schedule looks like 0.5seconds ± 1 second, then 3.5 seconds then 6.5 seconds, for a total of 10.5 to 11.5 seconds.  If the user configures additional retries, each attempt afterwards has a max delay of 10 seconds.

The retry times are currently not user-configurable. This could potentially be added in the future, but I don't think is particularly necessary for now.

There is an unfortunate amount of duplication between the crate download and HTTP index code. I think in the near future we should work on consolidating that code. That might be challenging, since their use patterns are different, but I think it is feasible.
  • Loading branch information
bors committed Mar 31, 2023
2 parents e589a5f + 4fdea65 commit 0e474cf
Show file tree
Hide file tree
Showing 12 changed files with 585 additions and 136 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ os_info = "3.5.0"
pasetors = { version = "0.6.4", features = ["v3", "paserk", "std", "serde"] }
pathdiff = "0.2"
pretty_env_logger = { version = "0.4", optional = true }
rand = "0.8.5"
rustfix = "0.6.0"
semver = { version = "1.0.3", features = ["serde"] }
serde = { version = "1.0.123", features = ["derive"] }
Expand Down
14 changes: 6 additions & 8 deletions crates/cargo-test-support/src/registry.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ pub struct RegistryBuilder {
/// Write the registry in configuration.
configure_registry: bool,
/// API responders.
custom_responders: HashMap<&'static str, Box<dyn Send + Fn(&Request, &HttpServer) -> Response>>,
custom_responders: HashMap<String, Box<dyn Send + Fn(&Request, &HttpServer) -> Response>>,
/// If nonzero, the git index update to be delayed by the given number of seconds.
delayed_index_update: usize,
}
Expand Down Expand Up @@ -167,10 +167,11 @@ impl RegistryBuilder {
#[must_use]
pub fn add_responder<R: 'static + Send + Fn(&Request, &HttpServer) -> Response>(
mut self,
url: &'static str,
url: impl Into<String>,
responder: R,
) -> Self {
self.custom_responders.insert(url, Box::new(responder));
self.custom_responders
.insert(url.into(), Box::new(responder));
self
}

Expand Down Expand Up @@ -601,7 +602,7 @@ pub struct HttpServer {
addr: SocketAddr,
token: Token,
auth_required: bool,
custom_responders: HashMap<&'static str, Box<dyn Send + Fn(&Request, &HttpServer) -> Response>>,
custom_responders: HashMap<String, Box<dyn Send + Fn(&Request, &HttpServer) -> Response>>,
delayed_index_update: usize,
}

Expand All @@ -621,10 +622,7 @@ impl HttpServer {
api_path: PathBuf,
token: Token,
auth_required: bool,
api_responders: HashMap<
&'static str,
Box<dyn Send + Fn(&Request, &HttpServer) -> Response>,
>,
api_responders: HashMap<String, Box<dyn Send + Fn(&Request, &HttpServer) -> Response>>,
delayed_index_update: usize,
) -> HttpServerHandle {
let listener = TcpListener::bind("127.0.0.1:0").unwrap();
Expand Down
118 changes: 69 additions & 49 deletions src/cargo/core/package.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ use crate::ops;
use crate::util::config::PackageCacheLock;
use crate::util::errors::{CargoResult, HttpNotSuccessful};
use crate::util::interning::InternedString;
use crate::util::network::Retry;
use crate::util::network::retry::{Retry, RetryResult};
use crate::util::network::sleep::SleepTracker;
use crate::util::{self, internal, Config, Progress, ProgressStyle};

pub const MANIFEST_PREAMBLE: &str = "\
Expand Down Expand Up @@ -319,6 +320,8 @@ pub struct Downloads<'a, 'cfg> {
/// Set of packages currently being downloaded. This should stay in sync
/// with `pending`.
pending_ids: HashSet<PackageId>,
/// Downloads that have failed and are waiting to retry again later.
sleeping: SleepTracker<(Download<'cfg>, Easy)>,
/// The final result of each download. A pair `(token, result)`. This is a
/// temporary holding area, needed because curl can report multiple
/// downloads at once, but the main loop (`wait`) is written to only
Expand Down Expand Up @@ -442,6 +445,7 @@ impl<'cfg> PackageSet<'cfg> {
next: 0,
pending: HashMap::new(),
pending_ids: HashSet::new(),
sleeping: SleepTracker::new(),
results: Vec::new(),
progress: RefCell::new(Some(Progress::with_style(
"Downloading",
Expand Down Expand Up @@ -800,7 +804,7 @@ impl<'a, 'cfg> Downloads<'a, 'cfg> {

/// Returns the number of crates that are still downloading.
pub fn remaining(&self) -> usize {
self.pending.len()
self.pending.len() + self.sleeping.len()
}

/// Blocks the current thread waiting for a package to finish downloading.
Expand Down Expand Up @@ -831,51 +835,52 @@ impl<'a, 'cfg> Downloads<'a, 'cfg> {
let ret = {
let timed_out = &dl.timed_out;
let url = &dl.url;
dl.retry
.r#try(|| {
if let Err(e) = result {
// If this error is "aborted by callback" then that's
// probably because our progress callback aborted due to
// a timeout. We'll find out by looking at the
// `timed_out` field, looking for a descriptive message.
// If one is found we switch the error code (to ensure
// it's flagged as spurious) and then attach our extra
// information to the error.
if !e.is_aborted_by_callback() {
return Err(e.into());
}
dl.retry.r#try(|| {
if let Err(e) = result {
// If this error is "aborted by callback" then that's
// probably because our progress callback aborted due to
// a timeout. We'll find out by looking at the
// `timed_out` field, looking for a descriptive message.
// If one is found we switch the error code (to ensure
// it's flagged as spurious) and then attach our extra
// information to the error.
if !e.is_aborted_by_callback() {
return Err(e.into());
}

return Err(match timed_out.replace(None) {
Some(msg) => {
let code = curl_sys::CURLE_OPERATION_TIMEDOUT;
let mut err = curl::Error::new(code);
err.set_extra(msg);
err
}
None => e,
return Err(match timed_out.replace(None) {
Some(msg) => {
let code = curl_sys::CURLE_OPERATION_TIMEDOUT;
let mut err = curl::Error::new(code);
err.set_extra(msg);
err
}
.into());
None => e,
}
.into());
}

let code = handle.response_code()?;
if code != 200 && code != 0 {
let url = handle.effective_url()?.unwrap_or(url);
return Err(HttpNotSuccessful {
code,
url: url.to_string(),
body: data,
}
.into());
let code = handle.response_code()?;
if code != 200 && code != 0 {
let url = handle.effective_url()?.unwrap_or(url);
return Err(HttpNotSuccessful {
code,
url: url.to_string(),
body: data,
}
Ok(data)
})
.with_context(|| format!("failed to download from `{}`", dl.url))?
.into());
}
Ok(data)
})
};
match ret {
Some(data) => break (dl, data),
None => {
self.pending_ids.insert(dl.id);
self.enqueue(dl, handle)?
RetryResult::Success(data) => break (dl, data),
RetryResult::Err(e) => {
return Err(e.context(format!("failed to download from `{}`", dl.url)))
}
RetryResult::Retry(sleep) => {
debug!("download retry {} for {sleep}ms", dl.url);
self.sleeping.push(sleep, (dl, handle));
}
}
};
Expand Down Expand Up @@ -963,6 +968,7 @@ impl<'a, 'cfg> Downloads<'a, 'cfg> {
// actually block waiting for I/O to happen, which we achieve with the
// `wait` method on `multi`.
loop {
self.add_sleepers()?;
let n = tls::set(self, || {
self.set
.multi
Expand All @@ -985,17 +991,31 @@ impl<'a, 'cfg> Downloads<'a, 'cfg> {
if let Some(pair) = results.pop() {
break Ok(pair);
}
assert!(!self.pending.is_empty());
let min_timeout = Duration::new(1, 0);
let timeout = self.set.multi.get_timeout()?.unwrap_or(min_timeout);
let timeout = timeout.min(min_timeout);
self.set
.multi
.wait(&mut [], timeout)
.with_context(|| "failed to wait on curl `Multi`")?;
assert_ne!(self.remaining(), 0);
if self.pending.is_empty() {
let delay = self.sleeping.time_to_next().unwrap();
debug!("sleeping main thread for {delay:?}");
std::thread::sleep(delay);
} else {
let min_timeout = Duration::new(1, 0);
let timeout = self.set.multi.get_timeout()?.unwrap_or(min_timeout);
let timeout = timeout.min(min_timeout);
self.set
.multi
.wait(&mut [], timeout)
.with_context(|| "failed to wait on curl `Multi`")?;
}
}
}

fn add_sleepers(&mut self) -> CargoResult<()> {
for (dl, handle) in self.sleeping.to_retry() {
self.pending_ids.insert(dl.id);
self.enqueue(dl, handle)?;
}
Ok(())
}

fn progress(&self, token: usize, total: u64, cur: u64) -> bool {
let dl = &self.pending[&token].0;
dl.total.set(total);
Expand Down Expand Up @@ -1061,7 +1081,7 @@ impl<'a, 'cfg> Downloads<'a, 'cfg> {
return Ok(());
}
}
let pending = self.pending.len();
let pending = self.remaining();
let mut msg = if pending == 1 {
format!("{} crate", pending)
} else {
Expand Down
2 changes: 1 addition & 1 deletion src/cargo/sources/git/oxide.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ pub fn with_retry_and_progress(
) -> CargoResult<()> {
std::thread::scope(|s| {
let mut progress_bar = Progress::new("Fetch", config);
network::with_retry(config, || {
network::retry::with_retry(config, || {
let progress_root: Arc<gix::progress::tree::Root> =
gix::progress::tree::root::Options {
initial_capacity: 10,
Expand Down
2 changes: 1 addition & 1 deletion src/cargo/sources/git/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -739,7 +739,7 @@ pub fn with_fetch_options(
let ssh_config = config.net_config()?.ssh.as_ref();
let config_known_hosts = ssh_config.and_then(|ssh| ssh.known_hosts.as_ref());
let diagnostic_home_config = config.diagnostic_home_config();
network::with_retry(config, || {
network::retry::with_retry(config, || {
with_authentication(config, url, git_config, |f| {
let port = Url::parse(url).ok().and_then(|url| url.port());
let mut last_update = Instant::now();
Expand Down
Loading

0 comments on commit 0e474cf

Please sign in to comment.