From e1099324e15eeda1e9c13b28d03b456aa9ecc70c Mon Sep 17 00:00:00 2001 From: John DiSanti Date: Tue, 22 Feb 2022 12:58:24 -0800 Subject: [PATCH] Replenish cross-request retry allowance on successful response (#1197) * Replenish cross-request retry allowance on successful response * Update changelog * Rename `NotRetryable` to `UnretryableFailure` and fix credential retry classifiers * Incorporate feedback --- CHANGELOG.next.toml | 18 +++ .../src/http_credential_provider.rs | 8 +- .../aws-config/src/imds/client.rs | 44 +++++- aws/rust-runtime/aws-http/src/retry.rs | 12 +- .../dynamodb/tests/movies.rs | 8 +- rust-runtime/aws-smithy-client/src/retry.rs | 149 +++++++++++------- .../aws-smithy-client/tests/e2e_test.rs | 4 +- rust-runtime/aws-smithy-http/src/retry.rs | 2 +- rust-runtime/aws-smithy-types/src/retry.rs | 7 +- 9 files changed, 173 insertions(+), 79 deletions(-) diff --git a/CHANGELOG.next.toml b/CHANGELOG.next.toml index 5c328bc56d..9e6c415d4d 100644 --- a/CHANGELOG.next.toml +++ b/CHANGELOG.next.toml @@ -11,6 +11,24 @@ # meta = { "breaking" = false, "tada" = false, "bug" = false } # author = "rcoh" +[[aws-sdk-rust]] +message = "Fixed a bug that caused clients to eventually stop retrying. The cross-request retry allowance wasn't being reimbursed upon receiving a successful response, so once this allowance reached zero, no further retries would ever be attempted." +references = ["smithy-rs#1197"] +meta = { "breaking" = false, "tada" = false, "bug" = true } +author = "jdisanti" + +[[smithy-rs]] +message = "Fixed a bug that caused clients to eventually stop retrying. The cross-request retry allowance wasn't being reimbursed upon receiving a successful response, so once this allowance reached zero, no further retries would ever be attempted." +references = ["smithy-rs#1197"] +meta = { "breaking" = false, "tada" = false, "bug" = true } +author = "jdisanti" + +[[smithy-rs]] +message = "`aws_smithy_types::retry::RetryKind` had its `NotRetryable` variant split into `UnretryableFailure` and `Unnecessary`. If you implement the `ClassifyResponse`, then successful responses need to return `Unnecessary`, and failures that shouldn't be retried need to return `UnretryableFailure`." +references = ["smithy-rs#1197"] +meta = { "breaking" = true, "tada" = false, "bug" = false } +author = "jdisanti" + [[smithy-rs]] message = "`aws_smithy_types::primitive::Encoder` is now a struct rather than an enum, but its usage remains the same." references = ["smithy-rs#1209"] diff --git a/aws/rust-runtime/aws-config/src/http_credential_provider.rs b/aws/rust-runtime/aws-config/src/http_credential_provider.rs index ae152fe4e9..cb6a30de2a 100644 --- a/aws/rust-runtime/aws-config/src/http_credential_provider.rs +++ b/aws/rust-runtime/aws-config/src/http_credential_provider.rs @@ -174,7 +174,7 @@ impl ClassifyResponse, SdkError> * - Non-parseable 200 responses. * */ match response { - Ok(_) => RetryKind::NotRetryable, + Ok(_) => RetryKind::Unnecessary, // socket errors, networking timeouts Err(SdkError::DispatchFailure(client_err)) if client_err.is_timeout() || client_err.is_io() => @@ -192,7 +192,7 @@ impl ClassifyResponse, SdkError> { RetryKind::Error(ErrorKind::ServerError) } - Err(_) => RetryKind::NotRetryable, + Err(_) => RetryKind::UnretryableFailure, } } } @@ -260,7 +260,7 @@ mod test { assert_eq!( HttpCredentialRetryPolicy.classify(sdk_result.as_ref()), - RetryKind::NotRetryable + RetryKind::Unnecessary ); assert!(sdk_result.is_ok(), "should be ok: {:?}", sdk_result) @@ -275,7 +275,7 @@ mod test { let sdk_result = sdk_resp(error_response); assert_eq!( HttpCredentialRetryPolicy.classify(sdk_result.as_ref()), - RetryKind::NotRetryable + RetryKind::UnretryableFailure ); let sdk_error = sdk_result.expect_err("should be error"); diff --git a/aws/rust-runtime/aws-config/src/imds/client.rs b/aws/rust-runtime/aws-config/src/imds/client.rs index 6f52aa66b0..76534d3a38 100644 --- a/aws/rust-runtime/aws-config/src/imds/client.rs +++ b/aws/rust-runtime/aws-config/src/imds/client.rs @@ -693,7 +693,8 @@ impl ImdsErrorPolicy { _ if status.is_server_error() => RetryKind::Error(ErrorKind::ServerError), // 401 indicates that the token has expired, this is retryable _ if status.as_u16() == 401 => RetryKind::Error(ErrorKind::ServerError), - _ => RetryKind::NotRetryable, + // This catch-all includes successful responses that fail to parse. These should not be retried. + _ => RetryKind::UnretryableFailure, } } } @@ -710,11 +711,11 @@ impl ImdsErrorPolicy { impl ClassifyResponse, SdkError> for ImdsErrorPolicy { fn classify(&self, response: Result<&SdkSuccess, &SdkError>) -> RetryKind { match response { - Ok(_) => RetryKind::NotRetryable, + Ok(_) => RetryKind::Unnecessary, Err(SdkError::ResponseError { raw, .. }) | Err(SdkError::ServiceError { raw, .. }) => { ImdsErrorPolicy::classify(raw) } - _ => RetryKind::NotRetryable, + _ => RetryKind::UnretryableFailure, } } } @@ -723,20 +724,24 @@ impl ClassifyResponse, SdkError> for ImdsErrorPolicy { pub(crate) mod test { use std::collections::HashMap; use std::error::Error; + use std::io; use std::time::{Duration, UNIX_EPOCH}; use aws_smithy_async::rt::sleep::TokioSleep; use aws_smithy_client::erase::DynConnector; use aws_smithy_client::test_connection::{capture_request, TestConnection}; + use aws_smithy_client::{SdkError, SdkSuccess}; use aws_smithy_http::body::SdkBody; + use aws_smithy_http::operation; + use aws_smithy_types::retry::RetryKind; use aws_types::os_shim_internal::{Env, Fs, ManualTimeSource, TimeSource}; + use http::header::USER_AGENT; use http::Uri; use serde::Deserialize; use tracing_test::traced_test; - use crate::imds::client::{Client, EndpointMode}; + use crate::imds::client::{Client, EndpointMode, ImdsErrorPolicy}; use crate::provider_config::ProviderConfig; - use http::header::USER_AGENT; const TOKEN_A: &str = "AQAEAFTNrA4eEGx0AQgJ1arIq_Cc-t4tWt3fB0Hd8RKhXlKc5ccvhg=="; const TOKEN_B: &str = "alternatetoken=="; @@ -977,6 +982,35 @@ pub(crate) mod test { connection.assert_requests_match(&[]); } + /// Successful responses should classify as `RetryKind::Unnecessary` + #[test] + fn successful_response_properly_classified() { + use aws_smithy_http::retry::ClassifyResponse; + + let policy = ImdsErrorPolicy; + fn response_200() -> operation::Response { + operation::Response::new(imds_response("").map(|_| SdkBody::empty())) + } + let success = SdkSuccess { + raw: response_200(), + parsed: (), + }; + assert_eq!( + RetryKind::Unnecessary, + policy.classify(Ok::<_, &SdkError<()>>(&success)) + ); + + // Emulate a failure to parse the response body (using an io error since it's easy to construct in a test) + let failure = SdkError::<()>::ResponseError { + err: Box::new(io::Error::new(io::ErrorKind::BrokenPipe, "fail to parse")), + raw: response_200(), + }; + assert_eq!( + RetryKind::UnretryableFailure, + policy.classify(Err::<&SdkSuccess<()>, _>(&failure)) + ); + } + // since tokens are sent as headers, the tokens need to be valid header values #[tokio::test] async fn invalid_token() { diff --git a/aws/rust-runtime/aws-http/src/retry.rs b/aws/rust-runtime/aws-http/src/retry.rs index bf289397fe..7affba29aa 100644 --- a/aws/rust-runtime/aws-http/src/retry.rs +++ b/aws/rust-runtime/aws-http/src/retry.rs @@ -58,7 +58,7 @@ where { fn classify(&self, err: Result<&T, &SdkError>) -> RetryKind { let (err, response) = match err { - Ok(_) => return RetryKind::NotRetryable, + Ok(_) => return RetryKind::Unnecessary, Err(SdkError::ServiceError { err, raw }) => (err, raw), Err(SdkError::DispatchFailure(err)) => { return if err.is_timeout() || err.is_io() { @@ -66,10 +66,10 @@ where } else if let Some(ek) = err.is_other() { RetryKind::Error(ek) } else { - RetryKind::NotRetryable + RetryKind::UnretryableFailure } } - Err(_) => return RetryKind::NotRetryable, + Err(_) => return RetryKind::UnretryableFailure, }; if let Some(retry_after_delay) = response .http() @@ -95,7 +95,7 @@ where return RetryKind::Error(ErrorKind::TransientError); }; // TODO(https://github.com/awslabs/smithy-rs/issues/966): IDPCommuncation error needs to be retried - RetryKind::NotRetryable + RetryKind::UnretryableFailure } } @@ -151,7 +151,7 @@ mod test { let test_response = http::Response::new("OK"); assert_eq!( policy.classify(make_err(UnmodeledError, test_response).as_ref()), - RetryKind::NotRetryable + RetryKind::UnretryableFailure ); } @@ -177,7 +177,7 @@ mod test { .unwrap(); assert_eq!( policy.classify(make_err(UnmodeledError, test_resp).as_ref()), - RetryKind::NotRetryable + RetryKind::UnretryableFailure ); } diff --git a/aws/sdk/integration-tests/dynamodb/tests/movies.rs b/aws/sdk/integration-tests/dynamodb/tests/movies.rs index 1eff04da8c..0350b8817c 100644 --- a/aws/sdk/integration-tests/dynamodb/tests/movies.rs +++ b/aws/sdk/integration-tests/dynamodb/tests/movies.rs @@ -125,8 +125,8 @@ where &self, response: Result<&SdkSuccess, &SdkError>, ) -> RetryKind { - match self.inner.classify(response.clone()) { - RetryKind::NotRetryable => (), + match self.inner.classify(response) { + RetryKind::UnretryableFailure | RetryKind::Unnecessary => (), other => return other, }; match response { @@ -142,10 +142,10 @@ where { RetryKind::Explicit(Duration::from_secs(1)) } else { - RetryKind::NotRetryable + RetryKind::Unnecessary } } - _ => RetryKind::NotRetryable, + _ => RetryKind::UnretryableFailure, } } } diff --git a/rust-runtime/aws-smithy-client/src/retry.rs b/rust-runtime/aws-smithy-client/src/retry.rs index ad553a2b4b..faea04b930 100644 --- a/rust-runtime/aws-smithy-client/src/retry.rs +++ b/rust-runtime/aws-smithy-client/src/retry.rs @@ -228,6 +228,8 @@ impl CrossRequestRetryState { } } +type BoxFuture = Pin + Send>>; + /// RetryHandler /// /// Implement retries for an individual request. @@ -253,19 +255,12 @@ impl RetryHandler { /// /// If a retry is specified, this function returns `(next, backoff_duration)` /// If no retry is specified, this function returns None - fn attempt_retry(&self, retry_kind: Result<(), ErrorKind>) -> Option<(Self, Duration)> { - let quota_used = match retry_kind { - Ok(_) => { - self.shared - .quota_release(self.local.last_quota_usage, &self.config); + fn should_retry_error(&self, error_kind: &ErrorKind) -> Option<(Self, Duration)> { + let quota_used = { + if self.local.attempts == self.config.max_attempts { return None; } - Err(e) => { - if self.local.attempts == self.config.max_attempts { - return None; - } - self.shared.quota_acquire(&e, &self.config)? - } + self.shared.quota_acquire(error_kind, &self.config)? }; /* From the retry spec: @@ -291,48 +286,61 @@ impl RetryHandler { Some((next, backoff)) } -} -impl - tower::retry::Policy, SdkSuccess, SdkError> - for RetryHandler -where - Handler: Clone, - R: ClassifyResponse, SdkError>, -{ - type Future = Pin + Send>>; + fn should_retry(&self, retry_kind: &RetryKind) -> Option<(Self, Duration)> { + match retry_kind { + RetryKind::Explicit(dur) => Some((self.clone(), *dur)), + RetryKind::UnretryableFailure => None, + RetryKind::Unnecessary => { + self.shared + .quota_release(self.local.last_quota_usage, &self.config); + None + } + RetryKind::Error(err) => self.should_retry_error(err), + _ => None, + } + } + + fn retry_for(&self, retry_kind: RetryKind) -> Option> { + let (next, dur) = self.should_retry(&retry_kind)?; - fn retry( - &self, - req: &Operation, - result: Result<&SdkSuccess, &SdkError>, - ) -> Option { - let policy = req.retry_policy(); - let retry = policy.classify(result); let sleep = match &self.sleep_impl { Some(sleep) => sleep, None => { - if retry != RetryKind::NotRetryable { + if retry_kind != RetryKind::UnretryableFailure { tracing::debug!("cannot retry because no sleep implementation exists"); } return None; } }; - let (next, dur) = match retry { - RetryKind::Explicit(dur) => (self.clone(), dur), - RetryKind::NotRetryable => return None, - RetryKind::Error(err) => self.attempt_retry(Err(err))?, - _ => return None, - }; - let sleep_future = sleep.sleep(dur); let fut = async move { sleep_future.await; next } - .instrument(tracing::info_span!("retry", kind = &debug(retry))); + .instrument(tracing::info_span!("retry", kind = &debug(retry_kind))); Some(check_send(Box::pin(fut))) } +} + +impl + tower::retry::Policy, SdkSuccess, SdkError> + for RetryHandler +where + Handler: Clone, + R: ClassifyResponse, SdkError>, +{ + type Future = BoxFuture; + + fn retry( + &self, + req: &Operation, + result: Result<&SdkSuccess, &SdkError>, + ) -> Option { + let policy = req.retry_policy(); + let retry_kind = policy.classify(result); + self.retry_for(retry_kind) + } fn clone_request(&self, req: &Operation) -> Option> { req.try_clone() @@ -348,7 +356,7 @@ mod test { use crate::retry::{Config, NewRequestPolicy, RetryHandler, Standard}; - use aws_smithy_types::retry::ErrorKind; + use aws_smithy_types::retry::{ErrorKind, RetryKind}; use std::time::Duration; @@ -367,18 +375,18 @@ mod test { fn eventual_success() { let policy = Standard::new(test_config()).new_request_policy(None); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(1)); assert_eq!(policy.retry_quota(), 495); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(2)); assert_eq!(policy.retry_quota(), 490); - let no_retry = policy.attempt_retry(Ok(())); + let no_retry = policy.should_retry(&RetryKind::Unnecessary); assert!(no_retry.is_none()); assert_eq!(policy.retry_quota(), 495); } @@ -387,18 +395,18 @@ mod test { fn no_more_attempts() { let policy = Standard::new(test_config()).new_request_policy(None); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(1)); assert_eq!(policy.retry_quota(), 495); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(2)); assert_eq!(policy.retry_quota(), 490); - let no_retry = policy.attempt_retry(Err(ErrorKind::ServerError)); + let no_retry = policy.should_retry(&RetryKind::Error(ErrorKind::ServerError)); assert!(no_retry.is_none()); assert_eq!(policy.retry_quota(), 490); } @@ -408,46 +416,77 @@ mod test { let mut conf = test_config(); conf.initial_retry_tokens = 5; let policy = Standard::new(conf).new_request_policy(None); + let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(1)); assert_eq!(policy.retry_quota(), 0); - let no_retry = policy.attempt_retry(Err(ErrorKind::ServerError)); + + let no_retry = policy.should_retry(&RetryKind::Error(ErrorKind::ServerError)); assert!(no_retry.is_none()); assert_eq!(policy.retry_quota(), 0); } + #[test] + fn quota_replenishes_on_success() { + let mut conf = test_config(); + conf.initial_retry_tokens = 100; + let policy = Standard::new(conf).new_request_policy(None); + let (policy, dur) = policy + .should_retry(&RetryKind::Error(ErrorKind::TransientError)) + .expect("should retry"); + assert_eq!(dur, Duration::from_secs(1)); + assert_eq!(policy.retry_quota(), 90); + + let (policy, dur) = policy + .should_retry(&RetryKind::Explicit(Duration::from_secs(1))) + .expect("should retry"); + assert_eq!(dur, Duration::from_secs(1)); + assert_eq!( + policy.retry_quota(), + 90, + "explicit retry should not subtract from quota" + ); + + assert!( + policy.should_retry(&RetryKind::Unnecessary).is_none(), + "it should not retry success" + ); + let available = policy.shared.quota_available.lock().unwrap(); + assert_eq!(100, *available, "successful request should replenish quota"); + } + #[test] fn backoff_timing() { let mut conf = test_config(); conf.max_attempts = 5; let policy = Standard::new(conf).new_request_policy(None); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(1)); assert_eq!(policy.retry_quota(), 495); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(2)); assert_eq!(policy.retry_quota(), 490); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(4)); assert_eq!(policy.retry_quota(), 485); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(8)); assert_eq!(policy.retry_quota(), 480); - let no_retry = policy.attempt_retry(Err(ErrorKind::ServerError)); + let no_retry = policy.should_retry(&RetryKind::Error(ErrorKind::ServerError)); assert!(no_retry.is_none()); assert_eq!(policy.retry_quota(), 480); } @@ -459,30 +498,30 @@ mod test { conf.max_backoff = Duration::from_secs(3); let policy = Standard::new(conf).new_request_policy(None); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(1)); assert_eq!(policy.retry_quota(), 495); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(2)); assert_eq!(policy.retry_quota(), 490); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(3)); assert_eq!(policy.retry_quota(), 485); let (policy, dur) = policy - .attempt_retry(Err(ErrorKind::ServerError)) + .should_retry(&RetryKind::Error(ErrorKind::ServerError)) .expect("should retry"); assert_eq!(dur, Duration::from_secs(3)); assert_eq!(policy.retry_quota(), 480); - let no_retry = policy.attempt_retry(Err(ErrorKind::ServerError)); + let no_retry = policy.should_retry(&RetryKind::Error(ErrorKind::ServerError)); assert!(no_retry.is_none()); assert_eq!(policy.retry_quota(), 480); } diff --git a/rust-runtime/aws-smithy-client/tests/e2e_test.rs b/rust-runtime/aws-smithy-client/tests/e2e_test.rs index 8394f82db4..2731c43138 100644 --- a/rust-runtime/aws-smithy-client/tests/e2e_test.rs +++ b/rust-runtime/aws-smithy-client/tests/e2e_test.rs @@ -78,12 +78,12 @@ mod test_operation { fn classify(&self, err: Result<&T, &SdkError>) -> RetryKind { let kind = match err { Err(SdkError::ServiceError { err, .. }) => err.retryable_error_kind(), - Ok(_) => return RetryKind::NotRetryable, + Ok(_) => return RetryKind::Unnecessary, _ => panic!("test handler only handles modeled errors got: {:?}", err), }; match kind { Some(kind) => RetryKind::Error(kind), - None => RetryKind::NotRetryable, + None => RetryKind::UnretryableFailure, } } } diff --git a/rust-runtime/aws-smithy-http/src/retry.rs b/rust-runtime/aws-smithy-http/src/retry.rs index 5db5f795eb..b0dd393287 100644 --- a/rust-runtime/aws-smithy-http/src/retry.rs +++ b/rust-runtime/aws-smithy-http/src/retry.rs @@ -15,6 +15,6 @@ pub trait ClassifyResponse: Clone { impl ClassifyResponse for () { fn classify(&self, _: Result<&T, &E>) -> RetryKind { - RetryKind::NotRetryable + RetryKind::Unnecessary } } diff --git a/rust-runtime/aws-smithy-types/src/retry.rs b/rust-runtime/aws-smithy-types/src/retry.rs index 69e6706a7c..a17cc4d7a6 100644 --- a/rust-runtime/aws-smithy-types/src/retry.rs +++ b/rust-runtime/aws-smithy-types/src/retry.rs @@ -71,8 +71,11 @@ pub enum RetryKind { /// Note: The specified `Duration` is considered a suggestion and may be replaced or ignored. Explicit(Duration), - /// The response associated with this variant should not be retried. - NotRetryable, + /// The response was a failure that should _not_ be retried. + UnretryableFailure, + + /// The response was successful, so no retry is necessary. + Unnecessary, } /// Specifies how failed requests should be retried.