Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

YQ-3405 fixed endless retries for external error #6722

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,15 @@ message TQueryMapping {

// 1. StatusCode(s) are handled with defined policies, non-unique StatusCode(s) across all policies is UB
// 2. RetryCount and RetryPeriodMs are used to calculate actual RetryRate, if it exceeds RetryCount, query is aborted
// - Number of retries during RetryPeriod time less than 2 * RetryCount due to RetryRate
// 3. BackoffPeriodMs is factor of RetryRate to delay query execution before next retry
// 4. There are no default retry policy, all unhandled statuses are fatal
// 4. RetryLimit is hard limit for amount query retry count, after that query is aborted
// - If RetryLimit = 0, query can be abborted only by RetryRate
// 5. There are no default retry policy, all unhandled statuses are fatal

message TRetryPolicy {
uint64 RetryCount = 1;
uint64 RetryLimit = 4;
string RetryPeriod = 2;
string BackoffPeriod = 3;
}
Expand Down
3 changes: 2 additions & 1 deletion ydb/core/fq/libs/control_plane_storage/config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,11 @@ TControlPlaneStorageConfig::TControlPlaneStorageConfig(const NConfig::TControlPl
for (const auto& mapping : Proto.GetRetryPolicyMapping()) {
auto& retryPolicy = mapping.GetPolicy();
auto retryCount = retryPolicy.GetRetryCount();
auto retryLimit = retryPolicy.GetRetryLimit();
auto retryPeriod = GetDuration(retryPolicy.GetRetryPeriod(), TDuration::Hours(1));
auto backoffPeriod = GetDuration(retryPolicy.GetBackoffPeriod(), TDuration::Zero());
for (const auto statusCode: mapping.GetStatusCode()) {
RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryPeriod, backoffPeriod));
RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryLimit, retryPeriod, backoffPeriod));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ TPingTaskParams ConstructHardPingTask(
internal.clear_operation_id();
}

TRetryPolicyItem policy(0, TDuration::Seconds(1), TDuration::Zero());
TRetryPolicyItem policy(0, 0, TDuration::Seconds(1), TDuration::Zero());
auto it = retryPolicies.find(request.status_code());
auto policyFound = it != retryPolicies.end();
if (policyFound) {
Expand All @@ -183,7 +183,7 @@ TPingTaskParams ConstructHardPingTask(
TStringBuilder builder;
builder << "Query failed with code " << NYql::NDqProto::StatusIds_StatusCode_Name(request.status_code());
if (policy.RetryCount) {
builder << " (failure rate " << retryLimiter.RetryRate << " exceeds limit of " << policy.RetryCount << ")";
builder << " (" << retryLimiter.LastError << ")";
}
builder << " at " << Now();

Expand Down
12 changes: 11 additions & 1 deletion ydb/core/fq/libs/control_plane_storage/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,16 @@ bool TRetryLimiter::UpdateOnRetry(const TInstant& lastSeenAt, const TRetryPolicy
RetryRate = 0.0;
}
}
bool shouldRetry = RetryRate < policy.RetryCount;

bool shouldRetry = true;
if (RetryRate >= policy.RetryCount) {
shouldRetry = false;
LastError = TStringBuilder() << "failure rate " << RetryRate << " exceeds limit of " << policy.RetryCount;
} else if (policy.RetryLimit && RetryCount >= policy.RetryLimit) {
shouldRetry = false;
LastError = TStringBuilder() << "retry count reached limit of " << policy.RetryLimit;
}

if (shouldRetry) {
RetryCount++;
RetryCounterUpdatedAt = now;
Expand Down Expand Up @@ -140,6 +149,7 @@ NConfig::TControlPlaneStorageConfig FillDefaultParameters(NConfig::TControlPlane
policyMapping.AddStatusCode(NYql::NDqProto::StatusIds::EXTERNAL_ERROR);
auto& policy = *policyMapping.MutablePolicy();
policy.SetRetryCount(10);
policy.SetRetryLimit(40);
policy.SetRetryPeriod("1m");
policy.SetBackoffPeriod("1s");
}
Expand Down
6 changes: 4 additions & 2 deletions ydb/core/fq/libs/control_plane_storage/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,11 @@ namespace NFq {
class TRetryPolicyItem {
public:
TRetryPolicyItem() = default;
TRetryPolicyItem(ui64 retryCount, const TDuration& retryPeriod, const TDuration& backoffPeriod)
: RetryCount(retryCount), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
TRetryPolicyItem(ui64 retryCount, ui64 retryLimit, const TDuration& retryPeriod, const TDuration& backoffPeriod)
: RetryCount(retryCount), RetryLimit(retryLimit), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod)
{ }
ui64 RetryCount = 0;
ui64 RetryLimit = 0;
TDuration RetryPeriod = TDuration::Zero();
TDuration BackoffPeriod = TDuration::Zero();
};
Expand All @@ -32,6 +33,7 @@ class TRetryLimiter {
ui64 RetryCount = 0;
TInstant RetryCounterUpdatedAt = TInstant::Zero();
double RetryRate = 0.0;
TString LastError;
};

bool IsTerminalStatus(FederatedQuery::QueryMeta::ComputeStatus status);
Expand Down
Loading