From fe5e16fb9d9a80aed639f57716a070b0a2a76689 Mon Sep 17 00:00:00 2001 From: Pisarenko Grigoriy <79596613+GrigoriyPA@users.noreply.github.com> Date: Tue, 16 Jul 2024 15:59:18 +0300 Subject: [PATCH] YQ-3405 fixed endless retries for external error (#6722) --- .../libs/config/protos/control_plane_storage.proto | 6 +++++- ydb/core/fq/libs/control_plane_storage/config.cpp | 3 ++- .../control_plane_storage/internal/task_ping.cpp | 4 ++-- ydb/core/fq/libs/control_plane_storage/util.cpp | 12 +++++++++++- ydb/core/fq/libs/control_plane_storage/util.h | 6 ++++-- 5 files changed, 24 insertions(+), 7 deletions(-) diff --git a/ydb/core/fq/libs/config/protos/control_plane_storage.proto b/ydb/core/fq/libs/config/protos/control_plane_storage.proto index 1c4b668d4ccc..9cc2d31158d4 100644 --- a/ydb/core/fq/libs/config/protos/control_plane_storage.proto +++ b/ydb/core/fq/libs/config/protos/control_plane_storage.proto @@ -24,11 +24,15 @@ message TQueryMapping { // 1. StatusCode(s) are handled with defined policies, non-unique StatusCode(s) across all policies is UB // 2. RetryCount and RetryPeriodMs are used to calculate actual RetryRate, if it exceeds RetryCount, query is aborted +// - Number of retries during RetryPeriod time less than 2 * RetryCount due to RetryRate // 3. BackoffPeriodMs is factor of RetryRate to delay query execution before next retry -// 4. There are no default retry policy, all unhandled statuses are fatal +// 4. RetryLimit is hard limit for amount query retry count, after that query is aborted +// - If RetryLimit = 0, query can be abborted only by RetryRate +// 5. There are no default retry policy, all unhandled statuses are fatal message TRetryPolicy { uint64 RetryCount = 1; + uint64 RetryLimit = 4; string RetryPeriod = 2; string BackoffPeriod = 3; } diff --git a/ydb/core/fq/libs/control_plane_storage/config.cpp b/ydb/core/fq/libs/control_plane_storage/config.cpp index 2cd4dd6bbe74..41638b506741 100644 --- a/ydb/core/fq/libs/control_plane_storage/config.cpp +++ b/ydb/core/fq/libs/control_plane_storage/config.cpp @@ -50,10 +50,11 @@ TControlPlaneStorageConfig::TControlPlaneStorageConfig(const NConfig::TControlPl for (const auto& mapping : Proto.GetRetryPolicyMapping()) { auto& retryPolicy = mapping.GetPolicy(); auto retryCount = retryPolicy.GetRetryCount(); + auto retryLimit = retryPolicy.GetRetryLimit(); auto retryPeriod = GetDuration(retryPolicy.GetRetryPeriod(), TDuration::Hours(1)); auto backoffPeriod = GetDuration(retryPolicy.GetBackoffPeriod(), TDuration::Zero()); for (const auto statusCode: mapping.GetStatusCode()) { - RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryPeriod, backoffPeriod)); + RetryPolicies.emplace(statusCode, TRetryPolicyItem(retryCount, retryLimit, retryPeriod, backoffPeriod)); } } diff --git a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp index c0802446d019..bddcfbeab0e7 100644 --- a/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp +++ b/ydb/core/fq/libs/control_plane_storage/internal/task_ping.cpp @@ -156,7 +156,7 @@ TPingTaskParams ConstructHardPingTask( internal.clear_operation_id(); } - TRetryPolicyItem policy(0, TDuration::Seconds(1), TDuration::Zero()); + TRetryPolicyItem policy(0, 0, TDuration::Seconds(1), TDuration::Zero()); auto it = retryPolicies.find(request.status_code()); auto policyFound = it != retryPolicies.end(); if (policyFound) { @@ -183,7 +183,7 @@ TPingTaskParams ConstructHardPingTask( TStringBuilder builder; builder << "Query failed with code " << NYql::NDqProto::StatusIds_StatusCode_Name(request.status_code()); if (policy.RetryCount) { - builder << " (failure rate " << retryLimiter.RetryRate << " exceeds limit of " << policy.RetryCount << ")"; + builder << " (" << retryLimiter.LastError << ")"; } builder << " at " << Now(); diff --git a/ydb/core/fq/libs/control_plane_storage/util.cpp b/ydb/core/fq/libs/control_plane_storage/util.cpp index 74c5bd866406..b296952bdbca 100644 --- a/ydb/core/fq/libs/control_plane_storage/util.cpp +++ b/ydb/core/fq/libs/control_plane_storage/util.cpp @@ -28,7 +28,16 @@ bool TRetryLimiter::UpdateOnRetry(const TInstant& lastSeenAt, const TRetryPolicy RetryRate = 0.0; } } - bool shouldRetry = RetryRate < policy.RetryCount; + + bool shouldRetry = true; + if (RetryRate >= policy.RetryCount) { + shouldRetry = false; + LastError = TStringBuilder() << "failure rate " << RetryRate << " exceeds limit of " << policy.RetryCount; + } else if (policy.RetryLimit && RetryCount >= policy.RetryLimit) { + shouldRetry = false; + LastError = TStringBuilder() << "retry count reached limit of " << policy.RetryLimit; + } + if (shouldRetry) { RetryCount++; RetryCounterUpdatedAt = now; @@ -140,6 +149,7 @@ NConfig::TControlPlaneStorageConfig FillDefaultParameters(NConfig::TControlPlane policyMapping.AddStatusCode(NYql::NDqProto::StatusIds::EXTERNAL_ERROR); auto& policy = *policyMapping.MutablePolicy(); policy.SetRetryCount(10); + policy.SetRetryLimit(40); policy.SetRetryPeriod("1m"); policy.SetBackoffPeriod("1s"); } diff --git a/ydb/core/fq/libs/control_plane_storage/util.h b/ydb/core/fq/libs/control_plane_storage/util.h index fd9c30cf11b6..3f10cb182f4e 100644 --- a/ydb/core/fq/libs/control_plane_storage/util.h +++ b/ydb/core/fq/libs/control_plane_storage/util.h @@ -15,10 +15,11 @@ namespace NFq { class TRetryPolicyItem { public: TRetryPolicyItem() = default; - TRetryPolicyItem(ui64 retryCount, const TDuration& retryPeriod, const TDuration& backoffPeriod) - : RetryCount(retryCount), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod) + TRetryPolicyItem(ui64 retryCount, ui64 retryLimit, const TDuration& retryPeriod, const TDuration& backoffPeriod) + : RetryCount(retryCount), RetryLimit(retryLimit), RetryPeriod(retryPeriod), BackoffPeriod(backoffPeriod) { } ui64 RetryCount = 0; + ui64 RetryLimit = 0; TDuration RetryPeriod = TDuration::Zero(); TDuration BackoffPeriod = TDuration::Zero(); }; @@ -32,6 +33,7 @@ class TRetryLimiter { ui64 RetryCount = 0; TInstant RetryCounterUpdatedAt = TInstant::Zero(); double RetryRate = 0.0; + TString LastError; }; bool IsTerminalStatus(FederatedQuery::QueryMeta::ComputeStatus status);