Skip to content

Commit

Permalink
ENG-3020: Backoff exponentially for server too busy errors
Browse files Browse the repository at this point in the history
Summary:
1) Backoff exponentially for server too busy errors.
2) Reduce log spew by converting some commonly seen errors into _EVERY_N_SECS

Test Plan: jenkins

Reviewers: kannan, sergei, bharat, hector, mikhail

Reviewed By: mikhail

Subscribers: ybase

Differential Revision: https://phabricator.dev.yugabyte.com/D4335
  • Loading branch information
amitanandaiyer committed Mar 13, 2018
1 parent e0b78dc commit d16bc28
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 10 deletions.
6 changes: 6 additions & 0 deletions src/yb/client/client-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,8 @@ DECLARE_int32(scanner_ttl_ms);
DECLARE_int32(tablet_server_svc_queue_length);

DEFINE_int32(test_scan_num_rows, 1000, "Number of rows to insert and scan");
DECLARE_int32(min_backoff_ms_exponent);
DECLARE_int32(max_backoff_ms_exponent);

METRIC_DECLARE_counter(rpcs_queue_overflow);

Expand Down Expand Up @@ -1850,6 +1852,10 @@ TEST_F(ClientTest, TestServerTooBusyRetry) {
// Reduce the service queue length of each tablet server in order to increase
// the likelihood of ERROR_SERVER_TOO_BUSY.
FLAGS_tablet_server_svc_queue_length = 1;
// Set the backoff limits to be small for this test, so that we finish in a reasonable
// amount of time.
FLAGS_min_backoff_ms_exponent = 0;
FLAGS_max_backoff_ms_exponent = 3;
for (int i = 0; i < cluster_->num_tablet_servers(); i++) {
MiniTabletServer* ts = cluster_->mini_tablet_server(i);
ASSERT_OK(ts->Restart());
Expand Down
2 changes: 1 addition & 1 deletion src/yb/client/tablet_rpc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ bool TabletInvoker::Done(Status* status) {
tablet_id_,
current_ts_string,
retrier_->attempt_num()));
LOG(WARNING) << status->ToString();
YB_LOG_EVERY_N_SECS(WARNING, 1) << status->ToString();
rpc_->Failed(old_status);
}

Expand Down
28 changes: 25 additions & 3 deletions src/yb/rpc/rpc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
#include "yb/rpc/messenger.h"
#include "yb/rpc/rpc_header.pb.h"

#include "yb/util/flag_tags.h"
#include "yb/util/random_util.h"
#include "yb/util/tsan_util.h"

Expand All @@ -54,6 +55,19 @@ DEFINE_int64(rpcs_shutdown_extra_delay_ms, 5000 * yb::kTimeMultiplier,
"Extra allowed time for a single RPC command to complete after its deadline.");
DEFINE_int64(retryable_rpc_single_call_timeout_ms, 5000 * yb::kTimeMultiplier,
"Timeout of single RPC call in retryable RPC command.");
DEFINE_int32(
min_backoff_ms_exponent, 7,
"Min amount of backoff delay if the server responds with TOO BUSY (default: 128ms). "
"Set this to some amount, during which the server might have recovered.");
DEFINE_int32(
max_backoff_ms_exponent, 16,
"Max amount of backoff delay if the server responds with TOO BUSY (default: 64 sec). "
"Set this to some duration, past which you are okay having no backoff for a Ddos "
"style build-up, during times when the server is overloaded, and unable to recover.");
TAG_FLAG(min_backoff_ms_exponent, hidden);
TAG_FLAG(min_backoff_ms_exponent, advanced);
TAG_FLAG(max_backoff_ms_exponent, hidden);
TAG_FLAG(max_backoff_ms_exponent, advanced);

namespace yb {

Expand All @@ -74,7 +88,7 @@ bool RpcRetrier::HandleResponse(RpcCommand* rpc, Status* out_status) {
if (err &&
err->has_code() &&
err->code() == ErrorStatusPB::ERROR_SERVER_TOO_BUSY) {
auto status = DelayedRetry(rpc, controller_status);
auto status = DelayedRetry(rpc, controller_status, BackoffStrategy::kExponential);
if (!status.ok()) {
*out_status = status;
return false;
Expand All @@ -87,15 +101,23 @@ bool RpcRetrier::HandleResponse(RpcCommand* rpc, Status* out_status) {
return false;
}

Status RpcRetrier::DelayedRetry(RpcCommand* rpc, const Status& why_status) {
Status RpcRetrier::DelayedRetry(
RpcCommand* rpc, const Status& why_status, BackoffStrategy strategy) {
if (!why_status.ok() && (last_error_.ok() || last_error_.IsTimedOut())) {
last_error_ = why_status;
}
// Add some jitter to the retry delay.
//
// If the delay causes us to miss our deadline, RetryCb will fail the
// RPC on our behalf.
int num_ms = attempt_num_++ + RandomUniformInt(0, 4);
// makes the call redundant by then.
int num_ms =
(strategy == BackoffStrategy::kExponential
? 1 << std::min(
FLAGS_min_backoff_ms_exponent + attempt_num_, FLAGS_max_backoff_ms_exponent)
: attempt_num_) +
RandomUniformInt(0, 4);
attempt_num_++;

RpcRetrierState expected_state = RpcRetrierState::kIdle;
while (!state_.compare_exchange_strong(expected_state, RpcRetrierState::kWaiting)) {
Expand Down
5 changes: 4 additions & 1 deletion src/yb/rpc/rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ class RpcCommand : public std::enable_shared_from_this<RpcCommand> {
};

YB_DEFINE_ENUM(RpcRetrierState, (kIdle)(kRunning)(kWaiting)(kFinished));
YB_DEFINE_ENUM(BackoffStrategy, (kLinear)(kExponential));

// Provides utilities for retrying failed RPCs.
//
Expand Down Expand Up @@ -111,7 +112,9 @@ class RpcRetrier {
// deadline has already expired at the time that Retry() was called.
//
// Callers should ensure that 'rpc' remains alive.
CHECKED_STATUS DelayedRetry(RpcCommand* rpc, const Status& why_status);
CHECKED_STATUS DelayedRetry(
RpcCommand* rpc, const Status& why_status,
BackoffStrategy strategy = BackoffStrategy::kLinear);

RpcController* mutable_controller() { return &controller_; }
const RpcController& controller() const { return controller_; }
Expand Down
8 changes: 4 additions & 4 deletions src/yb/rpc/service_pool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ class ServicePoolImpl {
yb::ToString(call->remote_address()),
type,
limit);
LOG(WARNING) << err_msg;
YB_LOG_EVERY_N_SECS(WARNING, 1) << err_msg;
const auto response_status = STATUS(ServiceUnavailable, err_msg);
rpcs_queue_overflow_->Increment();
call->RespondFailure(ErrorStatusPB::ERROR_SERVER_TOO_BUSY, response_status);
Expand All @@ -162,9 +162,9 @@ class ServicePoolImpl {
Overflow(call, "global", thread_pool_->options().queue_limit);
return;
}
LOG(WARNING) << call->method_name() << " request on "
<< service_->service_name() << " from " << call->remote_address()
<< " dropped because of: " << status.ToString();
YB_LOG_EVERY_N_SECS(WARNING, 1)
<< call->method_name() << " request on " << service_->service_name() << " from "
<< call->remote_address() << " dropped because of: " << status.ToString();
const auto response_status = STATUS(ServiceUnavailable, "Service is shutting down");
call->RespondFailure(ErrorStatusPB::FATAL_SERVER_SHUTTING_DOWN, response_status);
}
Expand Down
3 changes: 2 additions & 1 deletion src/yb/yql/redis/redisserver/redis_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,8 @@ class Block : public std::enable_shared_from_this<Block> {
if (!status.ok()) {
if (session_.get() != nullptr) {
for (const auto& error : session_->GetPendingErrors()) {
LOG(WARNING) << "Explicit error while inserting: " << error->status().ToString();
YB_LOG_EVERY_N_SECS(WARNING, 1) << "Explicit error while inserting: "
<< error->status().ToString();
}
}
}
Expand Down

0 comments on commit d16bc28

Please sign in to comment.