Skip to content

Commit

Permalink
[#20124] docdb: leaderless tablet endpoint fixes
Browse files Browse the repository at this point in the history
Summary:
The leaderless tablet endpoint can incorrectly report a tablet which just had leader changed as leaderless tablet because:
The new leader's lease does not get updated right after the leader change, instead, it's updated by the next heartbeat from the new leader. And if the new leader originally has a very old leader lease because it was the leader long time ago, we can get this tablet incorrectly reported as leaderless.

In this diff, the leaderless tablet detection logic is modified to rely on checking the last time the tablet has a valid leader (peer role is LEADER and has a valid lease), so a per-tablet `TabletInfo::last_time_with_valid_leader_` is introduced to track it.

This timestamp will be set to MonoTime::Now when:

  # initialization of the TabletInfo
  # when handling ts metrics heartbeat and find the leader of the tablet reported:

```
  its status as HAS_LEASE
  the ht lease is greater than the last reported value (to avoid stale heartbeats)
  the ht lease is not expired for more than `FLAGS_maximum_tablet_leader_lease_expired_secs` seconds
```

At detection, if the tablet doesn't have a valid leader for more than `FLAGS_maximum_tablet_without_valid_leader_secs` seconds, the tablet is treated as leaderless.

With this approach, if the tablet has 3 followers (between leader step-down and re-election), leaderless tablet endpoint can also avoid reporting such tablets as leaderless right after step-down happens, instead, will be detected only when no valid leader lease is reported after `FLAGS_maximum_tablet_without_valid_leader_secs` seconds after leader change. (MasterPathHandlersLeaderlessITest.TestAllFollowers is added to simulate this scenario).
Jira: DB-9064

Test Plan: MasterPathHandlersLeaderlessITest.*

Reviewers: asrivastava, zdrudi

Reviewed By: asrivastava, zdrudi

Subscribers: bogdan, ybase, esheng

Differential Revision: https://phorge.dev.yugabyte.com/D31078
  • Loading branch information
Huqicheng committed Dec 20, 2023
1 parent 015326c commit afe4c00
Show file tree
Hide file tree
Showing 6 changed files with 226 additions and 97 deletions.
146 changes: 141 additions & 5 deletions src/yb/integration-tests/master_path_handlers-itest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,10 @@ DECLARE_bool(TEST_tserver_disable_heartbeat);

DECLARE_int32(follower_unavailable_considered_failed_sec);

DECLARE_uint64(master_maximum_heartbeats_without_lease);
DECLARE_int32(tserver_heartbeat_metrics_interval_ms);
DECLARE_int32(cleanup_split_tablets_interval_sec);
DECLARE_int32(catalog_manager_bg_task_wait_ms);
DECLARE_bool(enable_automatic_tablet_splitting);
DECLARE_uint32(leaderless_tablet_alert_delay_secs);

namespace yb {
namespace master {
Expand Down Expand Up @@ -256,6 +255,9 @@ TEST_F(MasterPathHandlersItest, TestDeadTServers) {
}

TEST_F(MasterPathHandlersItest, TestTabletReplicationEndpoint) {
// Alert for leaderless tablet is delayed for FLAGS_leaderless_tablet_alert_delay_secs secodns.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_leaderless_tablet_alert_delay_secs) =
FLAGS_heartbeat_interval_ms / 1000 * 5;
auto table = CreateTestTable(kNumTablets);

// Choose a tablet to orphan and take note of the servers which are leaders/followers for this
Expand Down Expand Up @@ -991,6 +993,12 @@ TEST_F_EX(MasterPathHandlersItest, TestTablePlacementInfo, MasterPathHandlersExt

class MasterPathHandlersLeaderlessITest : public MasterPathHandlersExternalItest {
public:
void SetUp() override {
opts_.extra_tserver_flags.push_back(
{Format("--tserver_heartbeat_metrics_interval_ms=$0", kTserverHeartbeatMetricsIntervalMs)});
MasterPathHandlersExternalItest::SetUp();
}

void CreateSingleTabletTestTable() {
table_ = CreateTestTable(1);
}
Expand All @@ -1009,13 +1017,31 @@ class MasterPathHandlersLeaderlessITest : public MasterPathHandlersExternalItest
return "";
}

Status WaitForLeaderPeer(const TabletId& tablet_id, uint64_t leader_idx) {
return WaitFor([&] {
const auto current_leader_idx_result = cluster_->GetTabletLeaderIndex(tablet_id);
if (current_leader_idx_result.ok()) {
return *current_leader_idx_result == leader_idx;
}
return false;
},
10s,
Format("Peer $0 becomes leader of tablet $1",
cluster_->tablet_server(leader_idx)->uuid(),
tablet_id));
}

bool HasLeaderPeer(const TabletId& tablet_id) {
const auto current_leader_idx_result = cluster_->GetTabletLeaderIndex(tablet_id);
return current_leader_idx_result.ok();
}

std::shared_ptr<client::YBTable> table_;
static constexpr int kTserverHeartbeatMetricsIntervalMs = 1000;
};

TEST_F(MasterPathHandlersLeaderlessITest, TestLeaderlessTabletEndpoint) {
ASSERT_OK(cluster_->SetFlagOnMasters("maximum_tablet_leader_lease_expired_secs", "5"));
ASSERT_OK(cluster_->SetFlagOnMasters("master_maximum_heartbeats_without_lease", "2"));
ASSERT_OK(cluster_->SetFlagOnTServers("tserver_heartbeat_metrics_interval_ms", "1000"));
ASSERT_OK(cluster_->SetFlagOnMasters("leaderless_tablet_alert_delay_secs", "5"));
CreateSingleTabletTestTable();
auto tablet_id = GetSingleTabletId();

Expand Down Expand Up @@ -1079,24 +1105,134 @@ TEST_F(MasterPathHandlersLeaderlessITest, TestLeaderlessTabletEndpoint) {
}, 20s * kTimeMultiplier, "leaderless tablet endpoint becomes empty"));
}

TEST_F(MasterPathHandlersLeaderlessITest, TestLeaderChange) {
const auto kMaxLeaderLeaseExpiredMs = 5000;
const auto kHtLeaseDurationMs = 2000;
const auto kMaxTabletWithoutValidLeaderMs = 5000;
ASSERT_OK(cluster_->SetFlagOnMasters("maximum_tablet_leader_lease_expired_secs",
std::to_string(kMaxLeaderLeaseExpiredMs / 1000)));
ASSERT_OK(cluster_->SetFlagOnMasters("leaderless_tablet_alert_delay_secs",
std::to_string(kMaxTabletWithoutValidLeaderMs / 1000)));
ASSERT_OK(cluster_->SetFlagOnTServers("ht_lease_duration_ms",
std::to_string(kHtLeaseDurationMs)));
CreateSingleTabletTestTable();
auto tablet_id = GetSingleTabletId();

SleepFor(kTserverHeartbeatMetricsIntervalMs * 2ms);

// Initially the leaderless tablets list should be empty.
string result = GetLeaderlessTabletsString();
ASSERT_EQ(result.find(tablet_id), string::npos);

const auto leader_idx = CHECK_RESULT(cluster_->GetTabletLeaderIndex(tablet_id));
const auto leader = cluster_->tablet_server(leader_idx);
const auto follower_idx = (leader_idx + 1) % 3;

auto ts_map = ASSERT_RESULT(itest::CreateTabletServerMap(
cluster_->GetLeaderMasterProxy<master::MasterClusterProxy>(), &cluster_->proxy_cache()));

const auto new_leader_idx = follower_idx;
const auto new_leader = cluster_->tablet_server(new_leader_idx);
ASSERT_OK(itest::LeaderStepDown(
ts_map[leader->uuid()].get(), tablet_id, ts_map[new_leader->uuid()].get(), 10s));
ASSERT_OK(WaitForLeaderPeer(tablet_id, new_leader_idx));

// Wait the old leader's tracked leader lease to be expired. The maximum wait time is
// (ht_lease_duration + max_leader_lease_expired).
SleepFor((kHtLeaseDurationMs + kMaxLeaderLeaseExpiredMs) * 1ms);

ASSERT_OK(cluster_->SetFlagOnMasters("TEST_skip_processing_tablet_metadata", "true"));
ASSERT_OK(itest::LeaderStepDown(
ts_map[new_leader->uuid()].get(), tablet_id, ts_map[leader->uuid()].get(), 10s));
ASSERT_OK(WaitForLeaderPeer(tablet_id, leader_idx));

// Wait the next ts heartbeat to report new leader.
SleepFor(kTserverHeartbeatMetricsIntervalMs * 2ms);

// We don't expect to be leaderless even though the lease is expired because not enough
// time has passed for us to alert.
result = GetLeaderlessTabletsString();
ASSERT_EQ(result.find(tablet_id), string::npos);

SleepFor(kMaxTabletWithoutValidLeaderMs * 1ms);
result = GetLeaderlessTabletsString();
ASSERT_NE(result.find(tablet_id), string::npos);

ASSERT_OK(cluster_->SetFlagOnMasters("TEST_skip_processing_tablet_metadata", "false"));
SleepFor(kTserverHeartbeatMetricsIntervalMs * 2ms);
result = GetLeaderlessTabletsString();
ASSERT_EQ(result.find(tablet_id), string::npos);
}

TEST_F(MasterPathHandlersLeaderlessITest, TestAllFollowers) {
const auto kMaxTabletWithoutValidLeaderMs = 5000;
ASSERT_OK(cluster_->SetFlagOnMasters("leaderless_tablet_alert_delay_secs",
std::to_string(kMaxTabletWithoutValidLeaderMs / 1000)));
CreateSingleTabletTestTable();
auto tablet_id = GetSingleTabletId();

// Initially the leaderless tablets list should be empty.
string result = GetLeaderlessTabletsString();
ASSERT_EQ(result.find(tablet_id), string::npos);

// Disable new leader election after leader stepdown.
ASSERT_OK(cluster_->SetFlagOnTServers("stepdown_disable_graceful_transition", "true"));
ASSERT_OK(cluster_->SetFlagOnTServers("TEST_skip_election_when_fail_detected", "true"));

const auto leader_idx = CHECK_RESULT(cluster_->GetTabletLeaderIndex(tablet_id));
const auto leader = cluster_->tablet_server(leader_idx);
auto ts_map = ASSERT_RESULT(itest::CreateTabletServerMap(
cluster_->GetLeaderMasterProxy<master::MasterClusterProxy>(), &cluster_->proxy_cache()));
// Leader step down and don't assign new leader, all three peers should be FOLLOWER.
ASSERT_OK(itest::LeaderStepDown(
ts_map[leader->uuid()].get(), tablet_id, nullptr, 10s));

// Wait for next TS heartbeat to report all three followers.
ASSERT_FALSE(HasLeaderPeer(tablet_id));
SleepFor(kTserverHeartbeatMetricsIntervalMs * 2ms);

// Shouldn't report it as leaderless before kMaxTabletWithoutValidLeaderMs.
result = GetLeaderlessTabletsString();
ASSERT_EQ(result.find(tablet_id), string::npos);

ASSERT_OK(WaitFor([&] {
string result = GetLeaderlessTabletsString();
return result.find(tablet_id) != string::npos &&
result.find("No valid leader reported") != string::npos;
}, 20s * kTimeMultiplier, "leaderless tablet endpoint catch the tablet"));

ASSERT_OK(cluster_->SetFlagOnTServers("TEST_skip_election_when_fail_detected", "false"));

ASSERT_OK(WaitFor([&] {
string result = GetLeaderlessTabletsString();
return result.find(tablet_id) == string::npos;
}, 20s * kTimeMultiplier, "leaderless tablet endpoint becomes empty"));
}

TEST_F(MasterPathHandlersItest, TestLeaderlessDeletedTablet) {
auto table = CreateTestTable(kNumTablets);
const auto kLeaderlessTabletAlertDelaySecs = 5;

// Prevent heartbeats from overwriting replica locations.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_tserver_disable_heartbeat) = true;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_leaderless_tablet_alert_delay_secs) =
kLeaderlessTabletAlertDelaySecs;

auto& catalog_mgr = ASSERT_RESULT(cluster_->GetLeaderMiniMaster())->catalog_manager();
auto table_info = catalog_mgr.GetTableInfo(table->id());
auto tablets = table_info->GetTablets();
ASSERT_EQ(tablets.size(), kNumTablets);

// Make all tablets leaderless.
MonoTime last_time_with_valid_leader_override = MonoTime::Now();
last_time_with_valid_leader_override.SubtractDelta(kLeaderlessTabletAlertDelaySecs * 1s);
for (auto& tablet : tablets) {
auto replicas = std::make_shared<TabletReplicaMap>(*tablet->GetReplicaLocations());
for (auto& replica : *replicas) {
replica.second.role = PeerRole::FOLLOWER;
}
tablet->SetReplicaLocations(replicas);
tablet->TEST_set_last_time_with_valid_leader(last_time_with_valid_leader_override);
}
auto running_tablet = tablets[0];
auto deleted_tablet = tablets[1];
Expand Down
22 changes: 20 additions & 2 deletions src/yb/master/catalog_entity_info.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,10 @@ void TabletReplica::UpdateLeaderLeaseInfo(const TabletLeaderLeaseInfo& info) {
const bool initialized = leader_lease_info.initialized;
const auto old_lease_exp = leader_lease_info.ht_lease_expiration;
leader_lease_info = info;
leader_lease_info.ht_lease_expiration = std::max(old_lease_exp, info.ht_lease_expiration);
leader_lease_info.ht_lease_expiration =
info.leader_lease_status == consensus::LeaderLeaseStatus::HAS_LEASE
? std::max(info.ht_lease_expiration, old_lease_exp)
: 0;
leader_lease_info.initialized = initialized || info.initialized;
}

Expand Down Expand Up @@ -157,6 +160,7 @@ TabletInfo::TabletInfo(const scoped_refptr<TableInfo>& table, TabletId tablet_id
: tablet_id_(std::move(tablet_id)),
table_(table),
last_update_time_(MonoTime::Now()),
last_time_with_valid_leader_(MonoTime::Now()),
reported_schema_version_({}) {
// Have to pre-initialize to an empty map, in case of access before the first setter is called.
replica_locations_ = std::make_shared<TabletReplicaMap>();
Expand Down Expand Up @@ -242,7 +246,6 @@ Result<TabletReplicaDriveInfo> TabletInfo::GetLeaderReplicaDriveInfo() const {
Result<TabletLeaderLeaseInfo> TabletInfo::GetLeaderLeaseInfoIfLeader(
const std::string& ts_uuid) const {
std::lock_guard l(lock_);

auto it = replica_locations_->find(ts_uuid);
if (it == replica_locations_->end() || it->second.role != PeerRole::LEADER) {
return GetLeaderNotFoundStatus();
Expand Down Expand Up @@ -311,6 +314,21 @@ MonoTime TabletInfo::last_update_time() const {
return last_update_time_;
}

void TabletInfo::UpdateLastTimeWithValidLeader() {
std::lock_guard l(lock_);
last_time_with_valid_leader_ = MonoTime::Now();
}

MonoTime TabletInfo::last_time_with_valid_leader() const {
std::lock_guard l(lock_);
return last_time_with_valid_leader_;
}

void TabletInfo::TEST_set_last_time_with_valid_leader(const MonoTime& time) {
std::lock_guard l(lock_);
last_time_with_valid_leader_ = time;
}

bool TabletInfo::set_reported_schema_version(const TableId& table_id, uint32_t version) {
std::lock_guard l(lock_);
if (reported_schema_version_.count(table_id) == 0 ||
Expand Down
11 changes: 11 additions & 0 deletions src/yb/master/catalog_entity_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ struct TabletLeaderLeaseInfo {
bool initialized = false;
consensus::LeaderLeaseStatus leader_lease_status =
consensus::LeaderLeaseStatus::NO_MAJORITY_REPLICATED_LEASE;
// Expiration time of leader ht lease, invalid when leader_lease_status != HAS_LEASE.
MicrosTime ht_lease_expiration = 0;
// Number of heartbeats that current tablet leader doesn't have a valid lease.
uint64 heartbeats_without_leader_lease = 0;
Expand Down Expand Up @@ -264,6 +265,11 @@ class TabletInfo : public RefCountedThreadSafe<TabletInfo>,
void set_last_update_time(const MonoTime& ts);
MonoTime last_update_time() const;

// Update last_time_with_valid_leader_ to the Now().
void UpdateLastTimeWithValidLeader();
MonoTime last_time_with_valid_leader() const;
void TEST_set_last_time_with_valid_leader(const MonoTime& time);

// Accessors for the last reported schema version.
bool set_reported_schema_version(const TableId& table_id, uint32_t version);
uint32_t reported_schema_version(const TableId& table_id);
Expand Down Expand Up @@ -328,6 +334,11 @@ class TabletInfo : public RefCountedThreadSafe<TabletInfo>,
// Also set when the Master first attempts to create the tablet.
MonoTime last_update_time_ GUARDED_BY(lock_);

// The last time the tablet has a valid leader, a valid leader is:
// 1. with peer role LEADER.
// 2. has not-expired lease.
MonoTime last_time_with_valid_leader_ GUARDED_BY(lock_);

// The locations in the latest Raft config where this tablet has been
// reported. The map is keyed by tablet server UUID.
std::shared_ptr<TabletReplicaMap> replica_locations_ GUARDED_BY(lock_);
Expand Down
44 changes: 32 additions & 12 deletions src/yb/master/catalog_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,10 @@ DEFINE_test_flag(bool, simulate_sys_catalog_data_loss, false,
"On the heartbeat processing path, simulate a scenario where tablet metadata is missing due to "
"a corruption. ");

DEFINE_RUNTIME_uint32(maximum_tablet_leader_lease_expired_secs, 2 * 60,
"If the leader lease in master's view has expired for this amount of seconds, "
"treat the lease as expired for too long time.");

#define RETURN_FALSE_IF(cond) \
do { \
if ((cond)) { \
Expand Down Expand Up @@ -13073,6 +13077,19 @@ Result<BlacklistSet> CatalogManager::BlacklistSetFromPB(bool leader_blacklist) c
return ToBlacklistSet(GetBlacklist(l->pb, leader_blacklist));
}

namespace {

// Return true if the received ht_lease_exp from the leader peer has been expired for too
// long time when the ts metrics heartbeat reaches the master.
bool IsHtLeaseExpiredForTooLong(MicrosTime now, MicrosTime ht_lease_exp) {
const auto now_usec = boost::posix_time::microseconds(now);
const auto ht_lease_exp_usec = boost::posix_time::microseconds(ht_lease_exp);
return (now_usec - ht_lease_exp_usec).total_seconds() >
GetAtomicFlag(&FLAGS_maximum_tablet_leader_lease_expired_secs);
}

} // namespace

void CatalogManager::ProcessTabletMetadata(
const std::string& ts_uuid,
const TabletDriveStorageMetadataPB& storage_metadata,
Expand All @@ -13093,23 +13110,26 @@ void CatalogManager::ProcessTabletMetadata(
consensus::LeaderLeaseStatus::NO_MAJORITY_REPLICATED_LEASE;
bool leader_lease_info_initialized = false;
if (leader_metrics.has_value()) {
auto leader = tablet->GetLeader();
auto existing_leader_lease_info = tablet->GetLeaderLeaseInfoIfLeader(ts_uuid);
// If the peer is the current leader, update the counter to track heartbeats that
// the tablet doesn't have a valid lease.
if (leader && (*leader)->permanent_uuid() == ts_uuid) {
if (existing_leader_lease_info) {
const auto& leader_info = *leader_metrics;
leader_lease_status = leader_info.leader_lease_status();
auto existing_leader_lease_info =
tablet->GetLeaderLeaseInfoIfLeader((*leader)->permanent_uuid());
// It's possible that the leader was step down after exiting GetLeader.
if (existing_leader_lease_info) {
leader_lease_info_initialized = true;
if (leader_info.leader_lease_status() == consensus::LeaderLeaseStatus::HAS_LEASE) {
ht_lease_exp = leader_info.ht_lease_expiration();
} else {
new_heartbeats_without_leader_lease =
existing_leader_lease_info->heartbeats_without_leader_lease + 1;
leader_lease_info_initialized = true;
if (leader_info.leader_lease_status() == consensus::LeaderLeaseStatus::HAS_LEASE) {
ht_lease_exp = leader_info.ht_lease_expiration();
// If the reported ht lease from the leader is expired for more than
// FLAGS_maximum_tablet_leader_lease_expired_secs, the leader shouldn't be treated
// as a valid leader.
if (ht_lease_exp > existing_leader_lease_info->ht_lease_expiration &&
!IsHtLeaseExpiredForTooLong(
master_->clock()->Now().GetPhysicalValueMicros(), ht_lease_exp)) {
tablet->UpdateLastTimeWithValidLeader();
}
} else {
new_heartbeats_without_leader_lease =
existing_leader_lease_info->heartbeats_without_leader_lease + 1;
}
}
}
Expand Down
Loading

0 comments on commit afe4c00

Please sign in to comment.