Skip to content

Commit

Permalink
[yugabyte#21491] DocDB: Increment ClusterConfig version during univer…
Browse files Browse the repository at this point in the history
…se_uuid update and yb-ts-cli to clear universe uuid.

Summary:
This fix addresses the following race condition which is possible when upgrading a universe from a release which does not have master_enable_universe_uuid_heartbeat_check to a release which has universe_uuid generation and checks.

  # Cluster gets upgraded to a release with commit fb98e56 and the feature master_enable_universe_uuid_heartbeat_check is enabled.
  # Reader reads the ClusterConfig at version 'X'.
  # Catalog Manager background thread runs and generates a new universe_uuid, persists it in ClusterConfig and propagates it to all the t-servers.
  # Reader from Step 2 updates the ClusterConfig using ChangeMasterClusterConfigRequestPB with version 'X'.
  # Update from Step 4 succeeds because ClusterConfig version 'X' on disk matches the one in the request 'X' - effectively overwriting the universe_uuid generated in Step 3.
  # Catalog Manager background thread runs again and since universe_uuid is empty, it generates a new universe_uuid again.

Fix is to increment the ClusterConfig version during an update by the background task. Also added checks to ensure that universe_uuid does not modified once set by ChangeMasterClusterConfig.

Added a yb-ts-cli to clear the universe uuid in case we ever run into issues because of this.

**Upgrade/Rollback safety:**
- The proto changes are strictly for the yb-ts-cli only and as such should not have any upgrade implications.
Jira: DB-10377

Test Plan:
- Manually verified with an upgrade test that the clusterconfig version gets bumped up correctly and ChangeMasterClusterConfig fails if it attempts to modify the universe_uuid.
- Manually tested the yb-ts-cli command to validate that it clears the universe_uuid correctly.

ybt integration-tests_master_heartbeat-itest MasterHeartbeatITestWithUpgrade.ClearUniverseUuidToRecoverUniverse

Reviewers: hsunder

Reviewed By: hsunder

Subscribers: ybase, bogdan

Differential Revision: https://phorge.dev.yugabyte.com/D33180
  • Loading branch information
lingamsandeep authored and asrinivasanyb committed Mar 18, 2024
1 parent 1c0d687 commit 9998f76
Show file tree
Hide file tree
Showing 12 changed files with 176 additions and 4 deletions.
11 changes: 11 additions & 0 deletions src/yb/fs/fs_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -339,12 +339,23 @@ Status FsManager::SetUniverseUuidOnTserverInstanceMetadata(
const UniverseUuid& universe_uuid) {
std::lock_guard lock(metadata_mutex_);
SCHECK_NOTNULL(metadata_);
LOG(INFO) << "Setting the universe_uuid to " << universe_uuid;
metadata_->mutable_tserver_instance_metadata()->set_universe_uuid(universe_uuid.ToString());
auto instance_metadata_path = VERIFY_RESULT(GetExistingInstanceMetadataPath());
return pb_util::WritePBContainerToPath(
env_, instance_metadata_path, *metadata_.get(), pb_util::OVERWRITE, pb_util::SYNC);
}

Status FsManager::ClearUniverseUuidOnTserverInstanceMetadata() {
std::lock_guard lock(metadata_mutex_);
SCHECK_NOTNULL(metadata_);
LOG(INFO) << "Clearing the universe_uuid from Instance Metadata";
metadata_->mutable_tserver_instance_metadata()->clear_universe_uuid();
auto instance_metadata_path = VERIFY_RESULT(GetExistingInstanceMetadataPath());
return pb_util::WritePBContainerToPath(
env_, instance_metadata_path, *metadata_.get(), pb_util::OVERWRITE, pb_util::SYNC);
}

Status FsManager::CheckAndOpenFileSystemRoots() {
RETURN_NOT_OK(Init());

Expand Down
2 changes: 2 additions & 0 deletions src/yb/fs/fs_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,8 @@ class FsManager {

Status SetUniverseUuidOnTserverInstanceMetadata(const UniverseUuid& universe_uuid);

Status ClearUniverseUuidOnTserverInstanceMetadata();

// Return the path where InstanceMetadataPB is stored.
std::string GetInstanceMetadataPath(const std::string& root) const;

Expand Down
84 changes: 84 additions & 0 deletions src/yb/integration-tests/master_heartbeat-itest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ DECLARE_bool(master_enable_universe_uuid_heartbeat_check);

namespace yb {

using master::GetMasterClusterConfigResponsePB;
namespace integration_tests {

class MasterHeartbeatITest : public YBTableTestBase {
Expand Down Expand Up @@ -147,6 +148,89 @@ TEST_F(MasterHeartbeatITest, IgnorePeerNotInConfig) {
}, FLAGS_heartbeat_interval_ms * 5ms, "Wait for proper replica locations."));
}

class MasterHeartbeatITestWithUpgrade : public YBTableTestBase {
public:
void SetUp() override {
// Start the cluster without the universe_uuid generation FLAG to test upgrade.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_master_enable_universe_uuid_heartbeat_check) = false;
YBTableTestBase::SetUp();
proxy_cache_ = std::make_unique<rpc::ProxyCache>(client_->messenger());
}

Status GetClusterConfig(GetMasterClusterConfigResponsePB *config_resp) {
const auto* master = VERIFY_RESULT(mini_cluster_->GetLeaderMiniMaster());
return master->catalog_manager().GetClusterConfig(config_resp);
}

Status ClearUniverseUuid() {
for (auto& ts : mini_cluster_->mini_tablet_servers()) {
RETURN_NOT_OK(ts->server()->ClearUniverseUuid());
}
return Status::OK();
}

protected:
std::unique_ptr<rpc::ProxyCache> proxy_cache_;
};

TEST_F(MasterHeartbeatITestWithUpgrade, ClearUniverseUuidToRecoverUniverse) {
GetMasterClusterConfigResponsePB resp;
ASSERT_OK(GetClusterConfig(&resp));
auto cluster_config_version = resp.cluster_config().version();
LOG(INFO) << "Cluster Config version : " << cluster_config_version;

// Attempt to clear universe uuid. Should fail when it is not set.
ASSERT_NOK(ClearUniverseUuid());

// Enable the flag and wait for heartbeat to propagate the universe_uuid.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_master_enable_universe_uuid_heartbeat_check) = true;

// Wait for ClusterConfig version to increase.
ASSERT_OK(LoggedWaitFor([&]() {
if (!GetClusterConfig(&resp).ok()) {
return false;
}

if (!resp.cluster_config().has_universe_uuid()) {
return false;
}

return true;
}, 60s, "Waiting for new universe uuid to be generated"));

ASSERT_GE(resp.cluster_config().version(), cluster_config_version);
LOG(INFO) << "Updated cluster config version:" << resp.cluster_config().version();
LOG(INFO) << "Universe UUID:" << resp.cluster_config().universe_uuid();

// Wait for propagation of universe_uuid.
ASSERT_OK(LoggedWaitFor([&]() {
for (auto& ts : mini_cluster_->mini_tablet_servers()) {
auto uuid_str = ts->server()->fs_manager()->GetUniverseUuidFromTserverInstanceMetadata();
if (!uuid_str.ok() || uuid_str.get() != resp.cluster_config().universe_uuid()) {
return false;
}
}

return true;
}, 60s, "Waiting for tservers to pick up new universe uuid"));

// Verify servers are heartbeating correctly.
master::TSDescriptorVector ts_descs;
ASSERT_OK(mini_cluster_->WaitForTabletServerCount(3, &ts_descs, true /* live_only */));

// Artificially generate a fake universe uuid and propagate that by clearing the universe_uuid.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_master_universe_uuid) = Uuid::Generate().ToString();
ANNOTATE_UNPROTECTED_WRITE(FLAGS_tserver_unresponsive_timeout_ms) = 10 * 1000;

// Heartbeats should first fail due to universe_uuid mismatch.
ASSERT_OK(mini_cluster_->WaitForTabletServerCount(0, &ts_descs, true /* live_only */));

// Once t-server instance metadata is cleared, heartbeats should succeed again.
ASSERT_OK(ClearUniverseUuid());
ASSERT_OK(mini_cluster_->WaitForTabletServerCount(3, &ts_descs, true /* live_only */));
}


class MasterHeartbeatITestWithExternal : public MasterHeartbeatITest {
public:
bool use_external_mini_cluster() { return true; }
Expand Down
6 changes: 6 additions & 0 deletions src/yb/master/catalog_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1659,6 +1659,7 @@ Status CatalogManager::SetUniverseUuidIfNeeded(const LeaderEpoch& epoch) {
universe_uuid);

l.mutable_data()->pb.set_universe_uuid(universe_uuid);
l.mutable_data()->pb.set_version(l.mutable_data()->pb.version() + 1);
RETURN_NOT_OK(sys_catalog_->Upsert(epoch, cluster_config_.get()));
l.Commit();
return Status::OK();
Expand Down Expand Up @@ -12524,6 +12525,11 @@ Status CatalogManager::SetClusterConfig(
return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
}

if (config.universe_uuid() != l->pb.universe_uuid()) {
Status s = STATUS(InvalidArgument, "Config Universe UUID cannot be updated");
return SetupError(resp->mutable_error(), MasterErrorPB::INVALID_CLUSTER_CONFIG, s);
}

// TODO(bogdan): should this live here?
const ReplicationInfoPB& replication_info = config.replication_info();
for (auto& read_replica : replication_info.read_replicas()) {
Expand Down
9 changes: 7 additions & 2 deletions src/yb/master/sys_catalog-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -333,10 +333,15 @@ TEST_F(SysCatalogTest, TestSysCatalogPlacementOperations) {
req.mutable_cluster_config()->set_cluster_uuid("some-cluster-uuid");
auto status = master_->catalog_manager()->SetClusterConfig(&req, &resp);
ASSERT_TRUE(status.IsInvalidArgument());

// Setting the cluster uuid should make the request succeed.
req.mutable_cluster_config()->set_cluster_uuid(config.cluster_uuid());

// Verify that we receive an error when trying to change the universe uuid.
req.mutable_cluster_config()->set_universe_uuid("some-universe-uuid");
status = master_->catalog_manager()->SetClusterConfig(&req, &resp);
ASSERT_TRUE(status.IsInvalidArgument());
req.mutable_cluster_config()->set_universe_uuid(config.universe_uuid());

// Setting the cluster and universe uuid correctly should make the request succeed.
ASSERT_OK(master_->catalog_manager()->SetClusterConfig(&req, &resp));
l.Commit();
}
Expand Down
32 changes: 30 additions & 2 deletions src/yb/tools/ts-cli.cc
Original file line number Diff line number Diff line change
Expand Up @@ -72,12 +72,14 @@ using yb::consensus::RaftConfigPB;
using yb::rpc::Messenger;
using yb::rpc::MessengerBuilder;
using yb::rpc::RpcController;
using yb::server::ServerStatusPB;
using yb::server::ReloadCertificatesRequestPB;
using yb::server::ReloadCertificatesResponsePB;
using yb::server::ServerStatusPB;
using yb::tablet::TabletStatusPB;
using yb::tserver::ClearAllMetaCachesOnServerRequestPB;
using yb::tserver::ClearAllMetaCachesOnServerResponsePB;
using yb::tserver::ClearUniverseUuidRequestPB;
using yb::tserver::ClearUniverseUuidResponsePB;
using yb::tserver::CountIntentsRequestPB;
using yb::tserver::CountIntentsResponsePB;
using yb::tserver::DeleteTabletRequestPB;
Expand Down Expand Up @@ -116,6 +118,7 @@ const char* const kReloadCertificatesOp = "reload_certificates";
const char* const kRemoteBootstrapOp = "remote_bootstrap";
const char* const kListMasterServersOp = "list_master_servers";
const char* const kClearAllMetaCachesOnServerOp = "clear_server_metacache";
const char* const kClearUniverseUuidOp = "clear_universe_uuid";

DEFINE_NON_RUNTIME_string(server_address, "localhost",
"Address of server to run against");
Expand Down Expand Up @@ -251,6 +254,9 @@ class TsAdminClient {

Status ClearAllMetaCachesOnServer();

// Clear Universe Uuid.
Status ClearUniverseUuid();

private:
std::string addr_;
MonoDelta timeout_;
Expand Down Expand Up @@ -694,6 +700,22 @@ Status TsAdminClient::ClearAllMetaCachesOnServer() {
return Status::OK();
}

Status TsAdminClient::ClearUniverseUuid() {
CHECK(initted_);
ClearUniverseUuidRequestPB req;
ClearUniverseUuidResponsePB resp;
RpcController rpc;

rpc.set_timeout(timeout_);
RETURN_NOT_OK(ts_proxy_->ClearUniverseUuid(req, &resp, &rpc));
if (resp.has_error()) {
return StatusFromPB(resp.error().status());
}

std::cout << "Universe UUID cleared from Instance Metadata" << std::endl;
return Status::OK();
}

namespace {

void SetUsage(const char* argv0) {
Expand Down Expand Up @@ -721,7 +743,8 @@ void SetUsage(const char* argv0) {
<< " <tablet_id> <number of indexes> <index list> <start_key> <number of rows>\n"
<< " " << kReloadCertificatesOp << "\n"
<< " " << kRemoteBootstrapOp << " <server address to bootstrap from> <tablet_id>\n"
<< " " << kListMasterServersOp << "\n";
<< " " << kListMasterServersOp << "\n"
<< " " << kClearUniverseUuidOp << "\n";
google::SetUsageMessage(str.str());
}

Expand Down Expand Up @@ -942,6 +965,11 @@ static int TsCliMain(int argc, char** argv) {
RETURN_NOT_OK_PREPEND_FROM_MAIN(
client.ClearAllMetaCachesOnServer(),
"Unable to clear the meta-cache on tablet server with address " + addr);
} else if (op == kClearUniverseUuidOp) {
CHECK_ARGC_OR_RETURN_WITH_USAGE(op, 2);

RETURN_NOT_OK_PREPEND_FROM_MAIN(client.ClearUniverseUuid(),
"Unable to clear universe uuid on " + addr);
} else {
std::cerr << "Invalid operation: " << op << std::endl;
google::ShowUsageWithFlagsRestrict(argv[0], __FILE__);
Expand Down
10 changes: 10 additions & 0 deletions src/yb/tserver/tablet_server.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1283,6 +1283,15 @@ Status TabletServer::XClusterHandleMasterHeartbeatResponse(
return Status::OK();
}

Status TabletServer::ClearUniverseUuid() {
auto instance_universe_uuid_str = VERIFY_RESULT(
fs_manager_->GetUniverseUuidFromTserverInstanceMetadata());
auto instance_universe_uuid = VERIFY_RESULT(UniverseUuid::FromString(instance_universe_uuid_str));
SCHECK_EQ(false, instance_universe_uuid.IsNil(), IllegalState,
"universe_uuid is not set in instance metadata");
return fs_manager_->ClearUniverseUuidOnTserverInstanceMetadata();
}

Status TabletServer::ValidateAndMaybeSetUniverseUuid(const UniverseUuid& universe_uuid) {
auto instance_universe_uuid_str = VERIFY_RESULT(
fs_manager_->GetUniverseUuidFromTserverInstanceMetadata());
Expand All @@ -1294,6 +1303,7 @@ Status TabletServer::ValidateAndMaybeSetUniverseUuid(const UniverseUuid& univers
"uuid is $1", universe_uuid.ToString(), instance_universe_uuid.ToString()));
return Status::OK();
}

return fs_manager_->SetUniverseUuidOnTserverInstanceMetadata(universe_uuid);
}

Expand Down
2 changes: 2 additions & 0 deletions src/yb/tserver/tablet_server.h
Original file line number Diff line number Diff line change
Expand Up @@ -310,6 +310,8 @@ class TabletServer : public DbServerBase, public TabletServerIf {

Status ValidateAndMaybeSetUniverseUuid(const UniverseUuid& universe_uuid);

Status ClearUniverseUuid();

XClusterConsumerIf* GetXClusterConsumer() const;

// Mark the CDC service as enabled via heartbeat.
Expand Down
11 changes: 11 additions & 0 deletions src/yb/tserver/tablet_service.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2929,6 +2929,17 @@ void TabletServiceImpl::ListMasterServers(const ListMasterServersRequestPB* req,
context.RespondSuccess();
}

void TabletServiceImpl::ClearUniverseUuid(const ClearUniverseUuidRequestPB* req,
ClearUniverseUuidResponsePB* resp,
rpc::RpcContext context) {
const Status s = server_->tablet_manager()->server()->ClearUniverseUuid();
if (!s.ok()) {
SetupErrorAndRespond(resp->mutable_error(), s, &context);
return;
}
context.RespondSuccess();
}

void TabletServiceImpl::CheckTserverTabletHealth(const CheckTserverTabletHealthRequestPB* req,
CheckTserverTabletHealthResponsePB* resp,
rpc::RpcContext context) {
Expand Down
4 changes: 4 additions & 0 deletions src/yb/tserver/tablet_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,10 @@ class TabletServiceImpl : public TabletServerServiceIf, public ReadTabletProvide
ListMasterServersResponsePB* resp,
rpc::RpcContext context) override;

void ClearUniverseUuid(const ClearUniverseUuidRequestPB* req,
ClearUniverseUuidResponsePB* resp,
rpc::RpcContext context) override;

void GetLockStatus(const GetLockStatusRequestPB* req,
GetLockStatusResponsePB* resp,
rpc::RpcContext context) override;
Expand Down
7 changes: 7 additions & 0 deletions src/yb/tserver/tserver.proto
Original file line number Diff line number Diff line change
Expand Up @@ -373,3 +373,10 @@ message CheckTserverTabletHealthResponsePB {
message ClearAllMetaCachesOnServerRequestPB {}

message ClearAllMetaCachesOnServerResponsePB {}

message ClearUniverseUuidRequestPB {
}

message ClearUniverseUuidResponsePB {
optional TabletServerErrorPB error = 1;
}
2 changes: 2 additions & 0 deletions src/yb/tserver/tserver_service.proto
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,8 @@ service TabletServerService {

rpc ClearAllMetaCachesOnServer(ClearAllMetaCachesOnServerRequestPB)
returns (ClearAllMetaCachesOnServerResponsePB);

rpc ClearUniverseUuid(ClearUniverseUuidRequestPB) returns (ClearUniverseUuidResponsePB);
}

// Note: Either among transactions_by_tablet or transaction_ids should be set. Both the fields
Expand Down

0 comments on commit 9998f76

Please sign in to comment.