Skip to content

Commit

Permalink
[#19954] docdb: Register both tablet split children atomically
Browse files Browse the repository at this point in the history
Summary:
Introduce `RegisterNewTabletsForSplit` which adds both new split tablet children at the same time. This also ensures that the `partitions_` field of `TableInfo` always has a complete covering of the keyspace during splits (before we could have the case where only one tablet was registered).

Fixes #19954
Jira: DB-8923

Test Plan: Existing tests

Reviewers: jhe

Reviewed By: jhe

Subscribers: ybase, bogdan

Differential Revision: https://phorge.dev.yugabyte.com/D31428
  • Loading branch information
SrivastavaAnubhav committed Jul 25, 2024
1 parent 399f165 commit 5a4bbd4
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 61 deletions.
4 changes: 2 additions & 2 deletions src/yb/integration-tests/tablet-split-itest-base.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,7 @@ Status TabletSplitExternalMiniClusterITest::SplitTabletCrashMaster(
RETURN_NOT_OK(FlushTestTable());

// Split tablet should crash before creating either tablet
RETURN_NOT_OK(cluster_->SetFlagOnMasters("TEST_crash_after_creating_single_split_tablet", "1.0"));
RETURN_NOT_OK(cluster_->SetFlagOnMasters("TEST_crash_after_registering_split_tablets", "1.0"));

// Retrieve split key from a leader peer
if (split_partition_key) {
Expand All @@ -1027,7 +1027,7 @@ Status TabletSplitExternalMiniClusterITest::SplitTabletCrashMaster(
}

RETURN_NOT_OK(RestartAllMasters(cluster_.get()));
RETURN_NOT_OK(cluster_->SetFlagOnMasters("TEST_crash_after_creating_single_split_tablet", "0.0"));
RETURN_NOT_OK(cluster_->SetFlagOnMasters("TEST_crash_after_registering_split_tablets", "0.0"));

if (change_split_boundary) {
RETURN_NOT_OK(WriteRows(kNumRows * 2, kNumRows));
Expand Down
6 changes: 3 additions & 3 deletions src/yb/integration-tests/tablet-split-itest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ DECLARE_uint64(prevent_split_for_ttl_tables_for_seconds);
DECLARE_bool(sort_automatic_tablet_splitting_candidates);
DECLARE_int32(intents_flush_max_delay_ms);
DECLARE_int32(index_block_restart_interval);
DECLARE_bool(TEST_error_after_creating_single_split_tablet);
DECLARE_bool(TEST_error_after_registering_split_tablets);
DECLARE_bool(TEST_pause_before_send_hinted_election);
DECLARE_bool(TEST_skip_election_when_fail_detected);
DECLARE_int32(scheduled_full_compaction_frequency_hours);
Expand Down Expand Up @@ -3255,7 +3255,7 @@ class TabletSplitSingleServerITestWithPartition :
TEST_P(TabletSplitSingleServerITestWithPartition, TestSplitEncodedKeyAfterBreakInTheMiddleOfSplit) {
// Make catalog manager to do only Upsert in order to emulate master error/crash behaviour in the
// middle of split. Restart is required to be sure the flags change is seen at the master's side.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_error_after_creating_single_split_tablet) = true;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_error_after_registering_split_tablets) = true;
ASSERT_OK(cluster_->RestartSync());
SetNumTablets(1);

Expand Down Expand Up @@ -3285,7 +3285,7 @@ TEST_P(TabletSplitSingleServerITestWithPartition, TestSplitEncodedKeyAfterBreakI
ASSERT_NOK(status) << "Corresponding split is expected to fail!";

// Reset flag to emulate partitions re-calculation.
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_error_after_creating_single_split_tablet) = false;
ANNOTATE_UNPROTECTED_WRITE(FLAGS_TEST_error_after_registering_split_tablets) = false;
ASSERT_OK(cluster_->RestartSync());

// Split should pass without any error.
Expand Down
86 changes: 41 additions & 45 deletions src/yb/master/catalog_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -460,12 +460,12 @@ DEFINE_test_flag(int32, slowdown_alter_table_rpcs_ms, 0,
DEFINE_test_flag(bool, reject_delete_not_serving_tablet_rpc, false,
"Whether to reject DeleteNotServingTablet RPC.");

DEFINE_test_flag(double, crash_after_creating_single_split_tablet, 0.0,
"Crash inside CatalogManager::RegisterNewTabletForSplit after calling Upsert.");
DEFINE_test_flag(double, crash_after_registering_split_tablets, 0.0,
"Crash inside CatalogManager::RegisterNewTabletsForSplit after calling Upsert.");

DEFINE_test_flag(bool, error_after_creating_single_split_tablet, false,
"Return an error inside CatalogManager::RegisterNewTabletForSplit "
"after calling Upsert.");
DEFINE_test_flag(bool, error_after_registering_split_tablets, false,
"Return an error inside CatalogManager::RegisterNewTabletsForSplit "
"after calling Upsert.");

DEFINE_RUNTIME_bool(enable_delete_truncate_xcluster_replicated_table, false,
"When set, enables deleting/truncating YCQL tables currently in xCluster replication. "
Expand Down Expand Up @@ -497,9 +497,6 @@ DEFINE_RUNTIME_bool(batch_ysql_system_tables_metadata, true,
"Whether change metadata operation and SysCatalogTable upserts for ysql system tables during a "
"create database is performed one by one or batched together");

DEFINE_test_flag(bool, pause_split_child_registration,
false, "Pause split after registering one child");

DEFINE_test_flag(bool, keep_docdb_table_on_ysql_drop_table, false,
"When enabled does not delete tables from the docdb layer, resulting in YSQL "
"tables only being dropped in the postgres layer.");
Expand Down Expand Up @@ -3056,15 +3053,14 @@ Status CatalogManager::DoSplitTablet(
}

// Add new split children to the sys catalog.
for (auto& new_tablet : new_tablets) {
RETURN_NOT_OK(RegisterNewTabletForSplit(
source_tablet_info.get(), new_tablet, epoch, &source_table_lock, &source_tablet_lock));
}
RETURN_NOT_OK(RegisterNewTabletsForSplit(
source_tablet_info.get(), new_tablets, epoch, &source_table_lock, &source_tablet_lock));

source_tablet_lock.Commit();
source_table_lock.Commit();
}
// At this point, the tablets exist in the table but not in the tablet map. Callers should retry
// if the tablets are not found in the tablet map.
// At this point, the tablets exist in the table but not in the tablet map. Users that need these
// tablets should retry until the tablets are inserted into the tablet map.

MAYBE_FAULT(FLAGS_TEST_fault_crash_after_registering_split_children);

Expand Down Expand Up @@ -7401,54 +7397,54 @@ Status CatalogManager::IsAlterTableDone(const IsAlterTableDoneRequestPB* req,
return Status::OK();
}

Status CatalogManager::RegisterNewTabletForSplit(
TabletInfo* source_tablet_info, const TabletInfoPtr& new_tablet, const LeaderEpoch& epoch,
TableInfo::WriteLock* table_write_lock, TabletInfo::WriteLock* tablet_write_lock) {
Status CatalogManager::RegisterNewTabletsForSplit(
TabletInfo* source_tablet_info, const std::vector<TabletInfoPtr>& new_tablets,
const LeaderEpoch& epoch, TableInfo::WriteLock* table_write_lock,
TabletInfo::WriteLock* parent_write_lock) {
const auto tablet_lock = source_tablet_info->LockForRead();

auto table = source_tablet_info->table();
const auto& source_tablet_meta = tablet_lock->pb;

auto& new_tablet_meta = new_tablet->mutable_metadata()->mutable_dirty()->pb;
new_tablet_meta.mutable_committed_consensus_state()->CopyFrom(
source_tablet_meta.committed_consensus_state());
const auto new_split_depth = source_tablet_meta.split_depth() + 1;
new_tablet_meta.set_split_depth(new_split_depth);
new_tablet_meta.set_split_parent_tablet_id(source_tablet_info->tablet_id());
// TODO(tsplit): consider and handle failure scenarios, for example:
// - Crash or leader failover before sending out the split tasks.
// - Long enough partition while trying to send out the splits so that they timeout and
// not get executed.
for (auto& new_tablet : new_tablets) {
auto& new_tablet_meta = new_tablet->mutable_metadata()->mutable_dirty()->pb;
new_tablet_meta.mutable_committed_consensus_state()->CopyFrom(
source_tablet_meta.committed_consensus_state());
new_tablet_meta.set_split_depth(new_split_depth);
new_tablet_meta.set_split_parent_tablet_id(source_tablet_info->tablet_id());

parent_write_lock->mutable_data()->pb.add_split_tablet_ids(new_tablet->id());
}

int new_partition_list_version;
auto& table_pb = table_write_lock->mutable_data()->pb;
new_partition_list_version = table_pb.partition_list_version() + 1;
table_pb.set_partition_list_version(new_partition_list_version);

tablet_write_lock->mutable_data()->pb.add_split_tablet_ids(new_tablet->id());
RETURN_NOT_OK(sys_catalog_->Upsert(epoch, table, new_tablet, source_tablet_info));
RETURN_NOT_OK(sys_catalog_->Upsert(epoch, table, new_tablets, source_tablet_info));

TEST_PAUSE_IF_FLAG(TEST_pause_split_child_registration);
MAYBE_FAULT(FLAGS_TEST_crash_after_creating_single_split_tablet);
if (PREDICT_FALSE(FLAGS_TEST_error_after_creating_single_split_tablet)) {
MAYBE_FAULT(FLAGS_TEST_crash_after_registering_split_tablets);
if (PREDICT_FALSE(FLAGS_TEST_error_after_registering_split_tablets)) {
return STATUS(IllegalState, "TEST: error happened while registering a new tablet.");
}

// This has to be done while the table lock is held since TableInfo::partitions_ must be updated
// at the same time as the partition list version.
RETURN_NOT_OK(table->AddTablet(new_tablet));

const PartitionPB& partition = new_tablet->metadata().state().pb.partition();
LOG(INFO) << "Registered new tablet " << new_tablet->tablet_id() << " (partition_key_start: "
<< Slice(partition.partition_key_start()).ToDebugString(/* max_length = */ 64)
<< ", partition_key_end: "
<< Slice(partition.partition_key_end()).ToDebugString(/* max_length = */ 64)
<< ", split_depth: " << new_split_depth << ") to split the tablet "
<< source_tablet_info->tablet_id() << " (" << AsString(source_tablet_meta.partition())
<< ") for table " << table->ToString()
<< ", new partition_list_version: " << new_partition_list_version;

RETURN_NOT_OK(table->AddTablets(new_tablets));
// TODO: We use this pattern in other places, but what if concurrent thread accesses not yet
// committed TabletInfo from the `table` ?
new_tablet->mutable_metadata()->CommitMutation();
for (auto& new_tablet : new_tablets) {
const PartitionPB& partition = new_tablet->metadata().state().pb.partition();
LOG(INFO) << "Registered new tablet " << new_tablet->tablet_id() << " (partition_key_start: "
<< Slice(partition.partition_key_start()).ToDebugString(/* max_length = */ 64)
<< ", partition_key_end: "
<< Slice(partition.partition_key_end()).ToDebugString(/* max_length = */ 64)
<< ", split_depth: " << new_split_depth << ") to split the tablet "
<< source_tablet_info->tablet_id() << " (" << AsString(source_tablet_meta.partition())
<< ") for table " << table->ToString()
<< ", new partition_list_version: " << new_partition_list_version;
new_tablet->mutable_metadata()->CommitMutation();
}
return Status::OK();
}

Expand Down
10 changes: 5 additions & 5 deletions src/yb/master/catalog_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -2035,12 +2035,12 @@ class CatalogManager : public tserver::TabletPeerLookupIf,
// partitions_vtable_cache_refresh_secs seconds.
void RebuildYQLSystemPartitions();

// Registers `new_tablet` for the same table as `source_tablet_info` tablet.
// Registers `new_tablets` for the same table as `source_tablet_info` tablet.
// Does not change any other tablets and their partitions.
Status RegisterNewTabletForSplit(
TabletInfo* source_tablet_info, const TabletInfoPtr& new_tablet, const LeaderEpoch& epoch,
TableInfo::WriteLock* table_write_lock, TabletInfo::WriteLock* tablet_write_lock)
EXCLUDES(mutex_);
Status RegisterNewTabletsForSplit(
TabletInfo* source_tablet_info, const std::vector<TabletInfoPtr>& new_tablets,
const LeaderEpoch& epoch, TableInfo::WriteLock* table_write_lock,
TabletInfo::WriteLock* tablet_write_lock) EXCLUDES(mutex_);

Result<TabletInfoPtr> GetTabletInfoUnlocked(const TabletId& tablet_id)
REQUIRES_SHARED(mutex_);
Expand Down
6 changes: 0 additions & 6 deletions src/yb/tools/yb-admin-snapshot-schedule-test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4117,12 +4117,6 @@ TEST_F(YbAdminRestoreDuringSplit, RestoreBeforeGetSplitKey) {
ASSERT_OK(RunTest(1 /* expected num tablets after restore */));
}

// Restore to a time after one of the child tablets is registered by the master.
TEST_F(YbAdminRestoreDuringSplit, RestoreAfterOneChildRegistered) {
SetDelayFlag("TEST_pause_split_child_registration", /* set on master */ true);
ASSERT_OK(RunTest(1 /* expected num tablets after restore */));
}

// Restore to a time after both the child tablets are registered by the master but
// before the SPLIT_OP is applied.
TEST_F(YbAdminRestoreDuringSplit, RestoreBeforeSplitOpIsApplied) {
Expand Down

0 comments on commit 5a4bbd4

Please sign in to comment.