From 6404af4847b59d6c240bb2eedc107039ca9fd650 Mon Sep 17 00:00:00 2001 From: Sergei Politov Date: Thu, 26 Dec 2024 16:05:16 +0300 Subject: [PATCH] [#24310] DocDB: Fix wrong retention time could be picked when snapshot schedule is enabled Summary: When snapshot schedule is active, we retain records from being compacted until there is snapshot that contains this record. Due to race condition we could get into situation where tablet replica consider snapshot as taken while it does not have it. TServer receives last schedule snapshot time via heartbeat mechanism. There are 2 places that could lead to described bug. 1) Master sends last snapshot time for schedule that is not yet complete. 2) Snapshot is taken on leader and one follower, while another follower did not complete snapshot yet. Fixed by switching to maintain last snapshot time locally and store this information in filesystem. When snapshot is created for some schedule, we create file in snapshots folder named: last_snapshot.[SCHEDULE_ID].[SNAPSHOT_TIME]. And remove previous file for the same schedule. During tablet bootstrap we list all files in snapshot folder, recovering latest snapshot times and removing outdated files. Jira: DB-13199 Test Plan: ./yb_build.sh release -n 400 --gtest_filter YbAdminSnapshotScheduleTest.PgsqlAddColumnCompactWithPackedRow -- -p 8 Reviewers: timur Reviewed By: timur Subscribers: ybase Tags: #jenkins-ready Differential Revision: https://phorge.dev.yugabyte.com/D40893 --- src/yb/docdb/consensus_frontier-test.cc | 19 +-- src/yb/docdb/docdb_compaction_context.h | 14 +- .../operations/history_cutoff_operation.cc | 10 +- src/yb/tablet/tablet.cc | 22 +++- src/yb/tablet/tablet.h | 3 + src/yb/tablet/tablet_retention_policy.cc | 33 +---- src/yb/tablet/tablet_retention_policy.h | 1 - src/yb/tablet/tablet_snapshots.cc | 123 ++++++++++++++++-- src/yb/tablet/tablet_snapshots.h | 8 ++ .../tools/yb-admin-snapshot-schedule-test.cc | 1 + src/yb/tserver/ts_tablet_manager.cc | 42 +++--- 11 files changed, 192 insertions(+), 84 deletions(-) diff --git a/src/yb/docdb/consensus_frontier-test.cc b/src/yb/docdb/consensus_frontier-test.cc index 9986a436eb0a..bd377a5cef70 100644 --- a/src/yb/docdb/consensus_frontier-test.cc +++ b/src/yb/docdb/consensus_frontier-test.cc @@ -47,7 +47,7 @@ TEST_F(ConsensusFrontierTest, TestUpdates) { EXPECT_TRUE(frontier.Equals(frontier)); EXPECT_EQ( "{ op_id: 0.0 hybrid_time: " - "history_cutoff: { cotables cutoff: , primary cutoff: } " + "history_cutoff: { cotables_cutoff_ht: primary_cutoff_ht: } " "max_value_level_ttl_expiration_time: primary_schema_version: " "cotable_schema_versions: [] global_filter: cotables_filter: [] }", frontier.ToString()); @@ -62,7 +62,8 @@ TEST_F(ConsensusFrontierTest, TestUpdates) { ConsensusFrontier frontier{{1, 1}, 1000_usec_ht, { 500_usec_ht, 500_usec_ht }}; EXPECT_EQ( "{ op_id: 1.1 hybrid_time: { physical: 1000 } " - "history_cutoff: { cotables cutoff: { physical: 500 }, primary cutoff: { physical: 500 } } " + "history_cutoff: { cotables_cutoff_ht: { physical: 500 } " + "primary_cutoff_ht: { physical: 500 } } " "max_value_level_ttl_expiration_time: primary_schema_version: " "cotable_schema_versions: [] global_filter: cotables_filter: [] }", frontier.ToString()); @@ -122,7 +123,7 @@ TEST_F(ConsensusFrontierTest, TestUpdates) { EXPECT_EQ( PbToString(pb), "{ op_id: 0.0 hybrid_time: " - "history_cutoff: { cotables cutoff: , primary cutoff: } " + "history_cutoff: { cotables_cutoff_ht: primary_cutoff_ht: } " "max_value_level_ttl_expiration_time: primary_schema_version: " "cotable_schema_versions: [] global_filter: cotables_filter: [] }"); @@ -131,7 +132,7 @@ TEST_F(ConsensusFrontierTest, TestUpdates) { EXPECT_EQ( PbToString(pb), "{ op_id: 2.3 hybrid_time: " - "history_cutoff: { cotables cutoff: , primary cutoff: } " + "history_cutoff: { cotables_cutoff_ht: primary_cutoff_ht: } " "max_value_level_ttl_expiration_time: primary_schema_version: " "cotable_schema_versions: [] global_filter: cotables_filter: [] }"); @@ -139,7 +140,7 @@ TEST_F(ConsensusFrontierTest, TestUpdates) { EXPECT_EQ( PbToString(pb), "{ op_id: 2.3 hybrid_time: { physical: 24 logical: 1696 } " - "history_cutoff: { cotables cutoff: , primary cutoff: } " + "history_cutoff: { cotables_cutoff_ht: primary_cutoff_ht: } " "max_value_level_ttl_expiration_time: primary_schema_version: " "cotable_schema_versions: [] global_filter: cotables_filter: [] }"); @@ -147,8 +148,8 @@ TEST_F(ConsensusFrontierTest, TestUpdates) { EXPECT_EQ( PbToString(pb), "{ op_id: 2.3 hybrid_time: { physical: 24 logical: 1696 } " - "history_cutoff: { cotables cutoff: , " - "primary cutoff: { physical: 48 logical: 3392 } } " + "history_cutoff: { cotables_cutoff_ht: " + "primary_cutoff_ht: { physical: 48 logical: 3392 } } " "max_value_level_ttl_expiration_time: primary_schema_version: " "cotable_schema_versions: [] global_filter: cotables_filter: [] }"); @@ -156,8 +157,8 @@ TEST_F(ConsensusFrontierTest, TestUpdates) { EXPECT_EQ( PbToString(pb), "{ op_id: 2.3 hybrid_time: { physical: 24 logical: 1696 } " - "history_cutoff: { cotables cutoff: { physical: 48 logical: 3392 }, " - "primary cutoff: { physical: 48 logical: 3392 } } " + "history_cutoff: { cotables_cutoff_ht: { physical: 48 logical: 3392 } " + "primary_cutoff_ht: { physical: 48 logical: 3392 } } " "max_value_level_ttl_expiration_time: primary_schema_version: " "cotable_schema_versions: [] global_filter: cotables_filter: [] }"); } diff --git a/src/yb/docdb/docdb_compaction_context.h b/src/yb/docdb/docdb_compaction_context.h index 3b92ef70f244..2932fe4aa439 100644 --- a/src/yb/docdb/docdb_compaction_context.h +++ b/src/yb/docdb/docdb_compaction_context.h @@ -57,13 +57,23 @@ std::optional PackedRowVersion(TableType table_type, bo struct HistoryCutoff { // Set only on the master and applies to cotables. HybridTime cotables_cutoff_ht; + // Used everywhere else i.e. for the sys catalog table on the master, // colocated tables on tservers and non-colocated tables on tservers. HybridTime primary_cutoff_ht; + void MakeAtLeast(const HistoryCutoff& rhs) { + cotables_cutoff_ht.MakeAtLeast(rhs.cotables_cutoff_ht); + primary_cutoff_ht.MakeAtLeast(rhs.primary_cutoff_ht); + } + + void MakeAtMost(const HistoryCutoff& rhs) { + cotables_cutoff_ht.MakeAtMost(rhs.cotables_cutoff_ht); + primary_cutoff_ht.MakeAtMost(rhs.primary_cutoff_ht); + } + std::string ToString() const { - return Format("{ cotables cutoff: $0, primary cutoff: $1 }", - cotables_cutoff_ht, primary_cutoff_ht); + return YB_STRUCT_TO_STRING(cotables_cutoff_ht, primary_cutoff_ht); } }; diff --git a/src/yb/tablet/operations/history_cutoff_operation.cc b/src/yb/tablet/operations/history_cutoff_operation.cc index 89c6bfd1c19e..4efe6f3beca0 100644 --- a/src/yb/tablet/operations/history_cutoff_operation.cc +++ b/src/yb/tablet/operations/history_cutoff_operation.cc @@ -40,12 +40,10 @@ consensus::LWHistoryCutoffPB* RequestTraits::Mutab } Status HistoryCutoffOperation::Apply(int64_t leader_term) { - auto primary_cutoff = request()->has_primary_cutoff_ht() ? - HybridTime(request()->primary_cutoff_ht()) : HybridTime(); - auto cotables_cutoff = request()->has_cotables_cutoff_ht() ? - HybridTime(request()->cotables_cutoff_ht()) : HybridTime(); - docdb::HistoryCutoff history_cutoff( - { cotables_cutoff, primary_cutoff }); + docdb::HistoryCutoff history_cutoff = { + .cotables_cutoff_ht = HybridTime::FromPB(request()->cotables_cutoff_ht()), + .primary_cutoff_ht = HybridTime::FromPB(request()->primary_cutoff_ht()) + }; VLOG_WITH_PREFIX(2) << "History cutoff replicated " << op_id() << ": " << history_cutoff; diff --git a/src/yb/tablet/tablet.cc b/src/yb/tablet/tablet.cc index b0138fb49df6..70a6add65306 100644 --- a/src/yb/tablet/tablet.cc +++ b/src/yb/tablet/tablet.cc @@ -308,8 +308,7 @@ using namespace std::literals; // NOLINT using rocksdb::SequenceNumber; -namespace yb { -namespace tablet { +namespace yb::tablet { using strings::Substitute; @@ -647,8 +646,9 @@ Tablet::Tablet(const TabletInitData& data) log_prefix_suffix_(data.log_prefix_suffix), is_sys_catalog_(data.is_sys_catalog), txns_enabled_(data.txns_enabled), + allowed_history_cutoff_provider_(data.allowed_history_cutoff_provider), retention_policy_(std::make_shared( - clock_, data.allowed_history_cutoff_provider, metadata_.get())), + clock_, [this](const auto&) { return AllowedHistoryCutoff(); }, metadata_.get())), full_compaction_pool_(data.full_compaction_pool), admin_triggered_compaction_pool_(data.admin_triggered_compaction_pool), ts_post_split_compaction_added_(std::move(data.post_split_compaction_added)), @@ -976,6 +976,7 @@ std::string Tablet::LogPrefix(docdb::StorageDbType db_type) const { Status Tablet::OpenKeyValueTablet() { auto common_options = VERIFY_RESULT(CommonRocksDBOptions()); + RETURN_NOT_OK(snapshots_->Open()); RETURN_NOT_OK(OpenRegularDB(common_options)); RETURN_NOT_OK(OpenIntentsDB(common_options)); @@ -5564,6 +5565,18 @@ Status Tablet::GetTabletKeyRanges( FATAL_INVALID_ENUM_VALUE(Direction, direction); } +docdb::HistoryCutoff Tablet::AllowedHistoryCutoff() { + docdb::HistoryCutoff result = { + .cotables_cutoff_ht = HybridTime::kInvalid, + .primary_cutoff_ht = HybridTime::kMax, + }; + if (allowed_history_cutoff_provider_) { + result.MakeAtMost(allowed_history_cutoff_provider_(metadata_.get())); + } + result.primary_cutoff_ht.MakeAtMost(snapshots_->AllowedHistoryCutoff()); + return result; +} + // ------------------------------------------------------------------------------------------------ Result ScopedReadOperation::Create( @@ -5635,5 +5648,4 @@ Result> ListFiles(const std:: return result; } -} // namespace tablet -} // namespace yb +} // namespace yb::tablet diff --git a/src/yb/tablet/tablet.h b/src/yb/tablet/tablet.h index a8f872246659..7d5f0347fe49 100644 --- a/src/yb/tablet/tablet.h +++ b/src/yb/tablet/tablet.h @@ -1244,6 +1244,8 @@ class Tablet : public AbstractTablet, REQUIRES(vector_indexes_mutex_); docdb::VectorIndexesPtr VectorIndexesList() const EXCLUDES(vector_indexes_mutex_); + docdb::HistoryCutoff AllowedHistoryCutoff(); + mutable std::mutex flush_filter_mutex_; std::function mem_table_flush_filter_factory_ GUARDED_BY(flush_filter_mutex_); @@ -1274,6 +1276,7 @@ class Tablet : public AbstractTablet, std::function num_sst_files_changed_listener_ GUARDED_BY(num_sst_files_changed_listener_mutex_); + AllowedHistoryCutoffProvider allowed_history_cutoff_provider_; std::shared_ptr retention_policy_; std::mutex full_compaction_token_mutex_; diff --git a/src/yb/tablet/tablet_retention_policy.cc b/src/yb/tablet/tablet_retention_policy.cc index f951e713cab8..abeb90ef8e44 100644 --- a/src/yb/tablet/tablet_retention_policy.cc +++ b/src/yb/tablet/tablet_retention_policy.cc @@ -93,29 +93,10 @@ TabletRetentionPolicy::TabletRetentionPolicy( } } -void TabletRetentionPolicy::MakeAtLeast(HistoryCutoff value) { - if (value.cotables_cutoff_ht) { - committed_history_cutoff_information_.cotables_cutoff_ht = std::max( - committed_history_cutoff_information_.cotables_cutoff_ht, - value.cotables_cutoff_ht); - } - if (value.primary_cutoff_ht) { - committed_history_cutoff_information_.primary_cutoff_ht = std::max( - committed_history_cutoff_information_.primary_cutoff_ht, - value.primary_cutoff_ht); - } -} - -HistoryCutoff TabletRetentionPolicy::UpdateCommittedHistoryCutoff( - HistoryCutoff value) { +HistoryCutoff TabletRetentionPolicy::UpdateCommittedHistoryCutoff(HistoryCutoff value) { std::lock_guard lock(mutex_); - if (!value.cotables_cutoff_ht && !value.primary_cutoff_ht) { - return committed_history_cutoff_information_; - } - VLOG_WITH_PREFIX(4) << __func__ << "(" << value << ")"; - - MakeAtLeast(value); + committed_history_cutoff_information_.MakeAtLeast(value); return committed_history_cutoff_information_; } @@ -129,7 +110,7 @@ HistoryRetentionDirective TabletRetentionPolicy::GetRetentionDirective() { } else { history_cutoff = EffectiveHistoryCutoff(); VLOG(4) << "Effective history cutoff " << history_cutoff; - MakeAtLeast(history_cutoff); + committed_history_cutoff_information_.MakeAtLeast(history_cutoff); } } @@ -185,14 +166,14 @@ HybridTime TabletRetentionPolicy::GetEarliestAllowedReadHt() { committed_history_cutoff_information_.primary_cutoff_ht); } -HistoryCutoff TabletRetentionPolicy::HistoryCutoffToPropagate( - HybridTime last_write_ht) { +HistoryCutoff TabletRetentionPolicy::HistoryCutoffToPropagate(HybridTime last_write_ht) { std::lock_guard lock(mutex_); auto now = CoarseMonoClock::now(); - VLOG_WITH_PREFIX(4) << __func__ << "(" << last_write_ht << "), left to wait: " - << MonoDelta(next_history_cutoff_propagation_ - now); + VLOG_WITH_PREFIX_AND_FUNC(4) + << ", last_write_ht: " << last_write_ht << ", left to wait: " + << MonoDelta(next_history_cutoff_propagation_ - now); HybridTime earliest_ht = GetEarliestAllowedReadHt(); if (disable_counter_ != 0 || !FLAGS_enable_history_cutoff_propagation || now < next_history_cutoff_propagation_ || last_write_ht <= earliest_ht) { diff --git a/src/yb/tablet/tablet_retention_policy.h b/src/yb/tablet/tablet_retention_policy.h index 14bbb94c5c8b..83933fd54e91 100644 --- a/src/yb/tablet/tablet_retention_policy.h +++ b/src/yb/tablet/tablet_retention_policy.h @@ -66,7 +66,6 @@ class TabletRetentionPolicy : public docdb::HistoryRetentionPolicy { docdb::HistoryCutoff SanitizeHistoryCutoff( docdb::HistoryCutoff proposed_history_cutoff) REQUIRES(mutex_); - void MakeAtLeast(docdb::HistoryCutoff value) REQUIRES(mutex_); HybridTime GetEarliestAllowedReadHt() REQUIRES(mutex_); const std::string& LogPrefix() const { diff --git a/src/yb/tablet/tablet_snapshots.cc b/src/yb/tablet/tablet_snapshots.cc index fb01fd6eb522..8c594be89fe7 100644 --- a/src/yb/tablet/tablet_snapshots.cc +++ b/src/yb/tablet/tablet_snapshots.cc @@ -44,6 +44,7 @@ #include "yb/util/scope_exit.h" #include "yb/util/status_format.h" #include "yb/util/status_log.h" +#include "yb/util/stol_utils.h" using std::string; @@ -57,6 +58,9 @@ DEFINE_test_flag(int32, delay_tablet_export_metadata_ms, 0, "How much time in milliseconds to delay before exporting tablet metadata during " "snapshot creation."); +DEFINE_test_flag(double, delay_create_snapshot_probability, 0.0, + "The probability to delay creating snapshot by 1 second"); + DEFINE_RUNTIME_int32(max_wait_for_aborting_transactions_during_restore_ms, 200, "How much time in milliseconds to wait for tablet transactions to abort while " "applying the raft restore operation to a tablet."); @@ -69,11 +73,27 @@ namespace { const std::string kTempSnapshotDirSuffix = ".tmp"; const std::string kTabletMetadataFile = "tablet.metadata"; +const std::string kLastSnapshotPrefix = "last_snapshot."; std::string TabletMetadataFile(const std::string& dir) { return JoinPathSegments(dir, kTabletMetadataFile); } +std::string LastSnapshotHybridTimePath( + const std::string& top_dir, SnapshotScheduleId schedule_id, HybridTime time) { + return JoinPathSegments( + top_dir, Format("$0$1.$2", kLastSnapshotPrefix, schedule_id, time.ToUint64())); +} + +void CleanupLastSnapshotHybridTime( + Env& env, const std::string& top_dir, SnapshotScheduleId schedule_id, HybridTime time) { + auto path = LastSnapshotHybridTimePath(top_dir, schedule_id, time); + VLOG(2) << "Cleanup snapshot ht: " << path; + if (env.FileExists(path)) { + WARN_NOT_OK(env.DeleteFile(path), "Failed to cleanup last snapshot time"); + } +} + } // namespace struct TabletSnapshots::RestoreMetadata { @@ -93,6 +113,52 @@ struct TabletSnapshots::ColocatedTableMetadata { TabletSnapshots::TabletSnapshots(Tablet* tablet) : TabletComponent(tablet) {} +Status TabletSnapshots::Open() { + auto dir = metadata().snapshots_dir(); + if (!env().DirExists(dir)) { + return Status::OK(); + } + std::vector> last_snapshot_ht_list; + for (const auto& child : VERIFY_RESULT(env().GetChildren(dir, ExcludeDots::kTrue))) { + if (!child.starts_with(kLastSnapshotPrefix)) { + continue; + } + auto pos = child.find('.', kLastSnapshotPrefix.size()); + if (pos == std::string::npos) { + LOG_WITH_PREFIX(DFATAL) << "Wrong last snapshot file name: " << child; + continue; + } + auto snapshot_id_str = child.substr( + kLastSnapshotPrefix.size(), pos - kLastSnapshotPrefix.size()); + auto hybrid_time_str = child.substr(pos + 1); + auto snapshot_id = SnapshotScheduleIdFromString(snapshot_id_str); + if (!snapshot_id.ok()) { + LOG_WITH_PREFIX(DFATAL) << "Wrong snapshot id in " << child << ": " << snapshot_id.status(); + continue; + } + auto hybrid_time = CheckedStoll(hybrid_time_str); + if (!hybrid_time.ok()) { + LOG_WITH_PREFIX(DFATAL) << "Wrong hybrid time in " << child << ": " << hybrid_time.status(); + continue; + } + last_snapshot_ht_list.emplace_back(*snapshot_id, *hybrid_time); + } + std::sort(last_snapshot_ht_list.begin(), last_snapshot_ht_list.end()); + std::lock_guard lock(last_snapshot_ht_mutex_); + for (auto it = last_snapshot_ht_list.begin(); it != last_snapshot_ht_list.end();) { + auto next = it; + ++next; + if (next != last_snapshot_ht_list.end() && it->first == next->first) { + CleanupLastSnapshotHybridTime(env(), dir, it->first, it->second); + } else { + last_snapshot_ht_.emplace(it->first, it->second); + } + it = next; + } + VLOG_WITH_PREFIX_AND_FUNC(1) << "last snapshot ht: " << AsString(last_snapshot_ht_); + return Status::OK(); +} + std::string TabletSnapshots::SnapshotsDirName(const std::string& rocksdb_dir) { return docdb::GetStorageDir(rocksdb_dir, kSnapshotsDirName); } @@ -118,6 +184,11 @@ Status TabletSnapshots::Create(SnapshotOperation* operation) { Status TabletSnapshots::Create(const CreateSnapshotData& data) { LongOperationTracker long_operation_tracker("Create snapshot", 5s); + if (RandomActWithProbability(FLAGS_TEST_delay_create_snapshot_probability)) { + LOG_WITH_PREFIX_AND_FUNC(INFO) << "TEST: Sleep"; + std::this_thread::sleep_for(1s); + } + ScopedRWOperation scoped_read_operation(&pending_op_counter_blocking_rocksdb_shutdown_start()); RETURN_NOT_OK(scoped_read_operation); @@ -129,7 +200,7 @@ Status TabletSnapshots::Create(const CreateSnapshotData& data) { const std::string& snapshot_dir = data.snapshot_dir; - Env* const env = metadata().fs_manager()->env(); + Env& env = this->env(); auto snapshot_hybrid_time = data.snapshot_hybrid_time; auto is_transactional_snapshot = snapshot_hybrid_time.is_valid(); @@ -147,12 +218,12 @@ Status TabletSnapshots::Create(const CreateSnapshotData& data) { bool exit_on_failure = true; // Delete snapshot (RocksDB checkpoint) directories on exit. auto se = ScopeExit( - [this, env, &exit_on_failure, &snapshot_dir, &tmp_snapshot_dir, &top_snapshots_dir] { + [this, &env, &exit_on_failure, &snapshot_dir, &tmp_snapshot_dir, &top_snapshots_dir] { bool do_sync = false; - if (env->FileExists(tmp_snapshot_dir)) { + if (env.FileExists(tmp_snapshot_dir)) { do_sync = true; - const Status deletion_status = env->DeleteRecursively(tmp_snapshot_dir); + const Status deletion_status = env.DeleteRecursively(tmp_snapshot_dir); if (PREDICT_FALSE(!deletion_status.ok())) { LOG_WITH_PREFIX(WARNING) << "Cannot recursively delete temp snapshot dir " @@ -160,9 +231,9 @@ Status TabletSnapshots::Create(const CreateSnapshotData& data) { } } - if (exit_on_failure && env->FileExists(snapshot_dir)) { + if (exit_on_failure && env.FileExists(snapshot_dir)) { do_sync = true; - const Status deletion_status = env->DeleteRecursively(snapshot_dir); + const Status deletion_status = env.DeleteRecursively(snapshot_dir); if (PREDICT_FALSE(!deletion_status.ok())) { LOG_WITH_PREFIX(WARNING) << "Cannot recursively delete snapshot dir " << snapshot_dir << ": " << deletion_status; @@ -170,7 +241,7 @@ Status TabletSnapshots::Create(const CreateSnapshotData& data) { } if (do_sync) { - const Status sync_status = env->SyncDir(top_snapshots_dir); + const Status sync_status = env.SyncDir(top_snapshots_dir); if (PREDICT_FALSE(!sync_status.ok())) { LOG_WITH_PREFIX(WARNING) << "Cannot sync top snapshots dir " << top_snapshots_dir << ": " << sync_status; @@ -204,10 +275,10 @@ Status TabletSnapshots::Create(const CreateSnapshotData& data) { RETURN_NOT_OK(tablet().metadata()->SaveTo(TabletMetadataFile(tmp_snapshot_dir))); RETURN_NOT_OK_PREPEND( - env->RenameFile(tmp_snapshot_dir, snapshot_dir), + env.RenameFile(tmp_snapshot_dir, snapshot_dir), Format("Cannot rename temp snapshot dir $0 to $1", tmp_snapshot_dir, snapshot_dir)); RETURN_NOT_OK_PREPEND( - env->SyncDir(top_snapshots_dir), + env.SyncDir(top_snapshots_dir), Format("Cannot sync top snapshots dir $0", top_snapshots_dir)); if (need_flush) { @@ -217,6 +288,23 @@ Status TabletSnapshots::Create(const CreateSnapshotData& data) { LOG_WITH_PREFIX(INFO) << "Complete snapshot creation in folder: " << snapshot_dir << ", snapshot hybrid time: " << snapshot_hybrid_time; + if (data.schedule_id) { + auto path = LastSnapshotHybridTimePath( + top_snapshots_dir, data.schedule_id, data.snapshot_hybrid_time); + RETURN_NOT_OK(WriteStringToFile(&env, Slice(), path)); + HybridTime previous_last_snapshot_ht; + { + std::lock_guard lock(last_snapshot_ht_mutex_); + auto& existing = last_snapshot_ht_[data.schedule_id]; + previous_last_snapshot_ht = existing; + existing = data.snapshot_hybrid_time; + } + if (previous_last_snapshot_ht) { + CleanupLastSnapshotHybridTime( + env, top_snapshots_dir, data.schedule_id, previous_last_snapshot_ht); + } + } + exit_on_failure = false; return Status::OK(); } @@ -640,6 +728,23 @@ Status TabletSnapshots::RestoreFinished(SnapshotOperation* operation) { HybridTime::FromPB(operation->request()->restoration_hybrid_time())); } +HybridTime TabletSnapshots::AllowedHistoryCutoff() { + auto schedules = metadata().SnapshotSchedules(); + if (schedules.empty()) { + return HybridTime::kMax; + } + auto result = HybridTime::kMax; + std::lock_guard lock(last_snapshot_ht_mutex_); + VLOG_WITH_PREFIX_AND_FUNC(4) + << "schedules: " << AsString(schedules) << ", last snapshot ht: " + << AsString(last_snapshot_ht_); + for (const auto& schedule : schedules) { + auto it = last_snapshot_ht_.find(schedule); + result.MakeAtMost(it != last_snapshot_ht_.end() ? it->second : HybridTime::kMin); + } + return result; +} + Result TabletRestorePatch::ShouldSkipEntry(const Slice& key, const Slice& value) { KeyBuffer key_copy; key_copy = key; diff --git a/src/yb/tablet/tablet_snapshots.h b/src/yb/tablet/tablet_snapshots.h index 69ee2abb53c1..9fc2e2d8d47b 100644 --- a/src/yb/tablet/tablet_snapshots.h +++ b/src/yb/tablet/tablet_snapshots.h @@ -56,6 +56,8 @@ class TabletSnapshots : public TabletComponent { public: explicit TabletSnapshots(Tablet* tablet); + Status Open(); + // Create snapshot for this tablet. Status Create(SnapshotOperation* operation); @@ -90,6 +92,8 @@ class TabletSnapshots : public TabletComponent { Status CreateDirectories(const std::string& rocksdb_dir, FsManager* fs); + HybridTime AllowedHistoryCutoff(); + static std::string SnapshotsDirName(const std::string& rocksdb_dir); static bool IsTempSnapshotDir(const std::string& dir); @@ -122,6 +126,10 @@ class TabletSnapshots : public TabletComponent { const std::string& dir, CreateIntentsCheckpointIn create_intents_checkpoint_in); std::string TEST_last_rocksdb_checkpoint_dir_; + + std::mutex last_snapshot_ht_mutex_; + std::unordered_map last_snapshot_ht_ + GUARDED_BY(last_snapshot_ht_mutex_); }; struct SequencesDataInfo { diff --git a/src/yb/tools/yb-admin-snapshot-schedule-test.cc b/src/yb/tools/yb-admin-snapshot-schedule-test.cc index 54ee3fe49130..e43542d02501 100644 --- a/src/yb/tools/yb-admin-snapshot-schedule-test.cc +++ b/src/yb/tools/yb-admin-snapshot-schedule-test.cc @@ -608,6 +608,7 @@ class YbAdminSnapshotScheduleTestWithYsqlAndPackedRow : public YbAdminSnapshotSc YbAdminSnapshotScheduleTestWithYsql::UpdateMiniClusterOptions(opts); opts->extra_tserver_flags.emplace_back("--ysql_enable_packed_row=true"); opts->extra_tserver_flags.emplace_back("--timestamp_history_retention_interval_sec=0"); + opts->extra_tserver_flags.emplace_back("--TEST_delay_create_snapshot_probability=0.25"); opts->extra_master_flags.emplace_back("--ysql_enable_packed_row=true"); opts->extra_master_flags.emplace_back("--timestamp_history_retention_interval_sec=0"); } diff --git a/src/yb/tserver/ts_tablet_manager.cc b/src/yb/tserver/ts_tablet_manager.cc index 8f7903c2966b..6b7f830d3561 100644 --- a/src/yb/tserver/ts_tablet_manager.cc +++ b/src/yb/tserver/ts_tablet_manager.cc @@ -3260,8 +3260,7 @@ Status TSTabletManager::UpdateSnapshotsInfo(const master::TSSnapshotsInfoPB& inf return Status::OK(); } -docdb::HistoryCutoff TSTabletManager::AllowedHistoryCutoff( - tablet::RaftGroupMetadata* metadata) { +docdb::HistoryCutoff TSTabletManager::AllowedHistoryCutoff(tablet::RaftGroupMetadata* metadata) { HybridTime result = HybridTime::kMax; // CDC SDK safe time if (metadata->cdc_sdk_safe_time() != HybridTime::kInvalid) { @@ -3287,25 +3286,16 @@ docdb::HistoryCutoff TSTabletManager::AllowedHistoryCutoff( result.MakeAtMost(*opt_xcluster_safe_time); } + // This logic in required to cleanup snapshot schedules that were removed at master. auto schedules = metadata->SnapshotSchedules(); if (!schedules.empty()) { std::vector schedules_to_remove; - auto se = ScopeExit([&schedules_to_remove, metadata]() { - if (schedules_to_remove.empty()) { - return; - } - bool any_removed = false; - for (const auto& schedule_id : schedules_to_remove) { - any_removed = metadata->RemoveSnapshotSchedule(schedule_id) || any_removed; - } - if (any_removed) { - WARN_NOT_OK(metadata->Flush(), "Failed to flush metadata"); - } - }); - std::lock_guard lock(snapshot_schedule_allowed_history_cutoff_mutex_); - for (const auto& schedule_id : schedules) { - auto it = snapshot_schedule_allowed_history_cutoff_.find(schedule_id); - if (it == snapshot_schedule_allowed_history_cutoff_.end()) { + { + std::lock_guard lock(snapshot_schedule_allowed_history_cutoff_mutex_); + for (const auto& schedule_id : schedules) { + if (snapshot_schedule_allowed_history_cutoff_.contains(schedule_id)) { + continue; + } // We don't know this schedule. auto emplace_result = missing_snapshot_schedules_.emplace(schedule_id, snapshot_schedules_version_); @@ -3317,18 +3307,18 @@ docdb::HistoryCutoff TSTabletManager::AllowedHistoryCutoff( // One round is not enough, because schedule could be added after heartbeat processed on // master, but response not yet received on TServer. schedules_to_remove.push_back(schedule_id); - continue; } - return { HybridTime::kInvalid, HybridTime::kMin }; } - if (!it->second) { - // Schedules does not have snapshots yet. - return { HybridTime::kInvalid, HybridTime::kMin }; - } - result.MakeAtMost(it->second); + } + bool any_removed = false; + for (const auto& schedule_id : schedules_to_remove) { + any_removed = metadata->RemoveSnapshotSchedule(schedule_id) || any_removed; + } + if (any_removed) { + WARN_NOT_OK(metadata->Flush(), "Failed to flush metadata"); } } - VLOG(1) << "Setting the allowed historycutoff: " << result + VLOG(1) << "Setting the allowed history cutoff: " << result << " for tablet: " << metadata->raft_group_id(); return { HybridTime::kInvalid, result }; }