Skip to content

Commit

Permalink
Implement obsolete file deletion (GC) in follower (#12657)
Browse files Browse the repository at this point in the history
Summary:
This PR implements deletion of obsolete files in a follower RocksDB instance. The follower tails the leader's MANIFEST and creates links to newly added SST files. These links need to be deleted once those files become obsolete in order to reclaim space. There are three cases to be considered -
1. New files added and links created, but the Version could not be installed due to some missing files. Those links need to be preserved so a subsequent catch up attempt can succeed. We insert the next file number in the `VersionSet` to `pending_outputs_` to prevent their deletion.
2. Files deleted from the previous successfully installed `Version`. These are deleted as usual in `PurgeObsoleteFiles`.
3. New files added by a `VersionEdit` and deleted by a subsequent `VersionEdit`, both processed in the same catchup attempt. Links will be created for the new files when verifying a candidate `Version`. Those need to be deleted explicitly as they're never added to `VersionStorageInfo`, and thus not deleted by `PurgeObsoleteFiles`.

Test plan -
New unit tests in `db_follower_test`.

Pull Request resolved: #12657

Reviewed By: jowlyzhang

Differential Revision: D57462697

Pulled By: anand1976

fbshipit-source-id: 898f15570638dd4930f839ffd31c560f9cb73916
  • Loading branch information
anand1976 authored and facebook-github-bot committed May 18, 2024
1 parent ffd7930 commit 0ed9355
Show file tree
Hide file tree
Showing 12 changed files with 614 additions and 28 deletions.
468 changes: 466 additions & 2 deletions db/db_follower_test.cc

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions db/db_impl/db_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1648,6 +1648,7 @@ class DBImpl : public DB {
friend class ForwardIterator;
friend struct SuperVersion;
friend class CompactedDBImpl;
friend class DBImplFollower;
#ifndef NDEBUG
friend class DBTest_ConcurrentFlushWAL_Test;
friend class DBTest_MixedSlowdownOptionsStop_Test;
Expand Down
39 changes: 38 additions & 1 deletion db/db_impl/db_impl_follower.cc
Original file line number Diff line number Diff line change
Expand Up @@ -95,17 +95,28 @@ Status DBImplFollower::TryCatchUpWithLeader() {
assert(versions_.get() != nullptr);
assert(manifest_reader_.get() != nullptr);
Status s;

TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:Begin1");
TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:Begin2");
// read the manifest and apply new changes to the follower instance
std::unordered_set<ColumnFamilyData*> cfds_changed;
JobContext job_context(0, true /*create_superversion*/);
{
InstrumentedMutexLock lock_guard(&mutex_);
std::vector<std::string> files_to_delete;
s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
->ReadAndApply(&mutex_, &manifest_reader_,
manifest_reader_status_.get(), &cfds_changed);
manifest_reader_status_.get(), &cfds_changed,
&files_to_delete);
ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem_);
pending_outputs_inserted_elem_.reset(new std::list<uint64_t>::iterator(
CaptureCurrentFileNumberInPendingOutputs()));

ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
static_cast<uint64_t>(versions_->LastSequence()));
ROCKS_LOG_INFO(
immutable_db_options_.info_log, "Next file number is %" PRIu64,
static_cast<uint64_t>(versions_->current_next_file_number()));
for (ColumnFamilyData* cfd : cfds_changed) {
if (cfd->IsDropped()) {
ROCKS_LOG_DEBUG(immutable_db_options_.info_log, "[%s] is dropped\n",
Expand Down Expand Up @@ -147,9 +158,33 @@ Status DBImplFollower::TryCatchUpWithLeader() {
sv_context.NewSuperVersion();
}
}

for (auto& file : files_to_delete) {
IOStatus io_s = fs_->DeleteFile(file, IOOptions(), nullptr);
if (!io_s.ok()) {
ROCKS_LOG_INFO(immutable_db_options_.info_log,
"Cannot delete file %s: %s", file.c_str(),
io_s.ToString().c_str());
}
}
}
job_context.Clean();

// Cleanup unused, obsolete files.
JobContext purge_files_job_context(0);
{
InstrumentedMutexLock lock_guard(&mutex_);
// Currently, follower instance does not create any database files, thus
// is unnecessary for the follower to force full scan.
FindObsoleteFiles(&purge_files_job_context, /*force=*/false);
}
if (purge_files_job_context.HaveSomethingToDelete()) {
PurgeObsoleteFiles(purge_files_job_context);
}
purge_files_job_context.Clean();

TEST_SYNC_POINT("DBImplFollower::TryCatchupWithLeader:End");

return s;
}

Expand Down Expand Up @@ -199,6 +234,8 @@ Status DBImplFollower::Close() {
catch_up_thread_.reset();
}

ReleaseFileNumberFromPendingOutputs(pending_outputs_inserted_elem_);

return DBImpl::Close();
}

Expand Down
3 changes: 2 additions & 1 deletion db/db_impl/db_impl_follower.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class DBImplFollower : public DBImplSecondary {
bool OwnTablesAndLogs() const override {
// TODO: Change this to true once we've properly implemented file
// deletion for the read scaling case
return false;
return true;
}

Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
Expand All @@ -49,5 +49,6 @@ class DBImplFollower : public DBImplSecondary {
std::string src_path_;
port::Mutex mu_;
port::CondVar cv_;
std::unique_ptr<std::list<uint64_t>::iterator> pending_outputs_inserted_elem_;
};
} // namespace ROCKSDB_NAMESPACE
3 changes: 2 additions & 1 deletion db/db_impl/db_impl_secondary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -680,7 +680,8 @@ Status DBImplSecondary::TryCatchUpWithPrimary() {
InstrumentedMutexLock lock_guard(&mutex_);
s = static_cast_with_check<ReactiveVersionSet>(versions_.get())
->ReadAndApply(&mutex_, &manifest_reader_,
manifest_reader_status_.get(), &cfds_changed);
manifest_reader_status_.get(), &cfds_changed,
/*files_to_delete=*/nullptr);

ROCKS_LOG_INFO(immutable_db_options_.info_log, "Last sequence is %" PRIu64,
static_cast<uint64_t>(versions_->LastSequence()));
Expand Down
44 changes: 39 additions & 5 deletions db/version_edit_handler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -152,15 +152,15 @@ Status FileChecksumRetriever::ApplyVersionEdit(VersionEdit& edit,

VersionEditHandler::VersionEditHandler(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, bool track_missing_files,
VersionSet* version_set, bool track_found_and_missing_files,
bool no_error_if_files_missing, const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, bool skip_load_table_files,
EpochNumberRequirement epoch_number_requirement)
: VersionEditHandlerBase(read_options),
read_only_(read_only),
column_families_(std::move(column_families)),
version_set_(version_set),
track_missing_files_(track_missing_files),
track_found_and_missing_files_(track_found_and_missing_files),
no_error_if_files_missing_(no_error_if_files_missing),
io_tracer_(io_tracer),
skip_load_table_files_(skip_load_table_files),
Expand Down Expand Up @@ -500,7 +500,8 @@ ColumnFamilyData* VersionEditHandler::CreateCfAndInit(
assert(builders_.find(cf_id) == builders_.end());
builders_.emplace(cf_id,
VersionBuilderUPtr(new BaseReferencedVersionBuilder(cfd)));
if (track_missing_files_) {
if (track_found_and_missing_files_) {
cf_to_found_files_.emplace(cf_id, std::unordered_set<uint64_t>());
cf_to_missing_files_.emplace(cf_id, std::unordered_set<uint64_t>());
cf_to_missing_blob_files_high_.emplace(cf_id, kInvalidBlobFileNumber);
}
Expand All @@ -513,7 +514,11 @@ ColumnFamilyData* VersionEditHandler::DestroyCfAndCleanup(
auto builder_iter = builders_.find(cf_id);
assert(builder_iter != builders_.end());
builders_.erase(builder_iter);
if (track_missing_files_) {
if (track_found_and_missing_files_) {
auto found_files_iter = cf_to_found_files_.find(cf_id);
assert(found_files_iter != cf_to_found_files_.end());
cf_to_found_files_.erase(found_files_iter);

auto missing_files_iter = cf_to_missing_files_.find(cf_id);
assert(missing_files_iter != cf_to_missing_files_.end());
cf_to_missing_files_.erase(missing_files_iter);
Expand Down Expand Up @@ -729,7 +734,7 @@ VersionEditHandlerPointInTime::VersionEditHandlerPointInTime(
const ReadOptions& read_options,
EpochNumberRequirement epoch_number_requirement)
: VersionEditHandler(read_only, column_families, version_set,
/*track_missing_files=*/true,
/*track_found_and_missing_files=*/true,
/*no_error_if_files_missing=*/true, io_tracer,
read_options, epoch_number_requirement) {}

Expand Down Expand Up @@ -824,6 +829,12 @@ void VersionEditHandlerPointInTime::CheckIterationResult(

version_set_->AppendVersion(cfd, v_iter->second);
versions_.erase(v_iter);
// Let's clear found_files, since any files in that are part of the
// installed Version. Any files that got obsoleted would have already
// been moved to intermediate_files_
auto found_files_iter = cf_to_found_files_.find(cfd->GetID());
assert(found_files_iter != cf_to_found_files_.end());
found_files_iter->second.clear();
}
}
} else {
Expand Down Expand Up @@ -854,10 +865,16 @@ ColumnFamilyData* VersionEditHandlerPointInTime::DestroyCfAndCleanup(

Status VersionEditHandlerPointInTime::MaybeCreateVersion(
const VersionEdit& edit, ColumnFamilyData* cfd, bool force_create_version) {
TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin1");
TEST_SYNC_POINT("VersionEditHandlerPointInTime::MaybeCreateVersion:Begin2");
assert(cfd != nullptr);
if (!force_create_version) {
assert(edit.GetColumnFamily() == cfd->GetID());
}
auto found_files_iter = cf_to_found_files_.find(cfd->GetID());
assert(found_files_iter != cf_to_found_files_.end());
std::unordered_set<uint64_t>& found_files = found_files_iter->second;

auto missing_files_iter = cf_to_missing_files_.find(cfd->GetID());
assert(missing_files_iter != cf_to_missing_files_.end());
std::unordered_set<uint64_t>& missing_files = missing_files_iter->second;
Expand Down Expand Up @@ -889,6 +906,18 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
auto fiter = missing_files.find(file_num);
if (fiter != missing_files.end()) {
missing_files.erase(fiter);
} else {
fiter = found_files.find(file_num);
// Only mark new files added during this catchup attempt for deletion.
// These files were never installed in VersionStorageInfo.
// Already referenced files that are deleted by a VersionEdit will
// be added to the VersionStorageInfo's obsolete files when the old
// version is dereferenced.
if (fiter != found_files.end()) {
intermediate_files_.emplace_back(
MakeTableFileName(cfd->ioptions()->cf_paths[0].path, file_num));
found_files.erase(fiter);
}
}
}

Expand All @@ -904,9 +933,14 @@ Status VersionEditHandlerPointInTime::MaybeCreateVersion(
s = VerifyFile(cfd, fpath, level, meta);
if (s.IsPathNotFound() || s.IsNotFound() || s.IsCorruption()) {
missing_files.insert(file_num);
if (s.IsCorruption()) {
found_files.insert(file_num);
}
s = Status::OK();
} else if (!s.ok()) {
break;
} else {
found_files.insert(file_num);
}
}

Expand Down
26 changes: 17 additions & 9 deletions db/version_edit_handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ using VersionBuilderUPtr = std::unique_ptr<BaseReferencedVersionBuilder>;
// To use this class and its subclasses,
// 1. Create an object of VersionEditHandler or its subclasses.
// VersionEditHandler handler(read_only, column_families, version_set,
// track_missing_files,
// track_found_and_missing_files,
// no_error_if_files_missing);
// 2. Status s = handler.Iterate(reader, &db_id);
// 3. Check s and handle possible errors.
Expand All @@ -116,16 +116,17 @@ class VersionEditHandler : public VersionEditHandlerBase {
explicit VersionEditHandler(
bool read_only,
const std::vector<ColumnFamilyDescriptor>& column_families,
VersionSet* version_set, bool track_missing_files,
VersionSet* version_set, bool track_found_and_missing_files,
bool no_error_if_files_missing,
const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options,
EpochNumberRequirement epoch_number_requirement =
EpochNumberRequirement::kMustPresent)
: VersionEditHandler(
read_only, column_families, version_set, track_missing_files,
no_error_if_files_missing, io_tracer, read_options,
/*skip_load_table_files=*/false, epoch_number_requirement) {}
: VersionEditHandler(read_only, column_families, version_set,
track_found_and_missing_files,
no_error_if_files_missing, io_tracer, read_options,
/*skip_load_table_files=*/false,
epoch_number_requirement) {}

~VersionEditHandler() override {}

Expand All @@ -144,7 +145,7 @@ class VersionEditHandler : public VersionEditHandlerBase {
protected:
explicit VersionEditHandler(
bool read_only, std::vector<ColumnFamilyDescriptor> column_families,
VersionSet* version_set, bool track_missing_files,
VersionSet* version_set, bool track_found_and_missing_files,
bool no_error_if_files_missing,
const std::shared_ptr<IOTracer>& io_tracer,
const ReadOptions& read_options, bool skip_load_table_files,
Expand Down Expand Up @@ -195,7 +196,8 @@ class VersionEditHandler : public VersionEditHandlerBase {
// by subsequent manifest records, Recover() will return failure status.
std::unordered_map<uint32_t, std::string> column_families_not_found_;
VersionEditParams version_edit_params_;
const bool track_missing_files_;
const bool track_found_and_missing_files_;
std::unordered_map<uint32_t, std::unordered_set<uint64_t>> cf_to_found_files_;
std::unordered_map<uint32_t, std::unordered_set<uint64_t>>
cf_to_missing_files_;
std::unordered_map<uint32_t, uint64_t> cf_to_missing_blob_files_high_;
Expand Down Expand Up @@ -273,6 +275,8 @@ class VersionEditHandlerPointInTime : public VersionEditHandler {

bool in_atomic_group_ = false;

std::vector<std::string> intermediate_files_;

private:
bool AtomicUpdateVersionsCompleted();
bool AtomicUpdateVersionsContains(uint32_t cfid);
Expand Down Expand Up @@ -310,6 +314,10 @@ class ManifestTailer : public VersionEditHandlerPointInTime {
return cfds_changed_;
}

std::vector<std::string>& GetIntermediateFiles() {
return intermediate_files_;
}

protected:
Status Initialize() override;

Expand Down Expand Up @@ -342,7 +350,7 @@ class DumpManifestHandler : public VersionEditHandler {
bool json)
: VersionEditHandler(
/*read_only=*/true, column_families, version_set,
/*track_missing_files=*/false,
/*track_found_and_missing_files=*/false,
/*no_error_if_files_missing=*/false, io_tracer, read_options,
/*skip_load_table_files=*/true),
verbose_(verbose),
Expand Down
10 changes: 7 additions & 3 deletions db/version_set.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6063,8 +6063,8 @@ Status VersionSet::Recover(
true /* checksum */, 0 /* log_number */);
VersionEditHandler handler(
read_only, column_families, const_cast<VersionSet*>(this),
/*track_missing_files=*/false, no_error_if_files_missing, io_tracer_,
read_options, EpochNumberRequirement::kMightMissing);
/*track_found_and_missing_files=*/false, no_error_if_files_missing,
io_tracer_, read_options, EpochNumberRequirement::kMightMissing);
handler.Iterate(reader, &log_read_status);
s = handler.status();
if (s.ok()) {
Expand Down Expand Up @@ -7439,7 +7439,8 @@ Status ReactiveVersionSet::ReadAndApply(
InstrumentedMutex* mu,
std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
Status* manifest_read_status,
std::unordered_set<ColumnFamilyData*>* cfds_changed) {
std::unordered_set<ColumnFamilyData*>* cfds_changed,
std::vector<std::string>* files_to_delete) {
assert(manifest_reader != nullptr);
assert(cfds_changed != nullptr);
mu->AssertHeld();
Expand All @@ -7456,6 +7457,9 @@ Status ReactiveVersionSet::ReadAndApply(
if (s.ok()) {
*cfds_changed = std::move(manifest_tailer_->GetUpdatedColumnFamilies());
}
if (files_to_delete) {
*files_to_delete = std::move(manifest_tailer_->GetIntermediateFiles());
}

return s;
}
Expand Down
3 changes: 2 additions & 1 deletion db/version_set.h
Original file line number Diff line number Diff line change
Expand Up @@ -1741,7 +1741,8 @@ class ReactiveVersionSet : public VersionSet {
InstrumentedMutex* mu,
std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
Status* manifest_read_status,
std::unordered_set<ColumnFamilyData*>* cfds_changed);
std::unordered_set<ColumnFamilyData*>* cfds_changed,
std::vector<std::string>* files_to_delete);

Status Recover(const std::vector<ColumnFamilyDescriptor>& column_families,
std::unique_ptr<log::FragmentBufferedReader>* manifest_reader,
Expand Down
15 changes: 10 additions & 5 deletions db/version_set_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2742,7 +2742,8 @@ TEST_F(VersionSetAtomicGroupTest,
std::unordered_set<ColumnFamilyData*> cfds_changed;
mu.Lock();
EXPECT_OK(reactive_versions_->ReadAndApply(
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed,
/*files_to_delete=*/nullptr));
mu.Unlock();
EXPECT_TRUE(first_in_atomic_group_);
EXPECT_TRUE(last_in_atomic_group_);
Expand Down Expand Up @@ -2797,7 +2798,8 @@ TEST_F(VersionSetAtomicGroupTest,
std::unordered_set<ColumnFamilyData*> cfds_changed;
mu.Lock();
EXPECT_OK(reactive_versions_->ReadAndApply(
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed,
/*files_to_delete=*/nullptr));
mu.Unlock();
// Reactive version set should be empty now.
EXPECT_TRUE(reactive_versions_->TEST_read_edits_in_atomic_group() == 0);
Expand Down Expand Up @@ -2826,7 +2828,8 @@ TEST_F(VersionSetAtomicGroupTest,
std::unordered_set<ColumnFamilyData*> cfds_changed;
mu.Lock();
EXPECT_OK(reactive_versions_->ReadAndApply(
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed,
/*files_to_delete=*/nullptr));
mu.Unlock();
EXPECT_TRUE(first_in_atomic_group_);
EXPECT_FALSE(last_in_atomic_group_);
Expand Down Expand Up @@ -2882,7 +2885,8 @@ TEST_F(VersionSetAtomicGroupTest,
AddNewEditsToLog(kAtomicGroupSize);
mu.Lock();
EXPECT_NOK(reactive_versions_->ReadAndApply(
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed,
/*files_to_delete=*/nullptr));
mu.Unlock();
EXPECT_EQ(edits_[kAtomicGroupSize / 2].DebugString(),
corrupted_edit_.DebugString());
Expand Down Expand Up @@ -2932,7 +2936,8 @@ TEST_F(VersionSetAtomicGroupTest,
AddNewEditsToLog(kAtomicGroupSize);
mu.Lock();
EXPECT_NOK(reactive_versions_->ReadAndApply(
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed));
&mu, &manifest_reader, manifest_reader_status.get(), &cfds_changed,
/*files_to_delete=*/nullptr));
mu.Unlock();
EXPECT_EQ(edits_[1].DebugString(),
edit_with_incorrect_group_size_.DebugString());
Expand Down
Loading

0 comments on commit 0ed9355

Please sign in to comment.