From 54fecd96518b556d6e083542044dc6b17a263999 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Tue, 25 Jan 2022 10:53:14 +0800 Subject: [PATCH 01/23] add ColumnFileSetReader --- .../DeltaMerge/ColumnFile/ColumnFile.cpp | 16 +- .../DeltaMerge/ColumnFile/ColumnFile.h | 13 +- .../DeltaMerge/ColumnFile/ColumnFileBig.cpp | 18 +- .../DeltaMerge/ColumnFile/ColumnFileBig.h | 7 +- .../ColumnFile/ColumnFileDeleteRange.h | 2 +- .../ColumnFile/ColumnFileInMemory.cpp | 14 +- .../ColumnFile/ColumnFileInMemory.h | 13 +- .../ColumnFile/ColumnFilePersisted.cpp | 4 +- .../ColumnFile/ColumnFilePersisted.h | 19 +- .../ColumnFile/ColumnFileSetReader.cpp | 266 ++++++++++++++++++ .../ColumnFile/ColumnFileSetReader.h | 104 +++++++ .../ColumnFile/ColumnFileSetSnapshot.cpp | 20 ++ .../ColumnFile/ColumnFileSetSnapshot.h | 92 ++++++ .../DeltaMerge/ColumnFile/ColumnFileTiny.cpp | 16 +- .../DeltaMerge/ColumnFile/ColumnFileTiny.h | 11 +- .../DeltaMerge/ColumnFile/ColumnFile_V2.cpp | 22 +- .../DeltaMerge/ColumnFile/ColumnFile_V3.cpp | 4 +- 17 files changed, 556 insertions(+), 85 deletions(-) create mode 100644 dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp create mode 100644 dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h create mode 100644 dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp create mode 100644 dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.h diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp index 277d5dc293d..4d80e18ef95 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -82,20 +83,20 @@ ColumnFileBig * ColumnFile::tryToBigFile() return !isBigFile() ? 
nullptr : static_cast(this); } -String columnFilesToString(const ColumnFiles & column_files) +template +String columnFilesToString(const T & column_files) { String column_files_info = "["; for (const auto & f : column_files) { if (f->isInMemoryFile()) - column_files_info += "B_" + DB::toString(f->getRows()); + column_files_info += "M_" + DB::toString(f->getRows()) + ","; else if (f->isTinyFile()) - column_files_info += "B_" + DB::toString(f->getRows()); + column_files_info += "T_" + DB::toString(f->getRows()) + ","; else if (f->isBigFile()) - column_files_info += "F_" + DB::toString(f->getRows()); + column_files_info += "F_" + DB::toString(f->getRows()) + ","; else if (auto * f_delete = f->tryToDeleteRange(); f_delete) - column_files_info += "D_" + f_delete->getDeleteRange().toString(); - column_files_info += (f->isSaved() ? "_S," : "_N,"); + column_files_info += "D_" + f_delete->getDeleteRange().toString() + ","; } if (!column_files.empty()) @@ -104,5 +105,8 @@ String columnFilesToString(const ColumnFiles & column_files) return column_files_info; } +template String columnFilesToString(const ColumnFiles & column_files); +template String columnFilesToString(const ColumnFilePersisteds & column_files); + } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.h index 386fe95fb0b..fed7e50f214 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.h @@ -75,10 +75,6 @@ class ColumnFile public: /// This id is only used to to do equal check in DeltaValueSpace::checkHeadAndCloneTail. UInt64 getId() const { return id; } - /// This column file is already saved to disk or not. Only saved packs can be recovered after reboot. - /// "saved" can only be true, after the content data and the metadata are all written to disk. 
- bool isSaved() const { return saved; } - void setSaved() { saved = true; } virtual size_t getRows() const { return 0; } virtual size_t getBytes() const { return 0; }; @@ -111,12 +107,6 @@ class ColumnFile throw Exception("Unsupported operation", ErrorCodes::LOGICAL_ERROR); } - /// Put the data's page id into the corresponding WriteBatch. - /// The actual remove will be done later. - virtual void removeData(WriteBatches &) const {}; - - virtual void serializeMetadata(WriteBuffer & buf, bool save_schema) const = 0; - virtual String toString() const = 0; }; @@ -152,6 +142,7 @@ size_t copyColumnsData( /// Debugging string -String columnFilesToString(const ColumnFiles & column_files); +template +String columnFilesToString(const T & column_files); } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp index 4e307a33ff0..e7ca6aa9d13 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp @@ -30,7 +30,7 @@ void ColumnFileBig::calculateStat(const DMContext & context) ColumnFileReaderPtr ColumnFileBig::getReader(const DMContext & context, const StorageSnapshotPtr & /*storage_snap*/, const ColumnDefinesPtr & col_defs) const { - return std::make_shared(context, *this, col_defs); + return std::make_shared(context, *this, col_defs); } void ColumnFileBig::serializeMetadata(WriteBuffer & buf, bool /*save_schema*/) const @@ -60,7 +60,7 @@ ColumnFilePtr ColumnFileBig::deserializeMetadata(DMContext & context, // return std::shared_ptr(dp_file); } -void ColumnBigFileReader::initStream() +void ColumnFileBigReader::initStream() { if (file_stream) return; @@ -94,7 +94,7 @@ void ColumnBigFileReader::initStream() } } -size_t ColumnBigFileReader::readRowsRepeatedly(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range) +size_t 
ColumnFileBigReader::readRowsRepeatedly(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range) { if (unlikely(rows_offset + rows_limit > column_file.valid_rows)) throw Exception("Try to read more rows", ErrorCodes::LOGICAL_ERROR); @@ -126,13 +126,13 @@ size_t ColumnBigFileReader::readRowsRepeatedly(MutableColumns & output_cols, siz return actual_read; } -size_t ColumnBigFileReader::readRowsOnce(MutableColumns & output_cols, // +size_t ColumnFileBigReader::readRowsOnce(MutableColumns & output_cols, // size_t rows_offset, size_t rows_limit, const RowKeyRange * range) { auto read_next_block = [&, this]() -> bool { - rows_before_cur_block += ((bool)cur_block) ? cur_block.rows() : 0; + rows_before_cur_block += (static_cast(cur_block)) ? cur_block.rows() : 0; cur_block_data.clear(); cur_block = file_stream->read(); @@ -186,7 +186,7 @@ size_t ColumnBigFileReader::readRowsOnce(MutableColumns & output_cols, // return actual_read; } -size_t ColumnBigFileReader::readRows(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range) +size_t ColumnFileBigReader::readRows(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range) { initStream(); @@ -204,17 +204,17 @@ size_t ColumnBigFileReader::readRows(MutableColumns & output_cols, size_t rows_o } } -Block ColumnBigFileReader::readNextBlock() +Block ColumnFileBigReader::readNextBlock() { initStream(); return file_stream->read(); } -ColumnFileReaderPtr ColumnBigFileReader::createNewReader(const ColumnDefinesPtr & new_col_defs) +ColumnFileReaderPtr ColumnFileBigReader::createNewReader(const ColumnDefinesPtr & new_col_defs) { // Currently we don't reuse the cache data. 
- return std::make_shared(context, column_file, new_col_defs); + return std::make_shared(context, column_file, new_col_defs); } } // namespace DM diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.h index fae37314401..716a55e76de 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.h @@ -15,7 +15,7 @@ using ColumnBigFilePtr = std::shared_ptr; /// A column file which contains a DMFile. The DMFile could have many Blocks. class ColumnFileBig : public ColumnFilePersisted { - friend class ColumnBigFileReader; + friend class ColumnFileBigReader; private: DMFilePtr file; @@ -79,12 +79,11 @@ class ColumnFileBig : public ColumnFilePersisted { String s = "{big_file,rows:" + DB::toString(getRows()) // + ",bytes:" + DB::toString(getBytes()) + "}"; // - +",saved:" + DB::toString(saved) + "}"; // return s; } }; -class ColumnBigFileReader : public ColumnFileReader +class ColumnFileBigReader : public ColumnFileReader { private: const DMContext & context; @@ -113,7 +112,7 @@ class ColumnBigFileReader : public ColumnFileReader size_t readRowsOnce(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range); public: - ColumnBigFileReader(const DMContext & context_, const ColumnFileBig & column_file_, const ColumnDefinesPtr & col_defs_) + ColumnFileBigReader(const DMContext & context_, const ColumnFileBig & column_file_, const ColumnDefinesPtr & col_defs_) : context(context_) , column_file(column_file_) , col_defs(col_defs_) diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.h index 0559a5dc22e..87d9c38dc39 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.h @@ -44,7 +44,7 @@ class ColumnFileDeleteRange : public 
ColumnFilePersisted static ColumnFilePtr deserializeMetadata(ReadBuffer & buf); - String toString() const override { return "{delete_range:" + delete_range.toString() + ", saved: " + DB::toString(saved) + "}"; } + String toString() const override { return "{delete_range:" + delete_range.toString() + "}"; } }; class ColumnFileEmptyReader : public ColumnFileReader diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.cpp index a92ec34d023..3804fd88c92 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.cpp @@ -44,7 +44,7 @@ void ColumnFileInMemory::fillColumns(const ColumnDefines & col_defs, size_t col_ ColumnFileReaderPtr ColumnFileInMemory::getReader(const DMContext & /*context*/, const StorageSnapshotPtr & /*storage_snap*/, const ColumnDefinesPtr & col_defs) const { - return std::make_shared(*this, col_defs); + return std::make_shared(*this, col_defs); } bool ColumnFileInMemory::append(DMContext & context, const Block & data, size_t offset, size_t limit, size_t data_bytes) @@ -85,19 +85,19 @@ Block ColumnFileInMemory::readDataForFlush() const } -ColumnPtr ColumnInMemoryFileReader::getPKColumn() +ColumnPtr ColumnFileInMemoryReader::getPKColumn() { memory_file.fillColumns(*col_defs, 1, cols_data_cache); return cols_data_cache[0]; } -ColumnPtr ColumnInMemoryFileReader::getVersionColumn() +ColumnPtr ColumnFileInMemoryReader::getVersionColumn() { memory_file.fillColumns(*col_defs, 2, cols_data_cache); return cols_data_cache[1]; } -size_t ColumnInMemoryFileReader::readRows(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range) +size_t ColumnFileInMemoryReader::readRows(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range) { memory_file.fillColumns(*col_defs, output_cols.size(), cols_data_cache); @@ -105,7 +105,7 @@ 
size_t ColumnInMemoryFileReader::readRows(MutableColumns & output_cols, size_t r return copyColumnsData(cols_data_cache, pk_col, output_cols, rows_offset, rows_limit, range); } -Block ColumnInMemoryFileReader::readNextBlock() +Block ColumnFileInMemoryReader::readNextBlock() { if (read_done) return {}; @@ -118,10 +118,10 @@ Block ColumnInMemoryFileReader::readNextBlock() return genBlock(*col_defs, columns); } -ColumnFileReaderPtr ColumnInMemoryFileReader::createNewReader(const ColumnDefinesPtr & new_col_defs) +ColumnFileReaderPtr ColumnFileInMemoryReader::createNewReader(const ColumnDefinesPtr & new_col_defs) { // Reuse the cache data. - return std::make_shared(memory_file, new_col_defs, cols_data_cache); + return std::make_shared(memory_file, new_col_defs, cols_data_cache); } } // namespace DM diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.h index 925f1f82ab0..cfd27f05f2e 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.h @@ -12,7 +12,7 @@ using ColumnInMemoryFilePtr = std::shared_ptr; /// A column file which is only resides in memory class ColumnFileInMemory : public ColumnFile { - friend class ColumnInMemoryFileReader; + friend class ColumnFileInMemoryReader; private: BlockPtr schema; @@ -80,11 +80,6 @@ class ColumnFileInMemory : public ColumnFile Block readDataForFlush() const; - void serializeMetadata(WriteBuffer & /*buf*/, bool /*save_schema*/) const override - { - throw Exception("Unsupported operation", ErrorCodes::LOGICAL_ERROR); - } - String toString() const override { String s = "{in_memory_file,rows:" + DB::toString(rows) // @@ -97,7 +92,7 @@ class ColumnFileInMemory : public ColumnFile }; -class ColumnInMemoryFileReader : public ColumnFileReader +class ColumnFileInMemoryReader : public ColumnFileReader { private: const ColumnFileInMemory & memory_file; @@ -107,7 +102,7 @@ class 
ColumnInMemoryFileReader : public ColumnFileReader bool read_done = false; public: - ColumnInMemoryFileReader(const ColumnFileInMemory & memory_file_, + ColumnFileInMemoryReader(const ColumnFileInMemory & memory_file_, const ColumnDefinesPtr & col_defs_, const Columns & cols_data_cache_) : memory_file(memory_file_) @@ -116,7 +111,7 @@ class ColumnInMemoryFileReader : public ColumnFileReader { } - ColumnInMemoryFileReader(const ColumnFileInMemory & memory_file_, const ColumnDefinesPtr & col_defs_) + ColumnFileInMemoryReader(const ColumnFileInMemory & memory_file_, const ColumnDefinesPtr & col_defs_) : memory_file(memory_file_) , col_defs(col_defs_) { diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp index 961e8823b0a..98f0d889342 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp @@ -75,7 +75,7 @@ void deserializeColumn(IColumn & column, const DataTypePtr & type, const ByteBuf {}); } -void serializeSavedColumnFiles(WriteBuffer & buf, const ColumnFiles & column_files) +void serializeSavedColumnFiles(WriteBuffer & buf, const ColumnFilePersisteds & column_files) { writeIntBinary(STORAGE_FORMAT_CURRENT.delta, buf); // Add binary version switch (STORAGE_FORMAT_CURRENT.delta) @@ -93,7 +93,7 @@ void serializeSavedColumnFiles(WriteBuffer & buf, const ColumnFiles & column_fil } } -ColumnFiles deserializeSavedColumnFiles(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf) +ColumnFilePersisteds deserializeSavedColumnFiles(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf) { // Check binary version DeltaFormat::Version version; diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.h index b02e4694cc9..75a45717a02 100644 --- 
a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.h @@ -10,10 +10,15 @@ class ColumnFilePersisted; using ColumnFilePersistedPtr = std::shared_ptr; using ColumnFilePersisteds = std::vector; -// TODO: move `removeData` and `serializeMetadata` into this class // represents ColumnFile that can be saved on disk class ColumnFilePersisted : public ColumnFile { +public: + /// Put the data's page id into the corresponding WriteBatch. + /// The actual remove will be done later. + virtual void removeData(WriteBatches &) const {}; + + virtual void serializeMetadata(WriteBuffer & buf, bool save_schema) const = 0; }; void serializeSchema(WriteBuffer & buf, const BlockPtr & schema); @@ -24,15 +29,15 @@ void deserializeColumn(IColumn & column, const DataTypePtr & type, const ByteBuf /// Serialize those packs' metadata into buf. /// Note that this method stop at the first unsaved pack. -void serializeSavedColumnFiles(WriteBuffer & buf, const ColumnFiles & column_files); +void serializeSavedColumnFiles(WriteBuffer & buf, const ColumnFilePersisteds & column_files); /// Recreate pack instances from buf. 
-ColumnFiles deserializeSavedColumnFiles(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf); +ColumnFilePersisteds deserializeSavedColumnFiles(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf); -void serializeSavedColumnFilesInV2Format(WriteBuffer & buf, const ColumnFiles & column_files); -ColumnFiles deserializeSavedColumnFilesInV2Format(ReadBuffer & buf, UInt64 version); +void serializeSavedColumnFilesInV2Format(WriteBuffer & buf, const ColumnFilePersisteds & column_files); +ColumnFilePersisteds deserializeSavedColumnFilesInV2Format(ReadBuffer & buf, UInt64 version); -void serializeSavedColumnFilesInV3Format(WriteBuffer & buf, const ColumnFiles & column_files); -ColumnFiles deserializeSavedColumnFilesInV3Format(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf, UInt64 version); +void serializeSavedColumnFilesInV3Format(WriteBuffer & buf, const ColumnFilePersisteds & column_files); +ColumnFilePersisteds deserializeSavedColumnFilesInV3Format(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf, UInt64 version); } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp new file mode 100644 index 00000000000..631e396ad80 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp @@ -0,0 +1,266 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace DM +{ +std::pair findColumnFile(const ColumnFiles & column_files, size_t rows_offset, size_t deletes_offset) +{ + size_t rows_count = 0; + size_t deletes_count = 0; + size_t column_file_index = 0; + for (; column_file_index < column_files.size(); ++column_file_index) + { + if (rows_count == rows_offset && deletes_count == deletes_offset) + return {column_file_index, 0}; + auto & column_file = column_files[column_file_index]; + + if 
(column_file->isDeleteRange()) + { + if (deletes_count == deletes_offset) + { + if (unlikely(rows_count != rows_offset)) + throw Exception("rows_count and rows_offset are expected to be equal. pack_index: " + DB::toString(column_file_index) + + ", pack_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) + + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) + + ", deletes_offset: " + DB::toString(deletes_offset)); + return {column_file_index, 0}; + } + ++deletes_count; + } + else + { + rows_count += column_file->getRows(); + if (rows_count > rows_offset) + { + if (unlikely(deletes_count != deletes_offset)) + throw Exception("deletes_count and deletes_offset are expected to be equal. pack_index: " + DB::toString(column_file_index) + + ", pack_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) + + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) + + ", deletes_offset: " + DB::toString(deletes_offset)); + + return {column_file_index, column_file->getRows() - (rows_count - rows_offset)}; + } + } + } + if (rows_count != rows_offset || deletes_count != deletes_offset) + throw Exception("illegal rows_offset and deletes_offset. 
pack_size: " + DB::toString(column_files.size()) + + ", rows_count: " + DB::toString(rows_count) + ", rows_offset: " + DB::toString(rows_offset) + + ", deletes_count: " + DB::toString(deletes_count) + ", deletes_offset: " + DB::toString(deletes_offset)); + + return {column_file_index, 0}; +} + +ColumnFileSetReader::ColumnFileSetReader( + const DMContext & context, + const ColumnFileSetSnapshotPtr & snapshot_, + const ColumnDefinesPtr & col_defs_, + const RowKeyRange & segment_range_) + : snapshot(snapshot_) + , col_defs(col_defs_) + , segment_range(segment_range_) +{ + size_t total_rows = 0; + for (auto & f : snapshot->getColumnFiles()) + { + total_rows += f->getRows(); + column_file_rows.push_back(f->getRows()); + column_file_rows_end.push_back(total_rows); + column_file_readers.push_back(f->getReader(context, snapshot->getStorageSnapshot(), col_defs)); + } +} + +ColumnFileSetReaderPtr ColumnFileSetReader::createNewReader(const ColumnDefinesPtr & new_col_defs) +{ + auto * new_reader = new ColumnFileSetReader(); + new_reader->snapshot = snapshot; + new_reader->col_defs = new_col_defs; + new_reader->segment_range = segment_range; + new_reader->column_file_rows = column_file_rows; + new_reader->column_file_rows_end = column_file_rows_end; + + for (auto & fr : column_file_readers) + new_reader->column_file_readers.push_back(fr->createNewReader(new_col_defs)); + + return std::shared_ptr(new_reader); +} + +Block ColumnFileSetReader::readPKVersion(size_t offset, size_t limit) +{ + MutableColumns cols; + for (size_t i = 0; i < 2; ++i) + cols.push_back((*col_defs)[i].type->createColumn()); + readRows(cols, offset, limit, nullptr); + Block block; + for (size_t i = 0; i < 2; ++i) + { + const auto & cd = (*col_defs)[i]; + block.insert(ColumnWithTypeAndName(std::move(cols[i]), cd.type, cd.name, cd.id)); + } + return block; +} + +size_t ColumnFileSetReader::readRows(MutableColumns & output_columns, size_t offset, size_t limit, const RowKeyRange * range) +{ + // Note that 
DeltaMergeBlockInputStream could ask for rows with larger index than total_delta_rows, + // because DeltaIndex::placed_rows could be larger than total_delta_rows. + // Here is the example: + // 1. Thread A create a delta snapshot with 10 rows. Now DeltaValueSnapshot::shared_delta_index->placed_rows == 10. + // 2. Thread B insert 5 rows into the delta + // 3. Thread B call Segment::ensurePlace to generate a new DeltaTree, placed_rows = 15, and update DeltaValueSnapshot::shared_delta_index = 15 + // 4. Thread A call Segment::ensurePlace, and DeltaValueReader::shouldPlace will return false. Because placed_rows(15) >= 10 + // 5. Thread A use the DeltaIndex with placed_rows = 15 to do the merge in DeltaMergeBlockInputStream + // + // So here, we should filter out those out-of-range rows. + + auto total_delta_rows = snapshot->getRows(); + + auto start = std::min(offset, total_delta_rows); + auto end = std::min(offset + limit, total_delta_rows); + if (end == start) + return 0; + + auto [start_pack_index, rows_start_in_start_pack] = locatePosByAccumulation(column_file_rows_end, start); + auto [end_pack_index, rows_end_in_end_pack] = locatePosByAccumulation(column_file_rows_end, end); + + size_t actual_read = 0; + for (size_t pack_index = start_pack_index; pack_index <= end_pack_index; ++pack_index) + { + size_t rows_start_in_pack = pack_index == start_pack_index ? rows_start_in_start_pack : 0; + size_t rows_end_in_pack = pack_index == end_pack_index ? rows_end_in_end_pack : column_file_rows[pack_index]; + size_t rows_in_pack_limit = rows_end_in_pack - rows_start_in_pack; + + // Nothing to read. 
+ if (rows_start_in_pack == rows_end_in_pack) + continue; + + auto & column_file_reader = column_file_readers[pack_index]; + actual_read += column_file_reader->readRows(output_columns, rows_start_in_pack, rows_in_pack_limit, range); + } + return actual_read; +} + +void ColumnFileSetReader::getPlaceItems(BlockOrDeletes & place_items, size_t rows_begin, size_t deletes_begin, size_t rows_end, size_t deletes_end, size_t place_rows_offset) +{ + /// Note that we merge the consecutive DeltaPackBlock together, which are seperated in groups by DeltaPackDelete and DeltePackFile. + auto & column_files = snapshot->getColumnFiles(); + + auto [start_pack_index, rows_start_in_start_pack] = findColumnFile(column_files, rows_begin, deletes_begin); + auto [end_pack_index, rows_end_in_end_pack] = findColumnFile(column_files, rows_end, deletes_end); + + size_t block_rows_start = rows_begin; + size_t block_rows_end = rows_begin; + + for (size_t pack_index = start_pack_index; pack_index < column_files.size() && pack_index <= end_pack_index; ++pack_index) + { + auto & pack = *column_files[pack_index]; + + if (pack.isDeleteRange() || pack.isBigFile()) + { + // First, compact the DeltaPackBlocks before this pack into one block. + if (block_rows_end != block_rows_start) + { + auto block = readPKVersion(block_rows_start, block_rows_end - block_rows_start); + place_items.emplace_back(std::move(block), block_rows_start + place_rows_offset); + } + + // Second, take current pack. + if (auto * pack_delete = pack.tryToDeleteRange(); pack_delete) + { + place_items.emplace_back(pack_delete->getDeleteRange()); + } + else if (pack.isBigFile() && pack.getRows()) + { + auto block = readPKVersion(block_rows_end, pack.getRows()); + place_items.emplace_back(std::move(block), block_rows_end + place_rows_offset); + } + + block_rows_end += pack.getRows(); + block_rows_start = block_rows_end; + } + else + { + // It is a DeltaPackBlock. + size_t rows_start_in_pack = pack_index == start_pack_index ? 
rows_start_in_start_pack : 0; + size_t rows_end_in_pack = pack_index == end_pack_index ? rows_end_in_end_pack : pack.getRows(); + + block_rows_end += rows_end_in_pack - rows_start_in_pack; + + if (pack_index == column_files.size() - 1 || pack_index == end_pack_index) + { + // It is the last pack. + if (block_rows_end != block_rows_start) + { + auto block = readPKVersion(block_rows_start, block_rows_end - block_rows_start); + place_items.emplace_back(std::move(block), block_rows_start + place_rows_offset); + } + block_rows_start = block_rows_end; + } + } + } +} + +bool ColumnFileSetReader::shouldPlace(const DMContext & context, + const RowKeyRange & relevant_range, + UInt64 max_version, + size_t placed_rows) +{ + auto & column_files = snapshot->getColumnFiles(); + auto [start_pack_index, rows_start_in_start_pack] = locatePosByAccumulation(column_file_rows_end, placed_rows); + + for (size_t pack_index = start_pack_index; pack_index < snapshot->getColumnFileCount(); ++pack_index) + { + auto & column_file = column_files[pack_index]; + + // Always do place index if DeltaPackFile exists. + if (column_file->isBigFile()) + return true; + if (unlikely(column_file->isDeleteRange())) + throw Exception("pack is delete range", ErrorCodes::LOGICAL_ERROR); + + size_t rows_start_in_pack = pack_index == start_pack_index ? 
rows_start_in_start_pack : 0; + size_t rows_end_in_pack = column_file_rows[pack_index]; + + auto & pack_reader = column_file_readers[pack_index]; + if (column_file->isInMemoryFile()) + { + auto & dpb_reader = typeid_cast(*pack_reader); + auto pk_column = dpb_reader.getPKColumn(); + auto version_column = dpb_reader.getVersionColumn(); + + auto rkcc = RowKeyColumnContainer(pk_column, context.is_common_handle); + auto & version_col_data = toColumnVectorData(version_column); + + for (auto i = rows_start_in_pack; i < rows_end_in_pack; ++i) + { + if (version_col_data[i] <= max_version && relevant_range.check(rkcc.getRowKeyValue(i))) + return true; + } + } + else + { + auto & dpb_reader = typeid_cast(*pack_reader); + auto pk_column = dpb_reader.getPKColumn(); + auto version_column = dpb_reader.getVersionColumn(); + + auto rkcc = RowKeyColumnContainer(pk_column, context.is_common_handle); + auto & version_col_data = toColumnVectorData(version_column); + + for (auto i = rows_start_in_pack; i < rows_end_in_pack; ++i) + { + if (version_col_data[i] <= max_version && relevant_range.check(rkcc.getRowKeyValue(i))) + return true; + } + } + } + + return false; +} + +} // namespace DM +} // namespace DB \ No newline at end of file diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h new file mode 100644 index 00000000000..16b3ce3aa53 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h @@ -0,0 +1,104 @@ +#pragma once + +#include + +namespace DB +{ +namespace DM +{ +class ColumnFileSetReader +{ + friend class ColumnFileSetInputStream; + +private: + ColumnFileSetSnapshotPtr snapshot; + + // The columns expected to read. Note that we will do reading exactly in this column order. + ColumnDefinesPtr col_defs; + RowKeyRange segment_range; + + // The row count of each pack. Cache here to speed up checking. + std::vector column_file_rows; + // The cumulative rows of packs. 
Used to fast locate specific packs according to rows offset by binary search. + std::vector column_file_rows_end; + + std::vector column_file_readers; + +private: + ColumnFileSetReader() = default; + + Block readPKVersion(size_t offset, size_t limit); + +public: + ColumnFileSetReader(const DMContext & context_, + const ColumnFileSetSnapshotPtr & snapshot_, + const ColumnDefinesPtr & col_defs_, + const RowKeyRange & segment_range_); + + // If we need to read columns besides pk and version, a DeltaValueReader can NOT be used more than once. + // This method create a new reader based on then current one. It will reuse some caches in the current reader. + ColumnFileSetReaderPtr createNewReader(const ColumnDefinesPtr & new_col_defs); + + // Use for DeltaMergeBlockInputStream to read rows from MemTableSet to do full compaction with other layer. + // This method will check whether offset and limit are valid. It only return those valid rows. + size_t readRows(MutableColumns & output_columns, size_t offset, size_t limit, const RowKeyRange * range); + + void getPlaceItems(BlockOrDeletes & place_items, size_t rows_begin, size_t deletes_begin, size_t rows_end, size_t deletes_end, size_t place_rows_offset = 0); + + bool shouldPlace(const DMContext & context, + const RowKeyRange & relevant_range, + UInt64 max_version, + size_t placed_rows); +}; + +class ColumnFileSetInputStream : public IBlockInputStream +{ +private: + ColumnFileSetReader reader; + ColumnFiles & column_files; + size_t column_files_count; + + ColumnFileReaderPtr cur_column_file_reader = {}; + size_t next_pack_index = 0; + +public: + ColumnFileSetInputStream(const DMContext & context_, + const ColumnFileSetSnapshotPtr & delta_snap_, + const ColumnDefinesPtr & col_defs_, + const RowKeyRange & segment_range_) + : reader(context_, delta_snap_, col_defs_, segment_range_) + , column_files(reader.snapshot->getColumnFiles()) + , column_files_count(column_files.size()) + {} + + String getName() const override { return 
"ColumnFileSet"; } + Block getHeader() const override { return toEmptyBlock(*(reader.col_defs)); } + + Block read() override + { + while (cur_column_file_reader || next_pack_index < column_files_count) + { + if (!cur_column_file_reader) + { + if (column_files[next_pack_index]->isDeleteRange()) + { + ++next_pack_index; + continue; + } + else + { + cur_column_file_reader = reader.column_file_readers[next_pack_index]; + ++next_pack_index; + } + } + Block block = cur_column_file_reader->readNextBlock(); + if (block) + return block; + else + cur_column_file_reader = {}; + } + return {}; + } +}; +} // namespace DM +} // namespace DB \ No newline at end of file diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp new file mode 100644 index 00000000000..8fef83f9166 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp @@ -0,0 +1,20 @@ +#include +#include + +namespace DB +{ +namespace DM +{ +RowKeyRange ColumnFileSetSnapshot::getSquashDeleteRange() const +{ + RowKeyRange squashed_delete_range = RowKeyRange::newNone(is_common_handle, rowkey_column_size); + for (auto iter = column_files.cbegin(); iter != column_files.cend(); ++iter) + { + const auto & column_file = *iter; + if (auto f_delete = column_file->tryToDeleteRange(); f_delete) + squashed_delete_range = squashed_delete_range.merge(f_delete->getDeleteRange()); + } + return squashed_delete_range; +} +} // namespace DM +} // namespace DB \ No newline at end of file diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.h new file mode 100644 index 00000000000..979f6749b07 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.h @@ -0,0 +1,92 @@ +#pragma once + +#include + +namespace DB +{ +namespace DM +{ +class ColumnFileSetSnapshot; +using ColumnFileSetSnapshotPtr = std::shared_ptr; 
+class ColumnFileSetReader; +using ColumnFileSetReaderPtr = std::shared_ptr; + +class BlockOrDelete +{ +private: + Block block; + size_t block_offset; + + RowKeyRange delete_range; + +public: + BlockOrDelete(Block && block_, size_t offset_) + : block(block_) + , block_offset(offset_) + {} + explicit BlockOrDelete(const RowKeyRange & delete_range_) + : delete_range(delete_range_) + {} + + bool isBlock() { return (bool)block; } + auto & getBlock() { return block; }; + auto getBlockOffset() { return block_offset; } + auto & getDeleteRange() { return delete_range; } +}; + +using BlockOrDeletes = std::vector; + +class ColumnFileSetSnapshot : public std::enable_shared_from_this + , private boost::noncopyable +{ + friend class MemTableSet; + friend class ColumnStableFileSet; + +private: + StorageSnapshotPtr storage_snap; + + ColumnFiles column_files; + size_t rows; + size_t bytes; + size_t deletes; + + bool is_common_handle; + size_t rowkey_column_size; + +public: + explicit ColumnFileSetSnapshot(const StorageSnapshotPtr & storage_snap_) + : storage_snap{storage_snap_} + {} + + explicit ColumnFileSetSnapshot(StorageSnapshotPtr && storage_snap_) + : storage_snap{std::move(storage_snap_)} + {} + + ColumnFileSetSnapshotPtr clone() + { + auto c = std::make_shared(storage_snap); + c->storage_snap = storage_snap; + c->column_files = column_files; + c->rows = rows; + c->bytes = bytes; + c->deletes = deletes; + c->is_common_handle = is_common_handle; + c->rowkey_column_size = rowkey_column_size; + + return c; + } + + ColumnFiles & getColumnFiles() { return column_files; } + + size_t getColumnFileCount() const { return column_files.size(); } + size_t getRows() const { return rows; } + size_t getBytes() const { return bytes; } + size_t getDeletes() const { return deletes; } + + RowKeyRange getSquashDeleteRange() const; + + const auto & getStorageSnapshot() { return storage_snap; } +}; + +} // namespace DM +} // namespace DB \ No newline at end of file diff --git 
a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp index 360a9338cdc..b32b1f6be7c 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp @@ -106,7 +106,7 @@ void ColumnFileTiny::fillColumns(const PageReader & page_reader, const ColumnDef ColumnFileReaderPtr ColumnFileTiny::getReader(const DMContext & /*context*/, const StorageSnapshotPtr & storage_snap, const ColumnDefinesPtr & col_defs) const { - return std::make_shared(*this, storage_snap, col_defs); + return std::make_shared(*this, storage_snap, col_defs); } void ColumnFileTiny::serializeMetadata(WriteBuffer & buf, bool save_schema) const @@ -191,7 +191,7 @@ PageId ColumnFileTiny::writeColumnFileData(DMContext & context, const Block & bl MemoryWriteBuffer write_buf; PageFieldSizes col_data_sizes; - for (auto & col : block) + for (const auto & col : block) { auto last_buf_size = write_buf.count(); serializeColumn(write_buf, *col.column, col.type, offset, limit, true); @@ -206,19 +206,19 @@ PageId ColumnFileTiny::writeColumnFileData(DMContext & context, const Block & bl } -ColumnPtr ColumnTinyFileReader::getPKColumn() +ColumnPtr ColumnFileTinyReader::getPKColumn() { tiny_file.fillColumns(storage_snap->log_reader, *col_defs, 1, cols_data_cache); return cols_data_cache[0]; } -ColumnPtr ColumnTinyFileReader::getVersionColumn() +ColumnPtr ColumnFileTinyReader::getVersionColumn() { tiny_file.fillColumns(storage_snap->log_reader, *col_defs, 2, cols_data_cache); return cols_data_cache[1]; } -size_t ColumnTinyFileReader::readRows(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range) +size_t ColumnFileTinyReader::readRows(MutableColumns & output_cols, size_t rows_offset, size_t rows_limit, const RowKeyRange * range) { tiny_file.fillColumns(storage_snap->log_reader, *col_defs, output_cols.size(), cols_data_cache); @@ -226,7 +226,7 
@@ size_t ColumnTinyFileReader::readRows(MutableColumns & output_cols, size_t rows_ return copyColumnsData(cols_data_cache, pk_col, output_cols, rows_offset, rows_limit, range); } -Block ColumnTinyFileReader::readNextBlock() +Block ColumnFileTinyReader::readNextBlock() { if (read_done) return {}; @@ -239,10 +239,10 @@ Block ColumnTinyFileReader::readNextBlock() return genBlock(*col_defs, columns); } -ColumnFileReaderPtr ColumnTinyFileReader::createNewReader(const ColumnDefinesPtr & new_col_defs) +ColumnFileReaderPtr ColumnFileTinyReader::createNewReader(const ColumnDefinesPtr & new_col_defs) { // Reuse the cache data. - return std::make_shared(tiny_file, storage_snap, new_col_defs, cols_data_cache); + return std::make_shared(tiny_file, storage_snap, new_col_defs, cols_data_cache); } } // namespace DM diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h index 9e0366650db..f4be243c049 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h @@ -16,7 +16,7 @@ using ColumnTinyFilePtr = std::shared_ptr; /// And it may have cache data if the column file is small enough(The details are in the flush process). class ColumnFileTiny : public ColumnFilePersisted { - friend class ColumnTinyFileReader; + friend class ColumnFileTinyReader; private: BlockPtr schema; @@ -106,13 +106,12 @@ class ColumnFileTiny : public ColumnFilePersisted + ",bytes:" + DB::toString(bytes) // + ",data_page_id:" + DB::toString(data_page_id) // + ",schema:" + (schema ? schema->dumpStructure() : "none") // - + ",cache_block:" + (cache ? cache->block.dumpStructure() : "none") - + ",saved: " + DB::toString(saved) + "}"; + + ",cache_block:" + (cache ? 
cache->block.dumpStructure() : "none") + "}"; return s; } }; -class ColumnTinyFileReader : public ColumnFileReader +class ColumnFileTinyReader : public ColumnFileReader { private: const ColumnFileTiny & tiny_file; @@ -123,7 +122,7 @@ class ColumnTinyFileReader : public ColumnFileReader bool read_done = false; public: - ColumnTinyFileReader(const ColumnFileTiny & tiny_file_, + ColumnFileTinyReader(const ColumnFileTiny & tiny_file_, const StorageSnapshotPtr & storage_snap_, const ColumnDefinesPtr & col_defs_, const Columns & cols_data_cache_) @@ -134,7 +133,7 @@ class ColumnTinyFileReader : public ColumnFileReader { } - ColumnTinyFileReader(const ColumnFileTiny & tiny_file_, const StorageSnapshotPtr & storage_snap_, const ColumnDefinesPtr & col_defs_) + ColumnFileTinyReader(const ColumnFileTiny & tiny_file_, const StorageSnapshotPtr & storage_snap_, const ColumnDefinesPtr & col_defs_) : tiny_file(tiny_file_) , storage_snap(storage_snap_) , col_defs(col_defs_) diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp index 89df82085f3..37a8334ff04 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp @@ -19,31 +19,27 @@ struct ColumnFile_V2 using ColumnFile_V2Ptr = std::shared_ptr; using ColumnFiles_V2 = std::vector; -inline ColumnFiles transform_V2_to_V3(const ColumnFiles_V2 & column_files_v2) +inline ColumnFilePersisteds transform_V2_to_V3(const ColumnFiles_V2 & column_files_v2) { - ColumnFiles column_files_v3; + ColumnFilePersisteds column_files_v3; for (const auto & f : column_files_v2) { - ColumnFilePtr f_v3; + ColumnFilePersistedPtr f_v3; if (f->isDeleteRange()) f_v3 = std::make_shared(std::move(f->delete_range)); else f_v3 = std::make_shared(f->schema, f->rows, f->bytes, f->data_page_id); - f_v3->setSaved(); column_files_v3.push_back(f_v3); } return column_files_v3; } -inline ColumnFiles_V2 
transformSaved_V3_to_V2(const ColumnFiles & column_files_v3) +inline ColumnFiles_V2 transformSaved_V3_to_V2(const ColumnFilePersisteds & column_files_v3) { ColumnFiles_V2 column_files_v2; for (const auto & f : column_files_v3) { - if (!f->isSaved()) - break; - auto * f_v2 = new ColumnFile_V2(); if (auto * f_delete = f->tryToDeleteRange(); f_delete) @@ -75,7 +71,7 @@ inline void serializeColumnFile_V2(const ColumnFile_V2 & column_file, const Bloc writeIntBinary(column_file.data_page_id, buf); if (schema) { - writeIntBinary((UInt32)schema->columns(), buf); + writeIntBinary(static_cast(schema->columns()), buf); for (auto & col : *column_file.schema) { writeIntBinary(col.column_id, buf); @@ -85,7 +81,7 @@ inline void serializeColumnFile_V2(const ColumnFile_V2 & column_file, const Bloc } else { - writeIntBinary((UInt32)0, buf); + writeIntBinary(static_cast(0), buf); } } @@ -115,7 +111,7 @@ void serializeSavedColumnFiles_V2(WriteBuffer & buf, const ColumnFiles_V2 & colu } } -void serializeSavedColumnFilesInV2Format(WriteBuffer & buf, const ColumnFiles & column_files) +void serializeSavedColumnFilesInV2Format(WriteBuffer & buf, const ColumnFilePersisteds & column_files) { serializeSavedColumnFiles_V2(buf, transformSaved_V3_to_V2(column_files)); } @@ -147,7 +143,7 @@ inline ColumnFile_V2Ptr deserializeColumnFile_V2(ReadBuffer & buf, UInt64 versio return column_file; } -ColumnFiles deserializeSavedColumnFilesInV2Format(ReadBuffer & buf, UInt64 version) +ColumnFilePersisteds deserializeSavedColumnFilesInV2Format(ReadBuffer & buf, UInt64 version) { size_t size; readIntBinary(size, buf); @@ -169,4 +165,4 @@ ColumnFiles deserializeSavedColumnFilesInV2Format(ReadBuffer & buf, UInt64 versi } } // namespace DM -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V3.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V3.cpp index 6396829fb0b..69468c47261 100644 --- 
a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V3.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V3.cpp @@ -7,7 +7,7 @@ namespace DB { namespace DM { -void serializeSavedColumnFilesInV3Format(WriteBuffer & buf, const ColumnFiles & column_files) +void serializeSavedColumnFilesInV3Format(WriteBuffer & buf, const ColumnFilePersisteds & column_files) { size_t saved_packs = std::find_if(column_files.begin(), column_files.end(), [](const ColumnFilePtr & p) { return !p->isSaved(); }) - column_files.begin(); @@ -50,7 +50,7 @@ void serializeSavedColumnFilesInV3Format(WriteBuffer & buf, const ColumnFiles & } } -ColumnFiles deserializeSavedColumnFilesInV3Format(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf, UInt64 /*version*/) +ColumnFilePersisteds deserializeSavedColumnFilesInV3Format(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf, UInt64 /*version*/) { size_t column_file_count; readIntBinary(column_file_count, buf); From dc512e2e63098c3ab859ba4f2ba9560a6481ea2d Mon Sep 17 00:00:00 2001 From: lidezhu Date: Tue, 25 Jan 2022 11:33:02 +0800 Subject: [PATCH 02/23] add MemTableSet and ColumnFilePersistedSet --- .../DeltaMerge/Delta/ColumnFileFlushTask.cpp | 72 ++++ .../DeltaMerge/Delta/ColumnFileFlushTask.h | 62 +++ .../Delta/ColumnFilePersistedSet.cpp | 399 ++++++++++++++++++ .../DeltaMerge/Delta/ColumnFilePersistedSet.h | 120 ++++++ .../DeltaMerge/Delta/CompactDelta.cpp | 240 ----------- .../Storages/DeltaMerge/Delta/FlushDelta.cpp | 293 ------------- .../Storages/DeltaMerge/Delta/MemTableSet.cpp | 195 +++++++++ .../Storages/DeltaMerge/Delta/MemTableSet.h | 84 ++++ .../DeltaMerge/Delta/MinorCompaction.cpp | 64 +++ .../DeltaMerge/Delta/MinorCompaction.h | 84 ++++ 10 files changed, 1080 insertions(+), 533 deletions(-) create mode 100644 dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp create mode 100644 dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h create mode 100644 
dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp create mode 100644 dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h delete mode 100644 dbms/src/Storages/DeltaMerge/Delta/CompactDelta.cpp delete mode 100644 dbms/src/Storages/DeltaMerge/Delta/FlushDelta.cpp create mode 100644 dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp create mode 100644 dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h create mode 100644 dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp create mode 100644 dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp new file mode 100644 index 00000000000..f913019e71f --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp @@ -0,0 +1,72 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace DM +{ +ColumnFileFlushTask::ColumnFileFlushTask(DMContext & context_, const MemTableSetPtr & mem_table_set_, size_t current_flush_version_) + : context{context_} + , mem_table_set{mem_table_set_} + , current_flush_version{current_flush_version_} +{} + +DeltaIndex::Updates ColumnFileFlushTask::prepare(WriteBatches & wbs) +{ + DeltaIndex::Updates delta_index_updates; + /// Write prepared data to disk. 
+ for (auto & task : tasks) + { + if (!task.block_data) + { + results.push_back(std::static_pointer_cast(task.column_file)); + } + else + { + IColumn::Permutation perm; + task.sorted = sortBlockByPk(getExtraHandleColumnDefine(context.is_common_handle), task.block_data, perm); + if (task.sorted) + delta_index_updates.emplace_back(task.deletes_offset, task.rows_offset, perm); + + auto * mem_file = task.column_file->tryToInMemoryFile(); + ColumnFilePersistedPtr tiny_file; + bool is_small_file = mem_file->getRows() < context.delta_small_pack_rows || mem_file->getBytes() < context.delta_small_pack_bytes; + if (is_small_file) + { + tiny_file = std::make_shared(mem_file->getSchema(), + mem_file->getRows(), + mem_file->getBytes(), + task.data_page, + !task.sorted ? mem_file->getCache() : std::make_shared(std::move(task.block_data))); + } + else + { + tiny_file = std::make_shared(mem_file->getSchema(), + mem_file->getRows(), + mem_file->getBytes(), + task.data_page, + nullptr); + } + results.push_back(tiny_file); + } + } + + wbs.writeLogAndData(); + return delta_index_updates; +} + +bool ColumnFileFlushTask::commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs) +{ + // update metadata + if (!persisted_file_set->appendColumnStableFilesToLevel0(current_flush_version, results, wbs)) + return false; + + mem_table_set->removeColumnFilesInFlushTask(*this); + return true; +} +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h new file mode 100644 index 00000000000..7389e3d78e2 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h @@ -0,0 +1,62 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace DM +{ +class MemTableSet; +using MemTableSetPtr = std::shared_ptr; +class ColumnFilePersistedSet; +using ColumnFilePersistedSetPtr = 
std::shared_ptr; +class ColumnFileFlushTask; +using ColumnFileFlushTaskPtr = std::shared_ptr; + +class ColumnFileFlushTask +{ + friend class MemTableSet; + friend class ColumnFilePersistedSet; + +public: + struct Task + { + explicit Task(const ColumnFilePtr & column_file_) + : column_file(column_file_) + {} + + ColumnFilePtr column_file; + + Block block_data; + PageId data_page = 0; + + bool sorted = false; + size_t rows_offset = 0; + size_t deletes_offset = 0; + }; + using Tasks = std::vector; + +private: + Tasks tasks; + ColumnFilePersisteds results; + DMContext & context; + MemTableSetPtr mem_table_set; + size_t current_flush_version; + +public: + ColumnFileFlushTask(DMContext & context_, const MemTableSetPtr & mem_table_set_, size_t current_flush_version_); + + DeltaIndex::Updates prepare(WriteBatches & wbs); + + bool commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs); +}; +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp new file mode 100644 index 00000000000..fbf47eeb404 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -0,0 +1,399 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace DB +{ +namespace DM +{ +inline void serializeColumnStableFileLevels(WriteBatches & wbs, PageId id, const ColumnFilePersistedSet::ColumnStableFileLevels & file_levels) +{ + MemoryWriteBuffer buf(0, COLUMN_FILE_SERIALIZE_BUFFER_SIZE); + ColumnStableFiles column_files; + for (const auto & level : file_levels) + { + for (const auto & file : level) + { + column_files.emplace_back(file); + } + } + serializeColumnStableFiles(buf, column_files); + auto data_size = buf.count(); + wbs.meta.putPage(id, 0, buf.tryGetReadBuffer(), data_size); +} + +void ColumnFilePersistedSet::updateStats() +{ + size_t new_stable_files_count = 0; + size_t new_rows = 0; + 
size_t new_bytes = 0; + size_t new_deletes = 0; + for (auto & file_level : stable_files_levels) + { + new_stable_files_count += file_level.size(); + for (auto & file : file_level) + { + new_rows += file->getRows(); + new_bytes += file->getBytes(); + new_deletes += file->getDeletes(); + } + } + stable_files_count = new_stable_files_count; + rows = new_rows; + bytes = new_bytes; + deletes = new_deletes; +} + +ColumnFilePersistedSet::ColumnFilePersistedSet(PageId metadata_id_, const ColumnStableFiles & column_stable_files) + : metadata_id(metadata_id_) + , log(&Poco::Logger::get("ColumnStableFileSet")) +{ + // TODO: place column file to different levels + stable_files_levels.push_back(column_stable_files); + + updateStats(); +} + +ColumnStableFileSetPtr ColumnFilePersistedSet::restore(DMContext & context, const RowKeyRange & segment_range, PageId id) +{ + Page page = context.storage_pool.meta()->read(id, nullptr); + ReadBufferFromMemory buf(page.data.begin(), page.data.size()); + auto column_files = deserializeColumnStableFiles(context, segment_range, buf); + return std::make_shared(id, column_files); +} + +void ColumnFilePersistedSet::saveMeta(WriteBatches & wbs) const +{ + serializeColumnStableFileLevels(wbs, metadata_id, stable_files_levels); +} + +void ColumnFilePersistedSet::recordRemoveColumnFilesPages(WriteBatches & wbs) const +{ + for (const auto & level : stable_files_levels) + { + for (const auto & file : level) + file->removeData(wbs); + } +} + + +ColumnStableFiles ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & context, + const RowKeyRange & target_range, + const ColumnFiles & head_column_files, + WriteBatches & wbs) const +{ + // We check in the direction from the last level to the first level. + // In every level, we check from the begin to the last. 
+ auto it_1 = head_column_files.begin(); + auto level_it = stable_files_levels.rbegin(); + auto it_2 = level_it->begin(); + bool check_success = true; + if (likely(head_column_files.size() <= stable_files_count.load())) + { + while (it_1 != head_column_files.end() && level_it != stable_files_levels.rend()) + { + if (it_2 == level_it->end()) + { + level_it++; + if (unlikely(level_it == stable_files_levels.rend())) + throw Exception("Delta Check head algorithm broken", ErrorCodes::LOGICAL_ERROR); + it_2 = level_it->begin(); + continue; + } + if ((*it_1)->getId() != (*it_2)->getId() || (*it_1)->getRows() != (*it_2)->getRows()) + { + check_success = false; + break; + } + it_1++; + it_2++; + } + } + + if (unlikely(!check_success)) + { + LOG_ERROR(log, + info() << ", Delta Check head failed, unexpected size. head column files: " << columnFilesToString(head_column_files) + << ", level details: " << levelsInfo()); + throw Exception("Check head failed, unexpected size", ErrorCodes::LOGICAL_ERROR); + } + + ColumnStableFiles cloned_tail; + while (level_it != stable_files_levels.rend()) + { + if (it_2 == level_it->end()) + { + level_it++; + if (level_it == stable_files_levels.rend()) + break; + it_2 = level_it->begin(); + } + const auto & column_file = *it_2; + if (auto * cr = column_file->tryToDeleteRange(); cr) + { + auto new_dr = cr->getDeleteRange().shrink(target_range); + if (!new_dr.none()) + { + // Only use the available delete_range pack. + cloned_tail.push_back(cr->cloneWith(new_dr)); + } + } + else if (auto * tf = column_file->tryToTinyFile(); tf) + { + // Use a newly created page_id to reference the data page_id of current column file. 
+ PageId new_data_page_id = context.storage_pool.newLogPageId(); + wbs.log.putRefPage(new_data_page_id, tf->getDataPageId()); + auto new_column_file = tf->cloneWith(new_data_page_id); + cloned_tail.push_back(new_column_file); + } + else if (auto * f = column_file->tryToBigFile(); f) + { + auto delegator = context.path_pool.getStableDiskDelegator(); + auto new_ref_id = context.storage_pool.newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); + auto file_id = f->getFile()->fileId(); + wbs.data.putRefPage(new_ref_id, file_id); + auto file_parent_path = delegator.getDTFilePath(file_id); + auto new_file = DMFile::restore(context.db_context.getFileProvider(), file_id, /* ref_id= */ new_ref_id, file_parent_path, DMFile::ReadMetaMode::all()); + + auto new_big_file = f->cloneWith(context, new_file, target_range); + cloned_tail.push_back(new_big_file); + } + else + { + throw Exception("Meet unknown type of column file", ErrorCodes::LOGICAL_ERROR); + } + it_2++; + } + + return cloned_tail; +} + +size_t ColumnFilePersistedSet::getTotalCacheRows() const +{ + size_t cache_rows = 0; + for (const auto & level : stable_files_levels) + { + for (const auto & file : level) + { + if (auto * tf = file->tryToTinyFile(); tf) + { + if (auto && c = tf->getCache(); c) + cache_rows += c->block.rows(); + } + } + } + return cache_rows; +} + +size_t ColumnFilePersistedSet::getTotalCacheBytes() const +{ + size_t cache_bytes = 0; + for (const auto & level : stable_files_levels) + { + for (const auto & file : level) + { + if (auto * tf = file->tryToTinyFile(); tf) + { + if (auto && c = tf->getCache(); c) + cache_bytes += c->block.allocatedBytes(); + } + } + } + return cache_bytes; +} + +size_t ColumnFilePersistedSet::getValidCacheRows() const +{ + size_t cache_rows = 0; + for (const auto & level : stable_files_levels) + { + for (const auto & file : level) + { + if (auto * tf = file->tryToTinyFile(); tf) + { + if (auto && c = tf->getCache(); c) + cache_rows += tf->getRows(); + } + } + } + return 
cache_rows; +} + +bool ColumnFilePersistedSet::appendColumnStableFilesToLevel0(size_t prev_flush_version, const ColumnFilePersisteds & column_files, WriteBatches & wbs) +{ + if (prev_flush_version != flush_version) + { + LOG_DEBUG(log, simpleInfo() << " Stop flush because structure got updated"); + return false; + } + flush_version += 1; + ColumnStableFileLevels new_stable_files_levels; + for (auto & level : stable_files_levels) + { + auto & new_level = new_stable_files_levels.emplace_back(); + for (auto & file : level) + new_level.push_back(file); + } + if (new_stable_files_levels.empty()) + new_stable_files_levels.emplace_back(); + auto & new_level_0 = new_stable_files_levels[0]; + for (const auto & f : column_files) + new_level_0.push_back(f); + + /// Save the new metadata of column files to disk. + serializeColumnStableFileLevels(wbs, metadata_id, new_stable_files_levels); + wbs.writeMeta(); + + /// Commit updates in memory. + stable_files_levels.swap(new_stable_files_levels); + updateStats(); + + return true; +} + +MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & context) +{ + // Every time we try to compact all column files in a specific level. + // For ColumnTinyFile, we will try to combine small `ColumnTinyFile`s to a bigger one. + // For ColumnDeleteRangeFile and ColumnBigFile, we will simply move them to the next level. + // And only if there some small `ColumnTinyFile`s which can be combined together we will actually do the compaction. 
+ size_t check_level_num = 0; + while (check_level_num < stable_files_levels.size()) + { + if (next_compaction_level >= stable_files_levels.size()) + next_compaction_level = 0; + + auto compaction = std::make_shared(next_compaction_level); + auto & level = stable_files_levels[next_compaction_level]; + if (!level.empty()) + { + bool is_all_trivial_move = true; + MinorCompaction::Task cur_task; + for (auto & file : level) + { + auto packup_cur_task = [&]() { + bool is_trivial_move = compaction->packUpTask(std::move(cur_task)); + is_all_trivial_move = is_all_trivial_move && is_trivial_move; + cur_task = {}; + }; + + if (auto * t_file = file->tryToTinyFile(); t_file) + { + bool cur_task_full = cur_task.total_rows >= context.delta_small_pack_rows; + bool small_column_file = t_file->getRows() < context.delta_small_pack_rows; + bool schema_ok + = cur_task.to_compact.empty(); + if (!schema_ok) + { + if (auto * last_t_file = cur_task.to_compact.back()->tryToTinyFile(); last_t_file) + schema_ok = t_file->getSchema() == last_t_file->getSchema(); + } + + if (cur_task_full || !small_column_file || !schema_ok) + packup_cur_task(); + + cur_task.addColumnFile(file); + } + else + { + packup_cur_task(); + cur_task.addColumnFile(file); + } + } + bool is_trivial_move = compaction->packUpTask(std::move(cur_task)); + is_all_trivial_move = is_all_trivial_move && is_trivial_move; + + if (!is_all_trivial_move) + return compaction; + } + next_compaction_level++; + } + return nullptr; +} + +bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & compaction, WriteBatches & wbs) +{ + if (compaction->current_compaction_version != minor_compaction_version) + { + LOG_WARNING(log, "Structure has been updated during compact"); + return false; + } + ColumnStableFileLevels new_stable_files_levels; + for (size_t i = 0; i < compaction->compaction_src_level; i++) + { + auto & new_level = new_stable_files_levels.emplace_back(); + for (const auto & f : stable_files_levels[i]) + 
new_level.push_back(f); + } + // Create a new empty level for `compaction_src_level` because all the column files is compacted to next level + new_stable_files_levels.emplace_back(); + + // Add new file to the target level + auto target_level = compaction->compaction_src_level + 1; + auto & target_level_files = new_stable_files_levels.emplace_back(); + if (stable_files_levels.size() > target_level) + { + for (auto & column_file : stable_files_levels[target_level]) + target_level_files.emplace_back(column_file); + } + for (auto & task : compaction->tasks) + { + if (task.is_trivial_move) + target_level_files.push_back(task.to_compact[0]); + else + target_level_files.push_back(task.result); + } + + // Append remaining levels + for (size_t i = target_level + 1; i < stable_files_levels.size(); i++) + { + auto & new_level = new_stable_files_levels.emplace_back(); + for (const auto & f : stable_files_levels[i]) + new_level.push_back(f); + } + + /// Save the new metadata of column files to disk. + serializeColumnStableFileLevels(wbs, metadata_id, new_stable_files_levels); + wbs.writeMeta(); + + /// Commit updates in memory. 
+ stable_files_levels.swap(new_stable_files_levels); + updateStats(); + + return true; +} + +ColumnFileSetSnapshotPtr ColumnFilePersistedSet::createSnapshot(const DMContext & context) +{ + auto storage_snap = std::make_shared(context.storage_pool, context.getReadLimiter(), true); + auto snap = std::make_shared(std::move(storage_snap)); + snap->rows = rows; + snap->bytes = bytes; + snap->deletes = deletes; + + size_t total_rows = 0; + size_t total_deletes = 0; + for (const auto & level : stable_files_levels) + { + for (const auto & file : level) + { + snap->column_files.push_back(file); + total_rows += file->getRows(); + total_deletes += file->getDeletes(); + } + } + + if (unlikely(total_rows != rows || total_deletes != deletes)) + throw Exception("Rows and deletes check failed!", ErrorCodes::LOGICAL_ERROR); + + return snap; +} +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h new file mode 100644 index 00000000000..bc5f43bf5e5 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h @@ -0,0 +1,120 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace DM +{ +class ColumnFilePersistedSet; +using ColumnStableFileSetPtr = std::shared_ptr; + +/// This class is not thread safe, manipulate on it requires acquire extra synchronization +class ColumnFilePersistedSet : public std::enable_shared_from_this + , private boost::noncopyable +{ +public: + using ColumnStableFileLevel = ColumnStableFiles; + using ColumnStableFileLevels = std::vector; + +private: + PageId metadata_id; + ColumnStableFileLevels stable_files_levels; + std::atomic stable_files_count; + + std::atomic rows = 0; + std::atomic bytes = 0; + std::atomic deletes = 0; 
+ + UInt64 flush_version = 0; + + size_t next_compaction_level = 0; + UInt64 minor_compaction_version = 0; + + Poco::Logger * log; + +private: + void updateStats(); + +public: + ColumnFilePersistedSet(PageId metadata_id_, const ColumnStableFiles & column_stable_files = {}); + + /// Restore the metadata of this instance. + /// Only called after reboot. + static ColumnStableFileSetPtr restore(DMContext & context, const RowKeyRange & segment_range, PageId id); + + String simpleInfo() const { return "ColumnStableFileSet [" + DB::toString(metadata_id) + "]"; } + String info() const + { + String levels_summary; + for (size_t i = 0; i < stable_files_levels.size(); i++) + levels_summary += fmt::format("[{}]: {}", i, stable_files_levels[i].size()); + + return fmt::format("ColumnStableFileSet [{}][{}]: {} column files, {} rows, {} bytes, {} deletes", + metadata_id, + levels_summary, + stable_files_count.load(), + rows.load(), + bytes.load(), + deletes.load()); + } + String levelsInfo() const + { + String levels_info; + for (size_t i = 0; i < stable_files_levels.size(); i++) + levels_info += fmt::format("[{}]: {}", i, columnFilesToString(stable_files_levels[i])); + return levels_info; + } + + void saveMeta(WriteBatches & wbs) const; + + void recordRemoveColumnFilesPages(WriteBatches & wbs) const; + + ColumnStableFiles + checkHeadAndCloneTail(DMContext & context, const RowKeyRange & target_range, const ColumnFiles & head_column_files, WriteBatches & wbs) const; + + PageId getId() const { return metadata_id; } + + size_t getColumnFileCount() const { return stable_files_count.load(); } + size_t getRows() const { return rows.load(); } + size_t getBytes() const { return bytes.load(); } + size_t getDeletes() const { return deletes.load(); } + + size_t getTotalCacheRows() const; + size_t getTotalCacheBytes() const; + size_t getValidCacheRows() const; + + size_t getCurrentFlushVersion() const { return flush_version; } + + bool appendColumnStableFilesToLevel0(size_t 
prev_flush_version, const ColumnFilePersisteds & column_files, WriteBatches & wbs); + + MinorCompactionPtr pickUpMinorCompaction(DMContext & context); + + bool installCompactionResults(const MinorCompactionPtr & compaction, WriteBatches & wbs); + + ColumnFileSetSnapshotPtr createSnapshot(const DMContext & context); +}; + +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/Delta/CompactDelta.cpp b/dbms/src/Storages/DeltaMerge/Delta/CompactDelta.cpp deleted file mode 100644 index 32327beca6e..00000000000 --- a/dbms/src/Storages/DeltaMerge/Delta/CompactDelta.cpp +++ /dev/null @@ -1,240 +0,0 @@ -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace CurrentMetrics -{ -extern const Metric DT_SnapshotOfDeltaCompact; -} // namespace CurrentMetrics - -namespace DB::DM -{ -struct CompackTask -{ - CompackTask() {} - - ColumnFiles to_compact; - size_t total_rows = 0; - size_t total_bytes = 0; - - ColumnFilePtr result; - - void addColumnFile(const ColumnFilePtr & column_file) - { - total_rows += column_file->getRows(); - total_bytes += column_file->getBytes(); - to_compact.push_back(column_file); - } -}; -using CompackTasks = std::vector; - -bool DeltaValueSpace::compact(DMContext & context) -{ - LOG_DEBUG(log, info() << " Compact start"); - - bool v = false; - // Other thread is doing structure update, just return. - if (!is_updating.compare_exchange_strong(v, true)) - { - LOG_DEBUG(log, simpleInfo() << " Compact stop because updating"); - - return true; - } - SCOPE_EXIT({ - bool v = true; - if (!is_updating.compare_exchange_strong(v, false)) - throw Exception(simpleInfo() + " is expected to be updating", ErrorCodes::LOGICAL_ERROR); - }); - - CompackTasks tasks; - PageStorage::SnapshotPtr log_storage_snap; - CurrentMetrics::Increment snapshot_metrics{CurrentMetrics::DT_SnapshotOfDeltaCompact, 0}; - - { - /// Prepare compact tasks. 
- - std::scoped_lock lock(mutex); - if (abandoned.load(std::memory_order_relaxed)) - { - LOG_DEBUG(log, simpleInfo() << " Compact stop because abandoned"); - return false; - } - - CompackTask cur_task; - for (auto & pack : column_files) - { - if (!pack->isSaved()) - break; - - auto packup_cur_task = [&]() { - if (cur_task.to_compact.size() >= 2) - { - tasks.push_back(std::move(cur_task)); - } - else - { - // Maybe this pack is small, but it cannot be merged with other packs, so also remove it's cache. - for (auto & p : cur_task.to_compact) - { - p->tryToTinyFile()->clearCache(); - } - } - - cur_task = {}; - }; - - if (auto dp_block = pack->tryToTinyFile(); dp_block) - { - bool cur_task_full = cur_task.total_rows >= context.delta_small_pack_rows; - bool small_pack = pack->getRows() < context.delta_small_pack_rows; - bool schema_ok - = cur_task.to_compact.empty() || dp_block->getSchema() == cur_task.to_compact.back()->tryToTinyFile()->getSchema(); - - if (cur_task_full || !small_pack || !schema_ok) - packup_cur_task(); - - if (small_pack) - cur_task.addColumnFile(pack); - else - // Then this pack's cache should not exist. - dp_block->clearCache(); - } - else - { - packup_cur_task(); - } - } - if (cur_task.to_compact.size() >= 2) - tasks.push_back(std::move(cur_task)); - - if (tasks.empty()) - { - LOG_DEBUG(log, simpleInfo() << " Nothing to compact"); - return true; - } - - log_storage_snap = context.storage_pool.log()->getSnapshot(); - snapshot_metrics.changeTo(1); // add metrics for snapshot - } - - /// Write generated compact packs' data. 
- - size_t total_compact_packs = 0; - size_t total_compact_rows = 0; - - WriteBatches wbs(context.storage_pool, context.getWriteLimiter()); - PageReader reader(context.storage_pool.log(), std::move(log_storage_snap), context.getReadLimiter()); - for (auto & task : tasks) - { - auto & schema = *(task.to_compact[0]->tryToTinyFile()->getSchema()); - auto compact_columns = schema.cloneEmptyColumns(); - - // Read data from old packs - for (auto & pack : task.to_compact) - { - auto dp_block = pack->tryToTinyFile(); - if (unlikely(!dp_block)) - throw Exception("The compact candidate is not a DeltaPackBlock", ErrorCodes::LOGICAL_ERROR); - - // We ensure schema of all packs are the same - Block block = dp_block->readBlockForMinorCompaction(reader); - size_t block_rows = block.rows(); - for (size_t i = 0; i < schema.columns(); ++i) - { - compact_columns[i]->insertRangeFrom(*block.getByPosition(i).column, 0, block_rows); - } - - wbs.removed_log.delPage(dp_block->getDataPageId()); - } - - Block compact_block = schema.cloneWithColumns(std::move(compact_columns)); - auto compact_rows = compact_block.rows(); - - // Note that after compact, caches are no longer exist. - - // Use the original schema instance, so that we can avoid serialize the new schema instance. - auto compact_column_file = ColumnFileTiny::writeColumnFile(context, compact_block, 0, compact_rows, wbs, task.to_compact.front()->tryToTinyFile()->getSchema()); - compact_column_file->setSaved(); - - wbs.writeLogAndData(); - task.result = compact_column_file; - - total_compact_packs += task.to_compact.size(); - total_compact_rows += compact_rows; - } - - { - std::scoped_lock lock(mutex); - - /// Check before commit. 
- if (abandoned.load(std::memory_order_relaxed)) - { - wbs.rollbackWrittenLogAndData(); - LOG_DEBUG(log, simpleInfo() << " Stop compact because abandoned"); - return false; - } - - ColumnFiles new_packs; - auto old_packs_offset = column_files.begin(); - for (auto & task : tasks) - { - auto old_it = old_packs_offset; - auto locate_it = [&](const ColumnFilePtr & pack) { - for (; old_it != column_files.end(); ++old_it) - { - if (*old_it == pack) - return old_it; - } - return old_it; - }; - - auto start_it = locate_it(task.to_compact.front()); - auto end_it = locate_it(task.to_compact.back()); - - if (unlikely(start_it == column_files.end() || end_it == column_files.end())) - { - LOG_WARNING(log, "Structure has been updated during compact"); - wbs.rollbackWrittenLogAndData(); - LOG_DEBUG(log, simpleInfo() << " Compact stop because structure got updated"); - return false; - } - - new_packs.insert(new_packs.end(), old_packs_offset, start_it); - new_packs.push_back(task.result); - - old_packs_offset = end_it + 1; - } - new_packs.insert(new_packs.end(), old_packs_offset, column_files.end()); - - checkColumnFiles(new_packs); - - /// Save the new metadata of packs to disk. - MemoryWriteBuffer buf(0, COLUMN_FILE_SERIALIZE_BUFFER_SIZE); - serializeSavedColumnFiles(buf, new_packs); - const auto data_size = buf.count(); - - wbs.meta.putPage(id, 0, buf.tryGetReadBuffer(), data_size); - wbs.writeMeta(); - - /// Update packs in memory. 
- column_files.swap(new_packs); - - last_try_compact_packs = std::min(column_files.size(), last_try_compact_packs.load()); - - LOG_DEBUG(log, - simpleInfo() << " Successfully compacted " << total_compact_packs << " packs into " << tasks.size() << " packs, total " - << total_compact_rows << " rows."); - } - - wbs.writeRemoves(); - - return true; -} - -} // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/Delta/FlushDelta.cpp b/dbms/src/Storages/DeltaMerge/Delta/FlushDelta.cpp deleted file mode 100644 index 5c54066c609..00000000000 --- a/dbms/src/Storages/DeltaMerge/Delta/FlushDelta.cpp +++ /dev/null @@ -1,293 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ProfileEvents -{ -extern const Event DMWriteBytes; -extern const Event PSMWriteBytes; -extern const Event WriteBufferFromFileDescriptorWriteBytes; -extern const Event WriteBufferAIOWriteBytes; -} // namespace ProfileEvents - -namespace DB::DM -{ -struct FlushPackTask -{ - FlushPackTask(const ColumnFilePtr & pack_) - : pack(pack_) - {} - - ColumnFilePtr pack; - - Block block_data; - PageId data_page = 0; - - bool sorted = false; - size_t rows_offset = 0; - size_t deletes_offset = 0; -}; -using FlushPackTasks = std::vector; - -bool DeltaValueSpace::flush(DMContext & context) -{ - LOG_DEBUG(log, info() << ", Flush start"); - - /// We have two types of data needed to flush to disk: - /// 1. The cache data in DeltaPackBlock - /// 2. The serialized metadata of packs in DeltaValueSpace - - FlushPackTasks tasks; - WriteBatches wbs(context.storage_pool, context.getWriteLimiter()); - - size_t flush_rows = 0; - size_t flush_bytes = 0; - size_t flush_deletes = 0; - - DeltaIndexPtr cur_delta_index; - { - /// Prepare data which will be written to disk. 
- std::scoped_lock lock(mutex); - if (abandoned.load(std::memory_order_relaxed)) - { - LOG_DEBUG(log, simpleInfo() << "Flush stop because abandoned"); - return false; - } - - size_t total_rows = 0; - size_t total_deletes = 0; - for (auto & pack : column_files) - { - if (unlikely(!tasks.empty() && pack->isSaved())) - { - String msg = "Pack should not already saved, because previous packs are not saved."; - - LOG_ERROR(log, simpleInfo() << msg << " Packs: " << columnFilesToString(column_files)); - throw Exception(msg, ErrorCodes::LOGICAL_ERROR); - } - - - if (!pack->isSaved()) - { - auto & task = tasks.emplace_back(pack); - - if (auto * dpb = pack->tryToInMemoryFile(); dpb) - { - // Stop other threads appending to this pack. - dpb->disableAppend(); - task.rows_offset = total_rows; - task.deletes_offset = total_deletes; - task.block_data = dpb->readDataForFlush(); - } - - flush_rows += pack->getRows(); - flush_bytes += pack->getBytes(); - flush_deletes += pack->isDeleteRange(); - } - - total_rows += pack->getRows(); - total_deletes += pack->isDeleteRange(); - } - - if (unlikely(flush_rows != unsaved_rows || flush_deletes != unsaved_deletes || total_rows != rows || total_deletes != deletes)) - throw Exception("Rows and deletes check failed", ErrorCodes::LOGICAL_ERROR); - - cur_delta_index = delta_index; - } - - // No update, return successfully. - if (tasks.empty()) - { - LOG_DEBUG(log, simpleInfo() << " Nothing to flush"); - return true; - } - - DeltaIndex::Updates delta_index_updates; - DeltaIndexPtr new_delta_index; - { - /// Write prepared data to disk. 
- for (auto & task : tasks) - { - if (!task.block_data) - continue; - IColumn::Permutation perm; - task.sorted = sortBlockByPk(getExtraHandleColumnDefine(context.is_common_handle), task.block_data, perm); - if (task.sorted) - delta_index_updates.emplace_back(task.deletes_offset, task.rows_offset, perm); - - task.data_page = ColumnFileTiny::writeColumnFileData(context, task.block_data, 0, task.block_data.rows(), wbs); - } - - wbs.writeLogAndData(); - } - - if (!delta_index_updates.empty()) - { - LOG_DEBUG(log, simpleInfo() << " Update index start"); - new_delta_index = cur_delta_index->cloneWithUpdates(delta_index_updates); - LOG_DEBUG(log, simpleInfo() << " Update index done"); - } - - { - /// If this instance is still valid, then commit. - std::scoped_lock lock(mutex); - - if (abandoned.load(std::memory_order_relaxed)) - { - // Delete written data. - wbs.setRollback(); - LOG_DEBUG(log, simpleInfo() << " Flush stop because abandoned"); - return false; - } - - ColumnFiles::iterator flush_start_point; - ColumnFiles::iterator flush_end_point; - - { - /// Do some checks before continue, in case other threads do some modifications during current operation, - /// as we didn't always hold the lock. - - auto p_it = column_files.begin(); - auto t_it = tasks.begin(); - for (; p_it != column_files.end(); ++p_it) - { - if (*p_it == t_it->pack) - break; - } - - flush_start_point = p_it; - - for (; t_it != tasks.end(); ++t_it, ++p_it) - { - if (p_it == column_files.end() || *p_it != t_it->pack || (*p_it)->isSaved()) - { - // The packs have been modified, or this pack already saved by another thread. - // Let's rollback and break up. - wbs.rollbackWrittenLogAndData(); - LOG_DEBUG(log, simpleInfo() << " Stop flush because structure got updated"); - return false; - } - } - - flush_end_point = p_it; - } - - /// Things look good, let's continue. - - // Create a temporary packs copy, used to generate serialized data. 
- // Save the previous saved packs, and the packs we are saving, and the later packs appended during the period we did not held the lock. - ColumnFiles column_files_copy(column_files.begin(), flush_start_point); - for (auto & task : tasks) - { - // Use a new column file instance to do the serializing. - ColumnFilePtr new_column_file; - if (auto * dp_block = task.pack->tryToInMemoryFile(); dp_block) - { - bool is_small_file = dp_block->getRows() < context.delta_small_pack_rows || dp_block->getBytes() < context.delta_small_pack_bytes; - if (is_small_file) - { - new_column_file = std::make_shared(dp_block->getSchema(), - dp_block->getRows(), - dp_block->getBytes(), - task.data_page, - !task.sorted ? dp_block->getCache() : std::make_shared(std::move(task.block_data))); - } - else - { - new_column_file = std::make_shared(dp_block->getSchema(), - dp_block->getRows(), - dp_block->getBytes(), - task.data_page, - nullptr); - } - } - else if (auto * t_file = task.pack->tryToTinyFile(); t_file) - { - new_column_file = std::make_shared(*t_file); - } - else if (auto * dp_file = task.pack->tryToBigFile(); dp_file) - { - new_column_file = std::make_shared(*dp_file); - } - else if (auto * dp_delete = task.pack->tryToDeleteRange(); dp_delete) - { - new_column_file = std::make_shared(*dp_delete); - } - else - { - throw Exception("Unexpected column file type", ErrorCodes::LOGICAL_ERROR); - } - - new_column_file->setSaved(); - - column_files_copy.push_back(new_column_file); - } - column_files_copy.insert(column_files_copy.end(), flush_end_point, column_files.end()); - - if constexpr (DM_RUN_CHECK) - { - size_t check_unsaved_rows = 0; - size_t check_unsaved_deletes = 0; - size_t total_rows = 0; - size_t total_deletes = 0; - for (auto & pack : column_files_copy) - { - if (!pack->isSaved()) - { - check_unsaved_rows += pack->getRows(); - check_unsaved_deletes += pack->isDeleteRange(); - } - total_rows += pack->getRows(); - total_deletes += pack->isDeleteRange(); - } - if 
(unlikely(check_unsaved_rows + flush_rows != unsaved_rows // - || check_unsaved_deletes + flush_deletes != unsaved_deletes // - || total_rows != rows // - || total_deletes != deletes)) - throw Exception("Rows and deletes check failed", ErrorCodes::LOGICAL_ERROR); - } - - /// Save the new metadata of packs to disk. - MemoryWriteBuffer buf(0, COLUMN_FILE_SERIALIZE_BUFFER_SIZE); - serializeSavedColumnFiles(buf, column_files_copy); - const auto data_size = buf.count(); - - wbs.meta.putPage(id, 0, buf.tryGetReadBuffer(), data_size); - wbs.writeMeta(); - - /// Commit updates in memory. - column_files.swap(column_files_copy); - - /// Update delta tree - if (new_delta_index) - delta_index = new_delta_index; - - unsaved_rows -= flush_rows; - unsaved_bytes -= flush_bytes; - unsaved_deletes -= flush_deletes; - - LOG_DEBUG(log, - simpleInfo() << " Flush end. Flushed " << tasks.size() << " packs, " << flush_rows << " rows and " << flush_deletes - << " deletes."); - } - - - ProfileEvents::increment(ProfileEvents::DMWriteBytes, flush_bytes); - - // Also update the write amplification - auto total_write = ProfileEvents::counters[ProfileEvents::DMWriteBytes].load(std::memory_order_relaxed); - auto actual_write = ProfileEvents::counters[ProfileEvents::PSMWriteBytes].load(std::memory_order_relaxed) - + ProfileEvents::counters[ProfileEvents::WriteBufferFromFileDescriptorWriteBytes].load(std::memory_order_relaxed) - + ProfileEvents::counters[ProfileEvents::WriteBufferAIOWriteBytes].load(std::memory_order_relaxed); - GET_METRIC(tiflash_storage_write_amplification) - .Set((double)(actual_write / 1024 / 1024) / (total_write / 1024 / 1024)); - - return true; -} - -} // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp new file mode 100644 index 00000000000..060ab47e9d5 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp @@ -0,0 +1,195 @@ +#include +#include +#include +#include +#include + 
+namespace DB +{ +namespace DM +{ +// FIXME: difference from previous implementation: cannot get lastSchema from saved column_files +BlockPtr MemTableSet::lastSchema() +{ + for (auto it = column_files.rbegin(); it != column_files.rend(); ++it) + { + if (auto * m_file = (*it)->tryToInMemoryFile(); m_file) + return m_file->getSchema(); + else if (auto * t_file = (*it)->tryToTinyFile(); t_file) + return t_file->getSchema(); + } + return {}; +} + +void MemTableSet::appendColumnFileInner(const ColumnFilePtr & column_file) +{ + auto last_schema = lastSchema(); + + if (auto m_file = column_file->tryToInMemoryFile(); m_file) + { + // If this pack's schema is identical to last_schema, then use the last_schema instance, + // so that we don't have to serialize my_schema instance. + auto my_schema = m_file->getSchema(); + if (last_schema && my_schema && last_schema != my_schema && isSameSchema(*my_schema, *last_schema)) + m_file->resetIdenticalSchema(last_schema); + } + + if (!column_files.empty()) + { + auto last_column_file = column_files.back(); + if (last_column_file->isInMemoryFile()) + last_column_file->tryToInMemoryFile()->disableAppend(); + } + + column_files.push_back(column_file); + + rows += column_file->getRows(); + bytes += column_file->getBytes(); + deletes += column_file->getDeletes(); +} + +void MemTableSet::appendColumnFile(const ColumnFilePtr & column_file) +{ + appendColumnFileInner(column_file); +} + +void MemTableSet::appendToCache(DMContext & context, const Block & block, size_t offset, size_t limit) +{ + // If the `column_files` is not empty, and the last `column_file` is a `ColumnInMemoryFile`, we will merge the newly block into the last `column_file`. + // Otherwise, create a new `ColumnInMemoryFile` and write into it. 
+ bool success = false; + size_t append_bytes = block.bytes(offset, limit); + if (!column_files.empty()) + { + auto & last_column_file = column_files.back(); + if (last_column_file->isAppendable()) + success = last_column_file->append(context, block, offset, limit, append_bytes); + } + + if (!success) + { + auto new_column_file = std::make_shared(block); + success = new_column_file->append(context, block, offset, limit, append_bytes); + if (unlikely(!success)) + throw Exception("Write to MemTableSet failed", ErrorCodes::LOGICAL_ERROR); + appendColumnFileInner(new_column_file); + } + // FIXME: update rows and bytes and etc. +} + +void MemTableSet::appendDeleteRange(const RowKeyRange & delete_range) +{ + auto f = std::make_shared(delete_range); + appendColumnFileInner(f); +} + +void MemTableSet::ingestColumnFiles(const RowKeyRange & range, const ColumnFiles & column_files_, bool clear_data_in_range) +{ + // Prepend a DeleteRange to clean data before applying packs + if (clear_data_in_range) + { + auto f = std::make_shared(range); + appendColumnFileInner(f); + } + + for (auto & f : column_files_) + { + appendColumnFileInner(f); + } +} + +ColumnFileSetSnapshotPtr MemTableSet::createSnapshot() +{ + auto snap = std::make_shared(nullptr); + snap->rows = rows; + snap->bytes = bytes; + snap->deletes = deletes; + snap->column_files.reserve(column_files.size()); + + size_t total_rows = 0; + size_t total_deletes = 0; + for (const auto & file : column_files) + { + // TODO: check the thread safety of ColumnFile + snap->column_files.push_back(file); + total_rows += file->getRows(); + total_deletes += file->getDeletes(); + } + + if (unlikely(total_rows != rows || total_deletes != deletes)) + throw Exception("Rows and deletes check failed!", ErrorCodes::LOGICAL_ERROR); + + return snap; +} + +FlushColumnFileTaskPtr MemTableSet::buildFlushTask(DMContext & context, size_t rows_offset, size_t deletes_offset, size_t flush_version) +{ + if (column_files.empty()) + return nullptr; + + // 
make the last column file not appendable + if (column_files.back()->isAppendable()) + column_files.back()->disableAppend(); + + size_t cur_rows_offset = rows_offset; + size_t cur_deletes_offset = deletes_offset; + size_t flush_rows = 0; + size_t flush_bytes = 0; + size_t flush_deletes = 0; + auto flush_task = std::make_shared(context, this->shared_from_this(), flush_version); + for (auto & column_file : column_files) + { + auto & task = flush_task->tasks.emplace_back(column_file); + if (auto * mfile = column_file->tryToInMemoryFile(); mfile) + { + task.rows_offset = cur_rows_offset; + task.deletes_offset = cur_deletes_offset; + task.block_data = mfile->readDataForFlush(); + } + flush_rows += column_file->getRows(); + flush_bytes += column_file->getBytes(); + flush_deletes += column_file->getDeletes(); + cur_rows_offset += column_file->getRows(); + cur_deletes_offset += column_file->getDeletes(); + } + if (unlikely(flush_rows != rows || flush_deletes != deletes)) + throw Exception("Rows and deletes check failed", ErrorCodes::LOGICAL_ERROR); + + return flush_task; +} + +void MemTableSet::removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush_task) +{ + auto & tasks = flush_task.tasks; + if (unlikely(tasks.size() > column_files.size())) + throw Exception("column_files num check failed", ErrorCodes::LOGICAL_ERROR); + + ColumnFiles new_column_files; + auto column_file_iter = column_files.begin(); + for (const auto & task : tasks) + { + if (unlikely(column_file_iter == column_files.end() || *column_file_iter != task.column_file)) + { + throw Exception("column_files check failed", ErrorCodes::LOGICAL_ERROR); + } + column_file_iter++; + } + size_t new_rows = 0; + size_t new_bytes = 0; + size_t new_deletes = 0; + while (column_file_iter != column_files.end()) + { + new_column_files.emplace_back(*column_file_iter); + new_rows += (*column_file_iter)->getRows(); + new_bytes += (*column_file_iter)->getBytes(); + new_deletes += (*column_file_iter)->getDeletes(); + } + 
column_files.swap(new_column_files); + rows = new_rows; + bytes = new_bytes; + deletes = new_deletes; +} + + +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h new file mode 100644 index 00000000000..cd12ea84a05 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h @@ -0,0 +1,84 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +namespace DM +{ +/// MemTableSet contains column file which data just resides in memory. +/// And the column files will be flushed periodically to ColumnFilePersistedSet. +/// +/// This class is not thread safe, manipulate on it requires acquire extra synchronization +class MemTableSet : public std::enable_shared_from_this + , private boost::noncopyable +{ +private: + ColumnFiles column_files; + + std::atomic rows = 0; + std::atomic bytes = 0; + std::atomic deletes = 0; + + Poco::Logger * log; + +private: + BlockPtr lastSchema(); + + void appendColumnFileInner(const ColumnFilePtr & column_file); + +public: + MemTableSet(const ColumnFiles & in_memory_files = {}) + : column_files(in_memory_files) + , log(&Poco::Logger::get("MemTableSet")) + { + for (const auto & file : column_files) + { + rows += file->getRows(); + bytes += file->getBytes(); + deletes += file->getDeletes(); + } + } + + String info() const + { + return fmt::format("MemTableSet: {} column files, {} rows, {} bytes, {} deletes", + column_files.size(), + rows.load(), + bytes.load(), + deletes.load()); + } + + ColumnFiles cloneColumnFiles() { return column_files; } + + size_t getColumnFileCount() const { return column_files.size(); } + size_t getRows() const { return rows; } + size_t getBytes() const { return bytes; } + size_t getDeletes() const { return deletes; } + + /// The following methods returning false means this operation failed, caused by other threads could have done + /// some updates on this instance. E.g. 
this instance have been abandoned. + /// Caller should try again from the beginning. + void appendColumnFile(const ColumnFilePtr & column_file); + + void appendToCache(DMContext & dm_context, const Block & block, size_t offset, size_t limit); + + void appendDeleteRange(const RowKeyRange & delete_range); + + void ingestColumnFiles(const RowKeyRange & range, const ColumnFiles & column_files_, bool clear_data_in_range); + + /// Create a constant snapshot for read. + /// Returns empty if this instance is abandoned, you should try again. + ColumnFileSetSnapshotPtr createSnapshot(); + + FlushColumnFileTaskPtr buildFlushTask(DMContext & context, size_t rows_offset, size_t deletes_offset, size_t flush_version); + + void removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush_task); +}; + +using MemTableSetPtr = std::shared_ptr; + +} // namespace DM +} // namespace DB \ No newline at end of file diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp new file mode 100644 index 00000000000..78c8502dc36 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp @@ -0,0 +1,64 @@ +#include "MinorCompaction.h" + +#include +#include +#include +#include +#include +#include + +namespace DB +{ +namespace DM +{ +MinorCompaction::MinorCompaction(size_t compaction_src_level_) + : compaction_src_level{compaction_src_level_} +{} + +void MinorCompaction::prepare(DMContext & context, WriteBatches & wbs, const PageReader & reader) +{ + for (auto & task : tasks) + { + if (task.is_trivial_move) + continue; + + auto & schema = *(task.to_compact[0]->tryToTinyFile()->getSchema()); + auto compact_columns = schema.cloneEmptyColumns(); + for (auto & file : task.to_compact) + { + auto * t_file = file->tryToTinyFile(); + if (unlikely(!t_file)) + throw Exception("The compact candidate is not a ColumnTinyFile", ErrorCodes::LOGICAL_ERROR); + + // We ensure schema of all packs are the same + Block block = 
t_file->readBlockForMinorCompaction(reader); + size_t block_rows = block.rows(); + for (size_t i = 0; i < schema.columns(); ++i) + { + compact_columns[i]->insertRangeFrom(*block.getByPosition(i).column, 0, block_rows); + } + + wbs.removed_log.delPage(t_file->getDataPageId()); + } + Block compact_block = schema.cloneWithColumns(std::move(compact_columns)); + auto compact_rows = compact_block.rows(); + auto compact_column_file = ColumnFileTiny::writeColumnFile(context, compact_block, 0, compact_rows, wbs, task.to_compact.front()->tryToTinyFile()->getSchema()); + wbs.writeLogAndData(); + task.result = compact_column_file; + + total_compact_files += task.to_compact.size(); + total_compact_rows += compact_rows; + } +} + +bool MinorCompaction::commit(WriteBatches & wbs) +{ + return persisted_file_set->installCompactionResults(shared_from_this(), wbs); +} + +String MinorCompaction::info() const +{ + return fmt::format("Compacted {} column files into {} column files, total {} rows.", total_compact_files, tasks.size(), total_compact_rows); +} +} // namespace DM +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h new file mode 100644 index 00000000000..35b4a2ffddc --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h @@ -0,0 +1,84 @@ +#pragma once + +#include +#include + +namespace DB +{ +namespace DM +{ +class ColumnFilePersistedSet; +using ColumnFilePersistedSetPtr = std::shared_ptr; +class MinorCompaction; +using MinorCompactionPtr = std::shared_ptr; + +class MinorCompaction : public std::enable_shared_from_this +{ + friend class ColumnFilePersistedSet; + +public: + struct Task + { + Task() = default; + + ColumnFilePersisteds to_compact; + size_t total_rows = 0; + size_t total_bytes = 0; + + bool is_trivial_move = false; + ColumnFilePersistedPtr result; + + void addColumnFile(const ColumnFilePersistedPtr & column_file) + { + total_rows += column_file->getRows(); + 
total_bytes += column_file->getBytes(); + to_compact.push_back(column_file); + } + }; + using Tasks = std::vector; + +private: + Tasks tasks; + + size_t compaction_src_level; + size_t current_compaction_version; + ColumnFilePersistedSetPtr persisted_file_set; + + size_t total_compact_files = 0; + size_t total_compact_rows = 0; + +public: + explicit MinorCompaction(size_t compaction_src_level_); + + // return whether this task is a trivial move + inline bool packUpTask(Task && task) + { + if (unlikely(task.to_compact.empty())) + throw Exception("task shouldn't be empty", ErrorCodes::LOGICAL_ERROR); + + bool is_trivial_move = false; + if (task.to_compact.size() == 1) + { + // Maybe this column file is small, but it cannot be merged with other packs, so also remove it's cache if possible. + for (auto & f : task.to_compact) + { + if (auto * t_file = f->tryToTinyFile(); t_file) + { + t_file->clearCache(); + } + } + is_trivial_move = true; + } + task.is_trivial_move = is_trivial_move; + tasks.push_back(std::move(task)); + return is_trivial_move; + } + + void prepare(DMContext & context, WriteBatches & wbs, const PageReader & reader); + + bool commit(WriteBatches & wbs); + + String info() const; +}; +} // namespace DM +} // namespace DB From 9e2d9321f007fd88ca2b249bd365c4e8b09bdd75 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Wed, 26 Jan 2022 13:50:04 +0800 Subject: [PATCH 03/23] seprate delta layer to two layers --- .../DeltaMerge/ColumnFile/ColumnFileBig.cpp | 6 +- .../DeltaMerge/ColumnFile/ColumnFileBig.h | 6 +- .../ColumnFile/ColumnFileDeleteRange.cpp | 2 +- .../ColumnFile/ColumnFileDeleteRange.h | 2 +- .../ColumnFile/ColumnFilePersisted.cpp | 4 +- .../ColumnFile/ColumnFileSetReader.cpp | 6 +- .../ColumnFile/ColumnFileSetSnapshot.cpp | 3 +- .../ColumnFile/ColumnFileSetSnapshot.h | 2 +- .../DeltaMerge/ColumnFile/ColumnFileTiny.cpp | 2 +- .../DeltaMerge/ColumnFile/ColumnFileTiny.h | 2 +- .../DeltaMerge/ColumnFile/ColumnFile_V3.cpp | 10 +- 
.../Delta/ColumnFilePersistedSet.cpp | 46 +- .../DeltaMerge/Delta/ColumnFilePersistedSet.h | 18 +- .../DeltaMerge/Delta/DeltaValueSpace.cpp | 419 +++++++----------- .../DeltaMerge/Delta/DeltaValueSpace.h | 205 ++++----- .../Storages/DeltaMerge/Delta/MemTableSet.cpp | 12 +- .../Storages/DeltaMerge/Delta/MemTableSet.h | 2 +- .../Storages/DeltaMerge/Delta/Snapshot.cpp | 287 ++---------- .../Storages/DeltaMerge/DeltaMergeStore.cpp | 18 +- dbms/src/Storages/DeltaMerge/Segment.cpp | 50 +-- 20 files changed, 371 insertions(+), 731 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp index e7ca6aa9d13..d8cfd75c5f9 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp @@ -40,9 +40,9 @@ void ColumnFileBig::serializeMetadata(WriteBuffer & buf, bool /*save_schema*/) c writeIntBinary(valid_bytes, buf); } -ColumnFilePtr ColumnFileBig::deserializeMetadata(DMContext & context, // - const RowKeyRange & segment_range, - ReadBuffer & buf) +ColumnFilePersistedPtr ColumnFileBig::deserializeMetadata(DMContext & context, // + const RowKeyRange & segment_range, + ReadBuffer & buf) { UInt64 file_ref_id; size_t valid_rows, valid_bytes; diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.h index 716a55e76de..64799e62d47 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.h @@ -71,9 +71,9 @@ class ColumnFileBig : public ColumnFilePersisted void serializeMetadata(WriteBuffer & buf, bool save_schema) const override; - static ColumnFilePtr deserializeMetadata(DMContext & context, // - const RowKeyRange & segment_range, - ReadBuffer & buf); + static ColumnFilePersistedPtr deserializeMetadata(DMContext & context, // + const RowKeyRange & segment_range, + ReadBuffer & buf); String 
toString() const override { diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.cpp index 8ffcd4f17dc..92ae9dc4915 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.cpp @@ -18,7 +18,7 @@ void ColumnFileDeleteRange::serializeMetadata(WriteBuffer & buf, bool /*save_sch delete_range.serialize(buf); } -ColumnFilePtr ColumnFileDeleteRange::deserializeMetadata(ReadBuffer & buf) +ColumnFilePersistedPtr ColumnFileDeleteRange::deserializeMetadata(ReadBuffer & buf) { return std::make_shared(RowKeyRange::deserialize(buf)); } diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.h index 87d9c38dc39..bb9005ed3bd 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileDeleteRange.h @@ -42,7 +42,7 @@ class ColumnFileDeleteRange : public ColumnFilePersisted void serializeMetadata(WriteBuffer & buf, bool save_schema) const override; - static ColumnFilePtr deserializeMetadata(ReadBuffer & buf); + static ColumnFilePersistedPtr deserializeMetadata(ReadBuffer & buf); String toString() const override { return "{delete_range:" + delete_range.toString() + "}"; } }; diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp index 98f0d889342..017583207c5 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp @@ -99,7 +99,7 @@ ColumnFilePersisteds deserializeSavedColumnFiles(DMContext & context, const RowK DeltaFormat::Version version; readIntBinary(version, buf); - ColumnFiles column_files; + ColumnFilePersisteds column_files; switch (version) { // V1 and V2 share the 
same deserializer. @@ -114,8 +114,6 @@ ColumnFilePersisteds deserializeSavedColumnFiles(DMContext & context, const RowK throw Exception("Unexpected delta value version: " + DB::toString(version) + ", latest version: " + DB::toString(DeltaFormat::V3), ErrorCodes::LOGICAL_ERROR); } - for (auto & f : column_files) - f->setSaved(); return column_files; } } // namespace DM diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp index 631e396ad80..9e24dcfd10d 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp @@ -226,10 +226,10 @@ bool ColumnFileSetReader::shouldPlace(const DMContext & context, size_t rows_start_in_pack = pack_index == start_pack_index ? rows_start_in_start_pack : 0; size_t rows_end_in_pack = column_file_rows[pack_index]; - auto & pack_reader = column_file_readers[pack_index]; + auto & column_file_reader = column_file_readers[pack_index]; if (column_file->isInMemoryFile()) { - auto & dpb_reader = typeid_cast(*pack_reader); + auto & dpb_reader = typeid_cast(*column_file_reader); auto pk_column = dpb_reader.getPKColumn(); auto version_column = dpb_reader.getVersionColumn(); @@ -244,7 +244,7 @@ bool ColumnFileSetReader::shouldPlace(const DMContext & context, } else { - auto & dpb_reader = typeid_cast(*pack_reader); + auto & dpb_reader = typeid_cast(*column_file_reader); auto pk_column = dpb_reader.getPKColumn(); auto version_column = dpb_reader.getVersionColumn(); diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp index 8fef83f9166..25df32745f7 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp @@ -1,4 +1,5 @@ #include +#include #include namespace DB @@ -17,4 +18,4 @@ RowKeyRange 
ColumnFileSetSnapshot::getSquashDeleteRange() const return squashed_delete_range; } } // namespace DM -} // namespace DB \ No newline at end of file +} // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.h index 979f6749b07..f7f6a44f3f5 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.h @@ -40,7 +40,7 @@ class ColumnFileSetSnapshot : public std::enable_shared_from_this ColumnFileTiny::deserializeMetadata(ReadBuffer & buf, const BlockPtr & last_schema) +std::tuple ColumnFileTiny::deserializeMetadata(ReadBuffer & buf, const BlockPtr & last_schema) { auto schema = deserializeSchema(buf); if (!schema) diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h index f4be243c049..8de379e1f5e 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.h @@ -98,7 +98,7 @@ class ColumnFileTiny : public ColumnFilePersisted static PageId writeColumnFileData(DMContext & context, const Block & block, size_t offset, size_t limit, WriteBatches & wbs); - static std::tuple deserializeMetadata(ReadBuffer & buf, const BlockPtr & last_schema); + static std::tuple deserializeMetadata(ReadBuffer & buf, const BlockPtr & last_schema); String toString() const override { diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V3.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V3.cpp index 69468c47261..d165c5b22a6 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V3.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V3.cpp @@ -9,15 +9,11 @@ namespace DM { void serializeSavedColumnFilesInV3Format(WriteBuffer & buf, const ColumnFilePersisteds & column_files) { - size_t saved_packs = std::find_if(column_files.begin(), 
column_files.end(), [](const ColumnFilePtr & p) { return !p->isSaved(); }) - column_files.begin(); - - writeIntBinary(saved_packs, buf); + writeIntBinary(column_files.size(), buf); BlockPtr last_schema; for (const auto & column_file : column_files) { - if (!column_file->isSaved()) - break; // Do not encode the schema if it is the same as previous one. writeIntBinary(column_file->getType(), buf); @@ -54,13 +50,13 @@ ColumnFilePersisteds deserializeSavedColumnFilesInV3Format(DMContext & context, { size_t column_file_count; readIntBinary(column_file_count, buf); - ColumnFiles column_files; + ColumnFilePersisteds column_files; BlockPtr last_schema; for (size_t i = 0; i < column_file_count; ++i) { std::underlying_type::type column_file_type; readIntBinary(column_file_type, buf); - ColumnFilePtr column_file; + ColumnFilePersistedPtr column_file; switch (column_file_type) { case ColumnFile::Type::DELETE_RANGE: diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index fbf47eeb404..a04ac82f8e2 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -13,10 +13,10 @@ namespace DB { namespace DM { -inline void serializeColumnStableFileLevels(WriteBatches & wbs, PageId id, const ColumnFilePersistedSet::ColumnStableFileLevels & file_levels) +inline void serializeColumnStableFileLevels(WriteBatches & wbs, PageId id, const ColumnFilePersistedSet::ColumnFilePersistedLevels & file_levels) { MemoryWriteBuffer buf(0, COLUMN_FILE_SERIALIZE_BUFFER_SIZE); - ColumnStableFiles column_files; + ColumnFilePersisteds column_files; for (const auto & level : file_levels) { for (const auto & file : level) @@ -24,7 +24,7 @@ inline void serializeColumnStableFileLevels(WriteBatches & wbs, PageId id, const column_files.emplace_back(file); } } - serializeColumnStableFiles(buf, column_files); + serializeSavedColumnFiles(buf, 
column_files); auto data_size = buf.count(); wbs.meta.putPage(id, 0, buf.tryGetReadBuffer(), data_size); } @@ -51,7 +51,27 @@ void ColumnFilePersistedSet::updateStats() deletes = new_deletes; } -ColumnFilePersistedSet::ColumnFilePersistedSet(PageId metadata_id_, const ColumnStableFiles & column_stable_files) +void ColumnFilePersistedSet::checkColumnFiles(const ColumnFiles & new_column_files) +{ + if constexpr (!DM_RUN_CHECK) + return; + size_t new_rows = 0; + size_t new_deletes = 0; + + for (const auto & file : new_column_files) + { + new_rows += file->getRows(); + new_deletes += file->isDeleteRange(); + } + if (unlikely(new_rows != rows || new_deletes != deletes)) + { + LOG_ERROR(log, + "Rows and deletes check failed. Current packs: " << columnFilesToString(new_column_files) << ", new packs: " << columnFilesToString(new_column_files)); + throw Exception("Rows and deletes check failed.", ErrorCodes::LOGICAL_ERROR); + } +} + +ColumnFilePersistedSet::ColumnFilePersistedSet(PageId metadata_id_, const ColumnFilePersisteds & column_stable_files) : metadata_id(metadata_id_) , log(&Poco::Logger::get("ColumnStableFileSet")) { @@ -61,11 +81,11 @@ ColumnFilePersistedSet::ColumnFilePersistedSet(PageId metadata_id_, const Column updateStats(); } -ColumnStableFileSetPtr ColumnFilePersistedSet::restore(DMContext & context, const RowKeyRange & segment_range, PageId id) +ColumnFilePersistedSetPtr ColumnFilePersistedSet::restore(DMContext & context, const RowKeyRange & segment_range, PageId id) { Page page = context.storage_pool.meta()->read(id, nullptr); ReadBufferFromMemory buf(page.data.begin(), page.data.size()); - auto column_files = deserializeColumnStableFiles(context, segment_range, buf); + auto column_files = deserializeSavedColumnFiles(context, segment_range, buf); return std::make_shared(id, column_files); } @@ -84,10 +104,10 @@ void ColumnFilePersistedSet::recordRemoveColumnFilesPages(WriteBatches & wbs) co } -ColumnStableFiles 
ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & context, - const RowKeyRange & target_range, - const ColumnFiles & head_column_files, - WriteBatches & wbs) const +ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & context, + const RowKeyRange & target_range, + const ColumnFiles & head_column_files, + WriteBatches & wbs) const { // We check in the direction from the last level to the first level. // In every level, we check from the begin to the last. @@ -125,7 +145,7 @@ ColumnStableFiles ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & cont throw Exception("Check head failed, unexpected size", ErrorCodes::LOGICAL_ERROR); } - ColumnStableFiles cloned_tail; + ColumnFilePersisteds cloned_tail; while (level_it != stable_files_levels.rend()) { if (it_2 == level_it->end()) @@ -234,7 +254,7 @@ bool ColumnFilePersistedSet::appendColumnStableFilesToLevel0(size_t prev_flush_v return false; } flush_version += 1; - ColumnStableFileLevels new_stable_files_levels; + ColumnFilePersistedLevels new_stable_files_levels; for (auto & level : stable_files_levels) { auto & new_level = new_stable_files_levels.emplace_back(); @@ -325,7 +345,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & LOG_WARNING(log, "Structure has been updated during compact"); return false; } - ColumnStableFileLevels new_stable_files_levels; + ColumnFilePersistedLevels new_stable_files_levels; for (size_t i = 0; i < compaction->compaction_src_level; i++) { auto & new_level = new_stable_files_levels.emplace_back(); diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h index bc5f43bf5e5..6d2fefb4ab7 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #include #include @@ -28,19 +28,19 @@ namespace 
DB namespace DM { class ColumnFilePersistedSet; -using ColumnStableFileSetPtr = std::shared_ptr; +using ColumnFilePersistedSetPtr = std::shared_ptr; /// This class is not thread safe, manipulate on it requires acquire extra synchronization class ColumnFilePersistedSet : public std::enable_shared_from_this , private boost::noncopyable { public: - using ColumnStableFileLevel = ColumnStableFiles; - using ColumnStableFileLevels = std::vector; + using ColumnFilePersistedLevel = ColumnFilePersisteds; + using ColumnFilePersistedLevels = std::vector; private: PageId metadata_id; - ColumnStableFileLevels stable_files_levels; + ColumnFilePersistedLevels stable_files_levels; std::atomic stable_files_count; std::atomic rows = 0; @@ -57,12 +57,14 @@ class ColumnFilePersistedSet : public std::enable_shared_from_thistryToInMemoryFile(); m_file) - return m_file->getSchema(); - else if (auto * t_file = (*it)->tryToTinyFile(); t_file) - return t_file->getSchema(); - } - return {}; -} - -void DeltaValueSpace::checkColumnFiles(const ColumnFiles & new_column_files) -{ - if constexpr (!DM_RUN_CHECK) - return; - size_t new_rows = 0; - size_t new_deletes = 0; - - bool seen_unsaved = false; - bool ok = true; - for (const auto & file : new_column_files) - { - if (file->isSaved() && seen_unsaved) - { - ok = false; - break; - } - seen_unsaved |= !file->isSaved(); - - new_rows += file->getRows(); - new_deletes += file->isDeleteRange(); - } - if (unlikely(!ok || new_rows != rows || new_deletes != deletes)) - { - LOG_ERROR(log, - "Rows and deletes check failed. 
Current packs: " << columnFilesToString(column_files) << ", new packs: " << columnFilesToString(column_files)); - throw Exception("Rows and deletes check failed.", ErrorCodes::LOGICAL_ERROR); - } -} - // ================================================ // Public methods // ================================================ +DeltaValueSpace::DeltaValueSpace(PageId id_, const ColumnFilePersisteds & persisted_files, const ColumnFiles & in_memory_files) + : persisted_file_set(std::make_shared(id_, persisted_files)) + , mem_table_set(std::make_shared(in_memory_files)) + , delta_index(std::make_shared()) + , log(&Poco::Logger::get("DeltaValueSpace")) +{} -DeltaValueSpace::DeltaValueSpace(PageId id_, const ColumnFiles & column_files_) - : id(id_) - , column_files(column_files_) +DeltaValueSpace::DeltaValueSpace(ColumnFilePersistedSetPtr && persisted_file_set_) + : persisted_file_set(std::move(persisted_file_set_)) + , mem_table_set(std::make_shared()) , delta_index(std::make_shared()) , log(&Poco::Logger::get("DeltaValueSpace")) -{ - for (auto & file : column_files) - { - rows += file->getRows(); - bytes += file->getBytes(); - deletes += file->isDeleteRange(); - if (!file->isSaved()) - { - unsaved_rows += file->getRows(); - unsaved_bytes += file->getBytes(); - unsaved_deletes += file->isDeleteRange(); - } - } -} +{} void DeltaValueSpace::abandon(DMContext & context) { @@ -94,197 +42,47 @@ void DeltaValueSpace::abandon(DMContext & context) DeltaValueSpacePtr DeltaValueSpace::restore(DMContext & context, const RowKeyRange & segment_range, PageId id) { - Page page = context.storage_pool.meta()->read(id, nullptr); - ReadBufferFromMemory buf(page.data.begin(), page.data.size()); - auto column_files = deserializeSavedColumnFiles(context, segment_range, buf); - return std::make_shared(id, column_files); + auto persisted_file_set_ = ColumnFilePersistedSet::restore(context, segment_range, id); + return std::make_shared(std::move(persisted_file_set_)); } void 
DeltaValueSpace::saveMeta(WriteBatches & wbs) const { - MemoryWriteBuffer buf(0, COLUMN_FILE_SERIALIZE_BUFFER_SIZE); - // Only serialize saved packs. - serializeSavedColumnFiles(buf, column_files); - auto data_size = buf.count(); - wbs.meta.putPage(id, 0, buf.tryGetReadBuffer(), data_size); + persisted_file_set->saveMeta(wbs); } -ColumnFiles DeltaValueSpace::checkHeadAndCloneTail(DMContext & context, - const RowKeyRange & target_range, - const ColumnFiles & head_column_files, - WriteBatches & wbs) const +std::pair +DeltaValueSpace::checkHeadAndCloneTail(DMContext & context, + const RowKeyRange & target_range, + const ColumnFiles & head_column_files, + WriteBatches & wbs) const { - if (head_column_files.size() > column_files.size()) - { - LOG_ERROR(log, - info() << ", Delta Check head packs failed, unexpected size. head_packs: " << columnFilesToString(head_column_files) - << ", packs: " << columnFilesToString(column_files)); - throw Exception("Check head packs failed, unexpected size", ErrorCodes::LOGICAL_ERROR); - } - - auto it_1 = head_column_files.begin(); - auto it_2 = column_files.begin(); - for (; it_1 != head_column_files.end() && it_2 != column_files.end(); ++it_1, ++it_2) - { - if ((*it_1)->getId() != (*it_2)->getId() || (*it_1)->getRows() != (*it_2)->getRows()) - { - LOG_ERROR(log, - simpleInfo() << ", Delta Check head packs failed, unexpected size. head_packs: " << columnFilesToString(head_column_files) - << ", packs: " << columnFilesToString(column_files)); - throw Exception("Check head packs failed", ErrorCodes::LOGICAL_ERROR); - } - } - - ColumnFiles cloned_tail; - for (; it_2 != column_files.end(); ++it_2) - { - const auto & column_file = *it_2; - if (auto * dr = column_file->tryToDeleteRange(); dr) - { - auto new_dr = dr->getDeleteRange().shrink(target_range); - if (!new_dr.none()) - { - // Only use the available delete_range pack. 
- cloned_tail.push_back(dr->cloneWith(new_dr)); - } - } - else if (auto * b = column_file->tryToInMemoryFile(); b) - { - auto new_column_file = b->clone(); - - // No matter or what, don't append to packs which cloned from old packs again. - // Because they could shared the same cache. And the cache can NOT be inserted from different packs in different delta. - new_column_file->disableAppend(); - cloned_tail.push_back(new_column_file); - } - else if (auto * t = column_file->tryToTinyFile(); t) - { - // Use a newly created page_id to reference the data page_id of current pack. - PageId new_data_page_id = context.storage_pool.newLogPageId(); - wbs.log.putRefPage(new_data_page_id, t->getDataPageId()); - auto new_column_file = t->cloneWith(new_data_page_id); - - cloned_tail.push_back(new_column_file); - } - else if (auto * f = column_file->tryToBigFile(); f) - { - auto delegator = context.path_pool.getStableDiskDelegator(); - auto new_ref_id = context.storage_pool.newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); - auto file_id = f->getFile()->fileId(); - wbs.data.putRefPage(new_ref_id, file_id); - auto file_parent_path = delegator.getDTFilePath(file_id); - auto new_file = DMFile::restore(context.db_context.getFileProvider(), file_id, /* ref_id= */ new_ref_id, file_parent_path, DMFile::ReadMetaMode::all()); - - auto new_column_file = f->cloneWith(context, new_file, target_range); - cloned_tail.push_back(new_column_file); - } - } - - return cloned_tail; + auto tail_persisted_files = persisted_file_set->checkHeadAndCloneTail(context, target_range, head_column_files, wbs); + auto memory_files = mem_table_set->cloneColumnFiles(); + return std::make_pair(std::move(tail_persisted_files), std::move(memory_files)); } size_t DeltaValueSpace::getTotalCacheRows() const { std::scoped_lock lock(mutex); - size_t cache_rows = 0; - for (const auto & pack : column_files) - { - if (auto * p = pack->tryToInMemoryFile(); p) - { - if (auto && c = p->getCache(); c) - cache_rows += 
c->block.rows(); - } - else if (auto * t = pack->tryToTinyFile(); t) - { - if (auto && c = t->getCache(); c) - cache_rows += c->block.rows(); - } - } - return cache_rows; + return mem_table_set->getRows() + persisted_file_set->getTotalCacheRows(); } size_t DeltaValueSpace::getTotalCacheBytes() const { std::scoped_lock lock(mutex); - size_t cache_bytes = 0; - for (auto & pack : column_files) - { - if (auto p = pack->tryToInMemoryFile(); p) - { - if (auto && c = p->getCache(); c) - cache_bytes += c->block.allocatedBytes(); - } - else if (auto * t = pack->tryToTinyFile(); t) - { - if (auto && c = t->getCache(); c) - cache_bytes += c->block.allocatedBytes(); - } - } - return cache_bytes; + return mem_table_set->getBytes() + persisted_file_set->getTotalCacheBytes(); } size_t DeltaValueSpace::getValidCacheRows() const { std::scoped_lock lock(mutex); - size_t cache_rows = 0; - for (auto & pack : column_files) - { - if (auto p = pack->tryToInMemoryFile(); p) - { - cache_rows += pack->getRows(); - } - else if (auto * t = pack->tryToTinyFile(); t) - { - if (auto && c = t->getCache(); c) - cache_rows += c->block.rows(); - } - } - return cache_rows; + return mem_table_set->getRows() + persisted_file_set->getValidCacheRows(); } -void DeltaValueSpace::recordRemovePacksPages(WriteBatches & wbs) const +void DeltaValueSpace::recordRemoveColumnFilesPages(WriteBatches & wbs) const { - for (auto & pack : column_files) - pack->removeData(wbs); -} - -void DeltaValueSpace::appendColumnFileInner(const ColumnFilePtr & column_file) -{ - auto last_schema = lastSchema(); - - if (auto * m_file = column_file->tryToInMemoryFile(); m_file) - { - // If this pack's schema is identical to last_schema, then use the last_schema instance, - // so that we don't have to serialize my_schema instance. 
- auto my_schema = m_file->getSchema(); - if (last_schema && my_schema && last_schema != my_schema && isSameSchema(*my_schema, *last_schema)) - m_file->resetIdenticalSchema(last_schema); - } - else if (auto * t_file = column_file->tryToTinyFile(); t_file) - { - // If this pack's schema is identical to last_schema, then use the last_schema instance, - // so that we don't have to serialize my_schema instance. - auto my_schema = t_file->getSchema(); - if (last_schema && my_schema && last_schema != my_schema && isSameSchema(*my_schema, *last_schema)) - t_file->resetIdenticalSchema(last_schema); - } - - if (!column_files.empty()) - { - auto last_pack = column_files.back(); - if (last_pack->isAppendable()) - last_pack->disableAppend(); - } - - column_files.push_back(column_file); - - rows += column_file->getRows(); - bytes += column_file->getBytes(); - deletes += column_file->getDeletes(); - - unsaved_rows += column_file->getRows(); - unsaved_bytes += column_file->getBytes(); - unsaved_deletes += column_file->getDeletes(); + persisted_file_set->recordRemoveColumnFilesPages(wbs); } bool DeltaValueSpace::appendColumnFile(DMContext & /*context*/, const ColumnFilePtr & column_file) @@ -293,8 +91,7 @@ bool DeltaValueSpace::appendColumnFile(DMContext & /*context*/, const ColumnFile if (abandoned.load(std::memory_order_relaxed)) return false; - appendColumnFileInner(column_file); - + mem_table_set->appendColumnFile(column_file); return true; } @@ -304,34 +101,7 @@ bool DeltaValueSpace::appendToCache(DMContext & context, const Block & block, si if (abandoned.load(std::memory_order_relaxed)) return false; - // If the `column_files` is not empty, and the last `column_file` is a `ColumnInMemoryFile`, we will merge the newly block into the last `column_file`. - // Otherwise, create a new `ColumnInMemoryFile` and write into it. 
- bool success = false; - size_t append_bytes = block.bytes(offset, limit); - if (!column_files.empty()) - { - auto & last_column_file = column_files.back(); - if (last_column_file->isAppendable()) - success = last_column_file->append(context, block, offset, limit, append_bytes); - } - - if (!success) - { - // Create a new pack. - auto last_schema = lastSchema(); - auto my_schema = (last_schema && isSameSchema(block, *last_schema)) ? last_schema : std::make_shared(block.cloneEmpty()); - auto new_column_file = std::make_shared(my_schema); - appendColumnFileInner(new_column_file); - success = new_column_file->append(context, block, offset, limit, append_bytes); - if (unlikely(!success)) - throw Exception("Write to MemTableSet failed", ErrorCodes::LOGICAL_ERROR); - } - - rows += limit; - bytes += append_bytes; - unsaved_rows += limit; - unsaved_bytes += append_bytes; - + mem_table_set->appendToCache(context, block, offset, limit); return true; } @@ -341,32 +111,145 @@ bool DeltaValueSpace::appendDeleteRange(DMContext & /*context*/, const RowKeyRan if (abandoned.load(std::memory_order_relaxed)) return false; - auto p = std::make_shared(delete_range); - appendColumnFileInner(p); - + mem_table_set->appendDeleteRange(delete_range); return true; } -bool DeltaValueSpace::ingestColumnFiles(DMContext & /*context*/, const RowKeyRange & range, const ColumnFiles & packs, bool clear_data_in_range) +bool DeltaValueSpace::ingestColumnFiles(DMContext & /*context*/, const RowKeyRange & range, const ColumnFiles & column_files, bool clear_data_in_range) { std::scoped_lock lock(mutex); if (abandoned.load(std::memory_order_relaxed)) return false; - // Prepend a DeleteRange to clean data before applying packs - if (clear_data_in_range) + mem_table_set->ingestColumnFiles(range, column_files, clear_data_in_range); + return true; +} + +bool DeltaValueSpace::flush(DMContext & context) +{ + LOG_DEBUG(log, info() << ", Flush start"); + + /// We have two types of data needed to flush to disk: + 
/// 1. The cache data in DeltaPackBlock + /// 2. The serialized metadata of packs in DeltaValueSpace + + ColumnFileFlushTaskPtr flush_task; + WriteBatches wbs(context.storage_pool, context.getWriteLimiter()); + DeltaIndexPtr cur_delta_index; + { + /// Prepare data which will be written to disk. + std::scoped_lock lock(mutex); + if (abandoned.load(std::memory_order_relaxed)) + { + LOG_DEBUG(log, simpleInfo() << "Flush stop because abandoned"); + return false; + } + flush_task = mem_table_set->buildFlushTask(context, persisted_file_set->getRows(), persisted_file_set->getDeletes(), persisted_file_set->getCurrentFlushVersion()); + cur_delta_index = delta_index; + } + + // No update, return successfully. + if (!flush_task) { - auto p = std::make_shared(range); - appendColumnFileInner(p); + LOG_DEBUG(log, simpleInfo() << " Nothing to flush"); + return true; } - for (auto & p : packs) + /// Write prepared data to disk. + auto delta_index_updates = flush_task->prepare(wbs); + DeltaIndexPtr new_delta_index; + if (!delta_index_updates.empty()) { - appendColumnFileInner(p); + LOG_DEBUG(log, simpleInfo() << " Update index start"); + new_delta_index = cur_delta_index->cloneWithUpdates(delta_index_updates); + LOG_DEBUG(log, simpleInfo() << " Update index done"); } + { + /// If this instance is still valid, then commit. + std::scoped_lock lock(mutex); + if (abandoned.load(std::memory_order_relaxed)) + { + // Delete written data. + wbs.setRollback(); + LOG_DEBUG(log, simpleInfo() << " Flush stop because abandoned"); + return false; + } + + if (!flush_task->commit(persisted_file_set, wbs)) + { + wbs.rollbackWrittenLogAndData(); + LOG_DEBUG(log, simpleInfo() << " Stop flush because structure got updated"); + } + + /// Update delta tree + if (new_delta_index) + delta_index = new_delta_index; + } return true; } +bool DeltaValueSpace::compact(DMContext & context) +{ + bool v = false; + // Other thread is doing structure update, just return. 
+ if (!is_updating.compare_exchange_strong(v, true)) + { + LOG_DEBUG(log, simpleInfo() << " Compact stop because updating"); + return true; + } + SCOPE_EXIT({ + bool v = true; + if (!is_updating.compare_exchange_strong(v, false)) + throw Exception(simpleInfo() + " is expected to be updating", ErrorCodes::LOGICAL_ERROR); + }); + + MinorCompactionPtr compaction_task; + PageStorage::SnapshotPtr log_storage_snap; + { + std::scoped_lock lock(mutex); + if (abandoned.load(std::memory_order_relaxed)) + { + LOG_DEBUG(log, simpleInfo() << " Compact stop because abandoned"); + return false; + } + compaction_task = persisted_file_set->pickUpMinorCompaction(context); + if (!compaction_task) + { + LOG_DEBUG(log, simpleInfo() << " Nothing to compact"); + return true; + } + log_storage_snap = context.storage_pool.log()->getSnapshot(); + } + + // do compaction task + WriteBatches wbs(context.storage_pool, context.getWriteLimiter()); + PageReader reader(context.storage_pool.log(), std::move(log_storage_snap), context.getReadLimiter()); + compaction_task->prepare(context, wbs, reader); + + { + std::scoped_lock lock(mutex); + + /// Check before commit. 
+ if (abandoned.load(std::memory_order_relaxed)) + { + wbs.rollbackWrittenLogAndData(); + LOG_DEBUG(log, simpleInfo() << " Stop compact because abandoned"); + return false; + } + if (!compaction_task->commit(wbs)) + { + LOG_WARNING(log, "Structure has been updated during compact"); + wbs.rollbackWrittenLogAndData(); + LOG_DEBUG(log, simpleInfo() << " Compact stop because structure got updated"); + return false; + } + + LOG_DEBUG(log, simpleInfo() << compaction_task->info()); + } + wbs.writeRemoves(); + + return true; +} } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h index cb8e34b5770..15acebf314e 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include #include @@ -39,52 +41,20 @@ struct DMContext; struct WriteBatches; class StoragePool; -static std::atomic_uint64_t NEXT_PACK_ID{0}; - -class BlockOrDelete -{ -private: - Block block; - size_t block_offset; - - RowKeyRange delete_range; - -public: - BlockOrDelete(Block && block_, size_t offset_) - : block(block_) - , block_offset(offset_) - {} - BlockOrDelete(const RowKeyRange & delete_range_) - : delete_range(delete_range_) - {} - - bool isBlock() { return (bool)block; } - auto & getBlock() { return block; }; - auto getBlockOffset() { return block_offset; } - auto & getDeleteRange() { return delete_range; } -}; - -using BlockOrDeletes = std::vector; - class DeltaValueSpace : public std::enable_shared_from_this , private boost::noncopyable { - friend class DeltaValueSpaceSnapshot; - public: using Lock = std::unique_lock; private: - PageId id; - ColumnFiles column_files; - - std::atomic rows = 0; - std::atomic bytes = 0; - std::atomic deletes = 0; - - std::atomic unsaved_rows = 0; - std::atomic unsaved_bytes = 0; - std::atomic unsaved_deletes = 0; + /// column 
files in `stable_file_set` are all persisted in disks and can be restored after restart. + /// column files in `mem_table_set` just resides in memory. + /// + /// Note that `persisted_file_set` and `mem_table_set` also forms a one-dimensional space + /// Specifically, files in `persisted_file_set` precedes files in `mem_table_set`. + ColumnFilePersistedSetPtr persisted_file_set; + MemTableSetPtr mem_table_set; /// This instance has been abandoned. Like after merge delta, split/merge. std::atomic_bool abandoned = false; @@ -108,26 +78,19 @@ class DeltaValueSpace : public std::enable_shared_from_this Poco::Logger * log; -private: - BlockPtr lastSchema(); - - void checkColumnFiles(const ColumnFiles & new_column_files); - - void appendColumnFileInner(const ColumnFilePtr & column_file); - public: - DeltaValueSpace(PageId id_, const ColumnFiles & column_files_ = {}); + explicit DeltaValueSpace(PageId id_, const ColumnFilePersisteds & persisted_files = {}, const ColumnFiles & in_memory_files = {}); + + explicit DeltaValueSpace(ColumnFilePersistedSetPtr && persisted_file_set_); /// Restore the metadata of this instance. /// Only called after reboot. 
static DeltaValueSpacePtr restore(DMContext & context, const RowKeyRange & segment_range, PageId id); - String simpleInfo() const { return "Delta [" + DB::toString(id) + "]"; } + String simpleInfo() const { return "Delta [" + DB::toString(persisted_file_set->getId()) + "]"; } String info() const { - return "{Delta [" + DB::toString(id) + "]: " + DB::toString(column_files.size()) + " column files, " + DB::toString(rows.load()) + " rows, " - + DB::toString(unsaved_rows.load()) + " unsaved_rows, " + DB::toString(unsaved_bytes.load()) + " unsaved_bytes, " - + DB::toString(deletes.load()) + " deletes, " + DB::toString(unsaved_deletes.load()) + " unsaved_deletes}"; + return fmt::format("{}, {}", mem_table_set->info(), persisted_file_set->info()); } bool getLock(Lock & lock) const @@ -146,26 +109,32 @@ class DeltaValueSpace : public std::enable_shared_from_this void saveMeta(WriteBatches & wbs) const; - void recordRemovePacksPages(WriteBatches & wbs) const; + void recordRemoveColumnFilesPages(WriteBatches & wbs) const; - /// First check whether 'head_packs' is exactly the head of packs in this instance. - /// If yes, then clone the tail of packs, using ref pages. + /// First check whether 'head_column_files' is exactly the head of column files in this instance. + /// If yes, then clone the tail of column files, using ref pages. /// Otherwise, throw an exception. /// /// Note that this method is expected to be called by some one who already have lock on this instance. - ColumnFiles + /// And the `head_column_files` must just reside in `persisted_file_set`. + std::pair checkHeadAndCloneTail(DMContext & context, const RowKeyRange & target_range, const ColumnFiles & head_column_files, WriteBatches & wbs) const; - PageId getId() const { return id; } + PageId getId() const { return persisted_file_set->getId(); } - size_t getColumnFileCount() const { return column_files.size(); } - size_t getRows(bool use_unsaved = true) const { return use_unsaved ? 
rows.load() : rows - unsaved_rows; } - size_t getBytes(bool use_unsaved = true) const { return use_unsaved ? bytes.load() : bytes - unsaved_bytes; } - size_t getDeletes() const { return deletes; } + size_t getColumnFileCount() const { return persisted_file_set->getColumnFileCount() + mem_table_set->getColumnFileCount(); } + size_t getRows(bool use_unsaved = true) const + { + return use_unsaved ? persisted_file_set->getRows() + mem_table_set->getRows() : persisted_file_set->getRows(); + } + size_t getBytes(bool use_unsaved = true) const + { + return use_unsaved ? persisted_file_set->getBytes() + mem_table_set->getBytes() : persisted_file_set->getBytes(); + } + size_t getDeletes() const { return persisted_file_set->getDeletes() + mem_table_set->getDeletes(); } - size_t getUnsavedRows() const { return unsaved_rows; } - size_t getUnsavedBytes() const { return unsaved_bytes; } - size_t getUnsavedDeletes() const { return unsaved_deletes; } + size_t getUnsavedRows() const { return mem_table_set->getRows(); } + size_t getUnsavedBytes() const { return mem_table_set->getBytes(); } size_t getTotalCacheRows() const; size_t getTotalCacheBytes() const; @@ -241,7 +210,7 @@ class DeltaValueSpace : public std::enable_shared_from_this bool appendDeleteRange(DMContext & context, const RowKeyRange & delete_range); - bool ingestColumnFiles(DMContext & context, const RowKeyRange & range, const ColumnFiles & packs, bool clear_data_in_range); + bool ingestColumnFiles(DMContext & context, const RowKeyRange & range, const ColumnFiles & column_files, bool clear_data_in_range); /// Flush the data of packs which haven't write to disk yet, and also save the metadata of packs. 
bool flush(DMContext & context); @@ -259,6 +228,8 @@ class DeltaValueSnapshot : public std::enable_shared_from_this(type); c->is_update = is_update; c->shared_delta_index = shared_delta_index; - c->storage_snap = storage_snap; - c->column_files = column_files; - c->rows = rows; - c->bytes = bytes; - c->deletes = deletes; - c->is_common_handle = is_common_handle; - c->rowkey_column_size = rowkey_column_size; + c->mem_table_snap = mem_table_snap->clone(); + c->persisted_files_snap = persisted_files_snap->clone(); c->_delta = _delta; return c; } - DeltaValueSnapshot(CurrentMetrics::Metric type_) + explicit DeltaValueSnapshot(CurrentMetrics::Metric type_) { type = type_; CurrentMetrics::add(type); @@ -316,16 +276,27 @@ class DeltaValueSnapshot : public std::enable_shared_from_thisgetColumnFiles(); + } + + size_t getColumnFileCount() const { return (mem_table_snap ? mem_table_snap->getColumnFileCount() : 0) + persisted_files_snap->getColumnFileCount(); } + size_t getRows() const { return (mem_table_snap ? mem_table_snap->getRows() : 0) + persisted_files_snap->getRows(); } + size_t getBytes() const { return (mem_table_snap ? mem_table_snap->getBytes() : 0) + persisted_files_snap->getBytes(); } + size_t getDeletes() const { return (mem_table_snap ? 
mem_table_snap->getDeletes() : 0) + persisted_files_snap->getDeletes(); } - size_t getPackCount() const { return column_files.size(); } - size_t getRows() const { return rows; } - size_t getBytes() const { return bytes; } - size_t getDeletes() const { return deletes; } + size_t getMemTableSetRowsOffset() const { return persisted_files_snap->getRows(); } + size_t getMemTableSetDeletesOffset() const { return persisted_files_snap->getDeletes(); } RowKeyRange getSquashDeleteRange() const; - const auto & getStorageSnapshot() { return storage_snap; } + const auto & getStorageSnapshot() { return persisted_files_snap->getStorageSnapshot(); } const auto & getSharedDeltaIndex() { return shared_delta_index; } }; @@ -339,22 +310,17 @@ class DeltaValueReader // We only keep this member here to prevent it from being released. DeltaIndexCompactedPtr _compacted_delta_index; + ColumnFileSetReaderPtr mem_table_reader; + + ColumnFileSetReaderPtr persisted_files_reader; + // The columns expected to read. Note that we will do reading exactly in this column order. ColumnDefinesPtr col_defs; RowKeyRange segment_range; - // The row count of each pack. Cache here to speed up checking. - std::vector pack_rows; - // The cumulative rows of packs. Used to fast locate specific packs according to rows offset by binary search. - std::vector pack_rows_end; - - std::vector pack_readers; - private: DeltaValueReader() = default; - Block readPKVersion(size_t offset, size_t limit); - public: DeltaValueReader(const DMContext & context_, const DeltaSnapshotPtr & delta_snap_, @@ -362,7 +328,7 @@ class DeltaValueReader const RowKeyRange & segment_range_); // If we need to read columns besides pk and version, a DeltaValueReader can NOT be used more than once. - // This method create a new reader based on then current one. It will reuse some cachees in the current reader. + // This method create a new reader based on then current one. It will reuse some caches in the current reader. 
DeltaValueReaderPtr createNewReader(const ColumnDefinesPtr & new_col_defs); void setDeltaIndex(const DeltaIndexCompactedPtr & delta_index_) { _compacted_delta_index = delta_index_; } @@ -388,51 +354,36 @@ class DeltaValueReader class DeltaValueInputStream : public IBlockInputStream { private: - DeltaValueReader reader; - ColumnFiles & packs; - size_t pack_count; + ColumnFileSetInputStream mem_table_input_stream; + ColumnFileSetInputStream persisted_files_input_stream; - ColumnFileReaderPtr cur_pack_reader = {}; - size_t next_pack_index = 0; + bool persisted_files_done = false; public: DeltaValueInputStream(const DMContext & context_, const DeltaSnapshotPtr & delta_snap_, const ColumnDefinesPtr & col_defs_, const RowKeyRange & segment_range_) - : reader(context_, delta_snap_, col_defs_, segment_range_) - , packs(reader.delta_snap->getColumnFiles()) - , pack_count(packs.size()) - { - } + : mem_table_input_stream(context_, delta_snap_->mem_table_snap, col_defs_, segment_range_) + , persisted_files_input_stream(context_, delta_snap_->persisted_files_snap, col_defs_, segment_range_) + {} String getName() const override { return "DeltaValue"; } - Block getHeader() const override { return toEmptyBlock(*(reader.col_defs)); } + Block getHeader() const override { return persisted_files_input_stream.getHeader(); } Block read() override { - while (cur_pack_reader || next_pack_index < pack_count) + if (persisted_files_done) + return mem_table_input_stream.read(); + + Block block = persisted_files_input_stream.read(); + if (block) + return block; + else { - if (!cur_pack_reader) - { - if (packs[next_pack_index]->isDeleteRange()) - { - ++next_pack_index; - continue; - } - else - { - cur_pack_reader = reader.pack_readers[next_pack_index]; - ++next_pack_index; - } - } - Block block = cur_pack_reader->readNextBlock(); - if (block) - return block; - else - cur_pack_reader = {}; + persisted_files_done = true; + return mem_table_input_stream.read(); } - return {}; } }; diff --git 
a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp index 060ab47e9d5..04226cfce4a 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp @@ -68,13 +68,17 @@ void MemTableSet::appendToCache(DMContext & context, const Block & block, size_t if (!success) { - auto new_column_file = std::make_shared(block); + // Create a new column file. + auto last_schema = lastSchema(); + auto my_schema = (last_schema && isSameSchema(block, *last_schema)) ? last_schema : std::make_shared(block.cloneEmpty()); + auto new_column_file = std::make_shared(my_schema); + appendColumnFileInner(new_column_file); success = new_column_file->append(context, block, offset, limit, append_bytes); if (unlikely(!success)) throw Exception("Write to MemTableSet failed", ErrorCodes::LOGICAL_ERROR); - appendColumnFileInner(new_column_file); } - // FIXME: update rows and bytes and etc. + rows += limit; + bytes += append_bytes; } void MemTableSet::appendDeleteRange(const RowKeyRange & delete_range) @@ -122,7 +126,7 @@ ColumnFileSetSnapshotPtr MemTableSet::createSnapshot() return snap; } -FlushColumnFileTaskPtr MemTableSet::buildFlushTask(DMContext & context, size_t rows_offset, size_t deletes_offset, size_t flush_version) +ColumnFileFlushTaskPtr MemTableSet::buildFlushTask(DMContext & context, size_t rows_offset, size_t deletes_offset, size_t flush_version) { if (column_files.empty()) return nullptr; diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h index cd12ea84a05..8ea60af69ea 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h @@ -73,7 +73,7 @@ class MemTableSet : public std::enable_shared_from_this /// Returns empty if this instance is abandoned, you should try again. 
ColumnFileSetSnapshotPtr createSnapshot(); - FlushColumnFileTaskPtr buildFlushTask(DMContext & context, size_t rows_offset, size_t deletes_offset, size_t flush_version); + ColumnFileFlushTaskPtr buildFlushTask(DMContext & context, size_t rows_offset, size_t deletes_offset, size_t flush_version); void removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush_task); }; diff --git a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp index 1c876b307a3..bdc82dbd204 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp @@ -7,53 +7,6 @@ namespace DB::DM { -std::pair findColumnFile(const ColumnFiles & packs, size_t rows_offset, size_t deletes_offset) -{ - size_t rows_count = 0; - size_t deletes_count = 0; - size_t pack_index = 0; - for (; pack_index < packs.size(); ++pack_index) - { - if (rows_count == rows_offset && deletes_count == deletes_offset) - return {pack_index, 0}; - const auto & pack = packs[pack_index]; - - if (pack->isDeleteRange()) - { - if (deletes_count == deletes_offset) - { - if (unlikely(rows_count != rows_offset)) - throw Exception("rows_count and rows_offset are expected to be equal. pack_index: " + DB::toString(pack_index) - + ", pack_size: " + DB::toString(packs.size()) + ", rows_count: " + DB::toString(rows_count) - + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) - + ", deletes_offset: " + DB::toString(deletes_offset)); - return {pack_index, 0}; - } - ++deletes_count; - } - else - { - rows_count += pack->getRows(); - if (rows_count > rows_offset) - { - if (unlikely(deletes_count != deletes_offset)) - throw Exception("deletes_count and deletes_offset are expected to be equal. 
pack_index: " + DB::toString(pack_index) - + ", pack_size: " + DB::toString(packs.size()) + ", rows_count: " + DB::toString(rows_count) - + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) - + ", deletes_offset: " + DB::toString(deletes_offset)); - - return {pack_index, pack->getRows() - (rows_count - rows_offset)}; - } - } - } - if (rows_count != rows_offset || deletes_count != deletes_offset) - throw Exception("illegal rows_offset and deletes_offset. pack_size: " + DB::toString(packs.size()) - + ", rows_count: " + DB::toString(rows_count) + ", rows_offset: " + DB::toString(rows_offset) - + ", deletes_count: " + DB::toString(deletes_count) + ", deletes_offset: " + DB::toString(deletes_offset)); - - return {pack_index, 0}; -} - // ================================================ // DeltaValueSpace // ================================================ @@ -70,72 +23,21 @@ DeltaSnapshotPtr DeltaValueSpace::createSnapshot(const DMContext & context, bool auto snap = std::make_shared(type); snap->is_update = for_update; snap->_delta = this->shared_from_this(); - snap->storage_snap = std::make_shared(context.storage_pool, context.getReadLimiter(), true); - snap->rows = rows; - snap->bytes = bytes; - snap->deletes = deletes; - snap->column_files.reserve(column_files.size()); + snap->persisted_files_snap = persisted_file_set->createSnapshot(context); snap->shared_delta_index = delta_index; - if (for_update) - { - snap->rows -= unsaved_rows; - snap->bytes -= unsaved_bytes; - snap->deletes -= unsaved_deletes; - } - - size_t check_rows = 0; - size_t check_deletes = 0; - size_t total_rows = 0; - size_t total_deletes = 0; - for (const auto & pack : column_files) - { - // If `for_update` is false, it will create a snapshot with all packs in DeltaValueSpace. - // If `for_update` is true, only persisted packs are used. 
- if (!for_update || pack->isSaved()) - { - if (auto * b = pack->tryToInMemoryFile(); b) - { - // Flush threads could update the value of ColumnInMemoryFile::cache, - // and since ColumnFile is not mult-threads safe, we should create a new column file object. - snap->column_files.push_back(std::make_shared(*b)); - } - else if (auto * t = pack->tryToTinyFile(); (t && t->getCache())) - { - // Compact threads could update the value of ColumnTinyFile::cache, - // and since ColumnFile is not mult-threads safe, we should create a new column file object. - snap->column_files.push_back(std::make_shared(*t)); - } - else - { - // For other packs, everything we use is constant, so no need to create a new object. - snap->column_files.push_back(pack); - } - - check_rows += pack->getRows(); - check_deletes += pack->isDeleteRange(); - } - total_rows += pack->getRows(); - total_deletes += pack->isDeleteRange(); - } - - if (unlikely(check_rows != snap->rows || check_deletes != snap->deletes || total_rows != rows || total_deletes != deletes)) - throw Exception("Rows and deletes check failed!", ErrorCodes::LOGICAL_ERROR); + if (!for_update) + snap->mem_table_snap = mem_table_set->createSnapshot(); return snap; } RowKeyRange DeltaValueSnapshot::getSquashDeleteRange() const { - RowKeyRange squashed_delete_range = RowKeyRange::newNone(is_common_handle, rowkey_column_size); - for (auto iter = column_files.cbegin(); iter != column_files.cend(); ++iter) - { - const auto & pack = *iter; - if (auto dp_delete = pack->tryToDeleteRange(); dp_delete) - squashed_delete_range = squashed_delete_range.merge(dp_delete->getDeleteRange()); - } - return squashed_delete_range; + auto delete_range1 = mem_table_snap->getSquashDeleteRange(); + auto delete_range2 = persisted_files_snap->getSquashDeleteRange(); + return delete_range1.merge(delete_range2); } // ================================================ @@ -148,31 +50,21 @@ DeltaValueReader::DeltaValueReader( const ColumnDefinesPtr & col_defs_, const 
RowKeyRange & segment_range_) : delta_snap(delta_snap_) + , mem_table_reader(delta_snap_->mem_table_snap ? std::make_shared(context, delta_snap_->mem_table_snap, col_defs_, segment_range_) : nullptr) + , persisted_files_reader(std::make_shared(context, delta_snap_->persisted_files_snap, col_defs_, segment_range_)) , col_defs(col_defs_) , segment_range(segment_range_) -{ - size_t total_rows = 0; - for (auto & p : delta_snap->getColumnFiles()) - { - total_rows += p->getRows(); - pack_rows.push_back(p->getRows()); - pack_rows_end.push_back(total_rows); - pack_readers.push_back(p->getReader(context, delta_snap->getStorageSnapshot(), col_defs)); - } -} +{} DeltaValueReaderPtr DeltaValueReader::createNewReader(const ColumnDefinesPtr & new_col_defs) { auto new_reader = new DeltaValueReader(); new_reader->delta_snap = delta_snap; new_reader->_compacted_delta_index = _compacted_delta_index; + new_reader->persisted_files_reader = persisted_files_reader->createNewReader(new_col_defs); + new_reader->mem_table_reader = mem_table_reader ? mem_table_reader->createNewReader(new_col_defs) : nullptr; new_reader->col_defs = new_col_defs; new_reader->segment_range = segment_range; - new_reader->pack_rows = pack_rows; - new_reader->pack_rows_end = pack_rows_end; - - for (auto & pr : pack_readers) - new_reader->pack_readers.push_back(pr->createNewReader(new_col_defs)); return std::shared_ptr(new_reader); } @@ -190,171 +82,70 @@ size_t DeltaValueReader::readRows(MutableColumns & output_cols, size_t offset, s // // So here, we should filter out those out-of-range rows. 
+ auto mem_table_rows_offset = delta_snap->getMemTableSetRowsOffset(); auto total_delta_rows = delta_snap->getRows(); - auto start = std::min(offset, total_delta_rows); - auto end = std::min(offset + limit, total_delta_rows); - if (end == start) - return 0; - - auto [start_pack_index, rows_start_in_start_pack] = locatePosByAccumulation(pack_rows_end, start); - auto [end_pack_index, rows_end_in_end_pack] = locatePosByAccumulation(pack_rows_end, end); + auto persisted_files_start = std::min(offset, mem_table_rows_offset); + auto persisted_files_end = std::min(offset + limit, mem_table_rows_offset); + auto mem_table_start = offset <= mem_table_rows_offset ? 0 : std::min(offset - mem_table_rows_offset, total_delta_rows - mem_table_rows_offset); + auto mem_table_end = offset + limit <= mem_table_rows_offset ? 0 : std::min(offset + limit - mem_table_rows_offset, total_delta_rows - mem_table_rows_offset); size_t actual_read = 0; - for (size_t pack_index = start_pack_index; pack_index <= end_pack_index; ++pack_index) - { - size_t rows_start_in_pack = pack_index == start_pack_index ? rows_start_in_start_pack : 0; - size_t rows_end_in_pack = pack_index == end_pack_index ? rows_end_in_end_pack : pack_rows[pack_index]; - size_t rows_in_pack_limit = rows_end_in_pack - rows_start_in_pack; + if (persisted_files_start < persisted_files_end) + actual_read += persisted_files_reader->readRows(output_cols, persisted_files_start, persisted_files_end - persisted_files_start, range); - // Nothing to read. 
- if (rows_start_in_pack == rows_end_in_pack) - continue; + if ((mem_table_start < mem_table_end) && mem_table_reader) + actual_read += mem_table_reader->readRows(output_cols, mem_table_start, mem_table_end - mem_table_start, range); - auto & pack_reader = pack_readers[pack_index]; - actual_read += pack_reader->readRows(output_cols, rows_start_in_pack, rows_in_pack_limit, range); - } return actual_read; } -Block DeltaValueReader::readPKVersion(size_t offset, size_t limit) -{ - MutableColumns cols; - for (size_t i = 0; i < 2; ++i) - cols.push_back((*col_defs)[i].type->createColumn()); - readRows(cols, offset, limit, nullptr); - Block block; - for (size_t i = 0; i < 2; ++i) - { - const auto & cd = (*col_defs)[i]; - block.insert(ColumnWithTypeAndName(std::move(cols[i]), cd.type, cd.name, cd.id)); - } - return block; -} - - BlockOrDeletes DeltaValueReader::getPlaceItems(size_t rows_begin, size_t deletes_begin, size_t rows_end, size_t deletes_end) { /// Note that we merge the consecutive DeltaPackBlock together, which are seperated in groups by DeltaPackDelete and DeltePackFile. BlockOrDeletes res; + auto mem_table_rows_offset = delta_snap->getMemTableSetRowsOffset(); + auto mem_table_deletes_offset = delta_snap->getMemTableSetDeletesOffset(); + auto total_delta_rows = delta_snap->getRows(); + auto total_delta_deletes = delta_snap->getDeletes(); - auto & packs = delta_snap->getColumnFiles(); - - auto [start_pack_index, rows_start_in_start_pack] = findColumnFile(packs, rows_begin, deletes_begin); - auto [end_pack_index, rows_end_in_end_pack] = findColumnFile(packs, rows_end, deletes_end); - - size_t block_rows_start = rows_begin; - size_t block_rows_end = rows_begin; - - for (size_t pack_index = start_pack_index; pack_index < packs.size() && pack_index <= end_pack_index; ++pack_index) - { - auto & pack = *packs[pack_index]; - - if (pack.isDeleteRange() || pack.isBigFile()) - { - // First, compact the DeltaPackBlocks before this pack into one block. 
- if (block_rows_end != block_rows_start) - { - auto block = readPKVersion(block_rows_start, block_rows_end - block_rows_start); - res.emplace_back(std::move(block), block_rows_start); - } - - // Second, take current pack. - if (auto pack_delete = pack.tryToDeleteRange(); pack_delete) - { - res.emplace_back(pack_delete->getDeleteRange()); - } - else if (pack.isBigFile() && pack.getRows()) - { - auto block = readPKVersion(block_rows_end, pack.getRows()); - res.emplace_back(std::move(block), block_rows_end); - } - - block_rows_end += pack.getRows(); - block_rows_start = block_rows_end; - } - else - { - // It is a DeltaPackBlock. - size_t rows_start_in_pack = pack_index == start_pack_index ? rows_start_in_start_pack : 0; - size_t rows_end_in_pack = pack_index == end_pack_index ? rows_end_in_end_pack : pack.getRows(); + auto persisted_files_rows_begin = std::min(rows_begin, mem_table_rows_offset); + auto persisted_files_deletes_begin = std::min(deletes_begin, mem_table_deletes_offset); + auto persisted_files_rows_end = std::min(rows_end, mem_table_rows_offset); + auto persisted_files_deletes_end = std::min(deletes_end, mem_table_deletes_offset); - block_rows_end += rows_end_in_pack - rows_start_in_pack; + auto mem_table_rows_begin = rows_begin <= mem_table_rows_offset ? 0 : std::min(rows_begin - mem_table_rows_offset, total_delta_rows - mem_table_rows_offset); + auto mem_table_deletes_begin = deletes_begin <= mem_table_deletes_offset ? 0 : std::min(deletes_begin - mem_table_deletes_offset, total_delta_deletes - mem_table_deletes_offset); + auto mem_table_rows_end = rows_end <= mem_table_rows_offset ? 0 : std::min(rows_end - mem_table_rows_offset, total_delta_rows - mem_table_rows_offset); + auto mem_table_deletes_end = deletes_end <= mem_table_deletes_offset ? 0 : std::min(deletes_end - mem_table_deletes_offset, total_delta_deletes - mem_table_deletes_offset); - if (pack_index == packs.size() - 1 || pack_index == end_pack_index) - { - // It is the last pack. 
- if (block_rows_end != block_rows_start) - { - auto block = readPKVersion(block_rows_start, block_rows_end - block_rows_start); - res.emplace_back(std::move(block), block_rows_start); - } - block_rows_start = block_rows_end; - } - } - } + persisted_files_reader->getPlaceItems(res, persisted_files_rows_begin, persisted_files_deletes_begin, persisted_files_rows_end, persisted_files_deletes_end); + if (mem_table_reader) + mem_table_reader->getPlaceItems(res, mem_table_rows_begin, mem_table_deletes_begin, mem_table_rows_end, mem_table_deletes_end, mem_table_rows_offset); return res; } bool DeltaValueReader::shouldPlace(const DMContext & context, DeltaIndexPtr my_delta_index, - const RowKeyRange & segment_range, + const RowKeyRange & segment_range_, const RowKeyRange & relevant_range, UInt64 max_version) { auto [placed_rows, placed_delete_ranges] = my_delta_index->getPlacedStatus(); - auto & packs = delta_snap->getColumnFiles(); // Already placed. if (placed_rows >= delta_snap->getRows() && placed_delete_ranges == delta_snap->getDeletes()) return false; - if (relevant_range.all() || relevant_range == segment_range // + if (relevant_range.all() || relevant_range == segment_range_ // || delta_snap->getRows() - placed_rows > context.delta_cache_limit_rows // || placed_delete_ranges != delta_snap->getDeletes()) return true; - auto [start_pack_index, rows_start_in_start_pack] = locatePosByAccumulation(pack_rows_end, placed_rows); - - for (size_t pack_index = start_pack_index; pack_index < delta_snap->getPackCount(); ++pack_index) - { - auto & pack = packs[pack_index]; - - // Always do place index if DeltaPackFile exists. - if (pack->isBigFile()) - return true; - if (unlikely(pack->isDeleteRange())) - throw Exception("pack is delete range", ErrorCodes::LOGICAL_ERROR); - - size_t rows_start_in_pack = pack_index == start_pack_index ? 
rows_start_in_start_pack : 0; - size_t rows_end_in_pack = pack_rows[pack_index]; - - auto & pack_reader = pack_readers[pack_index]; - ColumnPtr pk_column; - ColumnPtr version_column; - if (auto m_reader = std::dynamic_pointer_cast(pack_reader); m_reader) - { - pk_column = m_reader->getPKColumn(); - version_column = m_reader->getVersionColumn(); - } - else if (auto t_reader = std::dynamic_pointer_cast(pack_reader); t_reader) - { - pk_column = t_reader->getPKColumn(); - version_column = t_reader->getVersionColumn(); - } - auto rkcc = RowKeyColumnContainer(pk_column, context.is_common_handle); - auto & version_col_data = toColumnVectorData(version_column); - - for (auto i = rows_start_in_pack; i < rows_end_in_pack; ++i) - { - if (version_col_data[i] <= max_version && relevant_range.check(rkcc.getRowKeyValue(i))) - return true; - } - } - - return false; + return persisted_files_reader->shouldPlace(context, relevant_range, max_version, placed_rows) + || (mem_table_reader && mem_table_reader->shouldPlace(context, relevant_range, max_version, placed_rows)); } } // namespace DB::DM diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 48642233952..127629f270e 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -520,10 +520,10 @@ void DeltaMergeStore::write(const Context & db_context, const DB::Settings & db_ limit = cur_limit; auto alloc_bytes = block.bytes(offset, limit); - bool small_pack = limit < dm_context->delta_cache_limit_rows / 4 && alloc_bytes < dm_context->delta_cache_limit_bytes / 4; - // Small packs are appended to Delta Cache, the flushed later. - // While large packs are directly written to PageStorage. - if (small_pack) + bool is_small = limit < dm_context->delta_cache_limit_rows / 4 && alloc_bytes < dm_context->delta_cache_limit_bytes / 4; + // Small column fies are appended to Delta Cache, the flushed later. 
+ // While large column fies are directly written to PageStorage. + if (is_small) { if (segment->writeToCache(*dm_context, block, offset, limit)) { @@ -533,8 +533,8 @@ void DeltaMergeStore::write(const Context & db_context, const DB::Settings & db_ } else { - // If pack haven't been written, or the pk range has changed since last write, then write it and - // delete former written pack. + // If column file haven't been written, or the pk range has changed since last write, then write it and + // delete former written column file. if (!write_column_file || (write_column_file && write_range != rowkey_range)) { wbs.rollbackWrittenLogAndData(); @@ -705,10 +705,10 @@ void DeltaMergeStore::ingestFiles( auto ref_id = storage_pool.newDataPageIdForDTFile(delegate, __PRETTY_FUNCTION__); auto ref_file = DMFile::restore(file_provider, file_id, ref_id, file_parent_path, DMFile::ReadMetaMode::all()); - auto pack = std::make_shared(*dm_context, ref_file, segment_range); - if (pack->getRows() != 0) + auto column_file = std::make_shared(*dm_context, ref_file, segment_range); + if (column_file->getRows() != 0) { - column_files.emplace_back(std::move(pack)); + column_files.emplace_back(std::move(column_file)); wbs.data.putRefPage(ref_id, file_id); } } diff --git a/dbms/src/Storages/DeltaMerge/Segment.cpp b/dbms/src/Storages/DeltaMerge/Segment.cpp index d33ba7bcf24..08dba3c6694 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.cpp +++ b/dbms/src/Storages/DeltaMerge/Segment.cpp @@ -296,10 +296,10 @@ bool Segment::write(DMContext & dm_context, const Block & block) LOG_FMT_TRACE(log, "Segment [{}] write to disk rows: {}", segment_id, block.rows()); WriteBatches wbs(dm_context.storage_pool, dm_context.getWriteLimiter()); - auto pack = ColumnFileTiny::writeColumnFile(dm_context, block, 0, block.rows(), wbs); + auto column_file = ColumnFileTiny::writeColumnFile(dm_context, block, 0, block.rows(), wbs); wbs.writeAll(); - if (delta->appendColumnFile(dm_context, pack)) + if 
(delta->appendColumnFile(dm_context, column_file)) { flushCache(dm_context); return true; @@ -576,7 +576,7 @@ StableValueSpacePtr Segment::prepareMergeDelta(DMContext & dm_context, LOG_FMT_INFO(log, "Segment [{}] prepare merge delta start. delta packs: {}, delta total rows: {}, delta total size: {}", segment_id, - segment_snap->delta->getPackCount(), + segment_snap->delta->getColumnFileCount(), segment_snap->delta->getRows(), segment_snap->delta->getBytes()); @@ -604,11 +604,11 @@ SegmentPtr Segment::applyMergeDelta(DMContext & context, { LOG_FMT_INFO(log, "Before apply merge delta: {}", info()); - auto later_packs = delta->checkHeadAndCloneTail(context, rowkey_range, segment_snap->delta->getColumnFiles(), wbs); + auto [persisted_column_files, in_memory_files] = delta->checkHeadAndCloneTail(context, rowkey_range, segment_snap->delta->getHeadColumnFilesForCheck(), wbs); // Created references to tail pages' pages in "log" storage, we need to write them down. wbs.writeLogAndData(); - auto new_delta = std::make_shared(delta->getId(), later_packs); + auto new_delta = std::make_shared(delta->getId(), persisted_column_files, in_memory_files); new_delta->saveMeta(wbs); auto new_me = std::make_shared(epoch + 1, // @@ -625,7 +625,7 @@ SegmentPtr Segment::applyMergeDelta(DMContext & context, new_me->serialize(wbs.meta); // Remove old segment's delta. - delta->recordRemovePacksPages(wbs); + delta->recordRemoveColumnFilesPages(wbs); // Remove old stable's files. stable->recordRemovePacksPages(wbs); @@ -1065,10 +1065,10 @@ SegmentPair Segment::applySplit(DMContext & dm_context, // RowKeyRange my_range(rowkey_range.start, split_info.split_point, is_common_handle, rowkey_column_size); RowKeyRange other_range(split_info.split_point, rowkey_range.end, is_common_handle, rowkey_column_size); ColumnFiles empty_files; - ColumnFiles * head_files = split_info.is_logical ? &empty_files : &segment_snap->delta->getColumnFiles(); + ColumnFiles * head_files = split_info.is_logical ? 
&empty_files : &segment_snap->delta->getHeadColumnFilesForCheck(); - auto my_delta_files = delta->checkHeadAndCloneTail(dm_context, my_range, *head_files, wbs); - auto other_delta_files = delta->checkHeadAndCloneTail(dm_context, other_range, *head_files, wbs); + auto [my_persisted_files, my_in_memory_files] = delta->checkHeadAndCloneTail(dm_context, my_range, *head_files, wbs); + auto [other_persisted_files, other_in_memory_files] = delta->checkHeadAndCloneTail(dm_context, other_range, *head_files, wbs); // Created references to tail pages' pages in "log" storage, we need to write them down. wbs.writeLogAndData(); @@ -1076,8 +1076,8 @@ SegmentPair Segment::applySplit(DMContext & dm_context, // auto other_segment_id = dm_context.storage_pool.newMetaPageId(); auto other_delta_id = dm_context.storage_pool.newMetaPageId(); - auto my_delta = std::make_shared(delta->getId(), my_delta_files); - auto other_delta = std::make_shared(other_delta_id, other_delta_files); + auto my_delta = std::make_shared(delta->getId(), my_persisted_files, my_in_memory_files); + auto other_delta = std::make_shared(other_delta_id, other_persisted_files, other_in_memory_files); auto new_me = std::make_shared(this->epoch + 1, // my_range, @@ -1102,7 +1102,7 @@ SegmentPair Segment::applySplit(DMContext & dm_context, // other->serialize(wbs.meta); // Remove old segment's delta. - delta->recordRemovePacksPages(wbs); + delta->recordRemoveColumnFilesPages(wbs); // Remove old stable's files. 
stable->recordRemovePacksPages(wbs); @@ -1209,27 +1209,21 @@ SegmentPtr Segment::applyMerge(DMContext & dm_context, // RowKeyRange merged_range(left->rowkey_range.start, right->rowkey_range.end, left->is_common_handle, left->rowkey_column_size); - auto left_tail_files = left->delta->checkHeadAndCloneTail(dm_context, merged_range, left_snap->delta->getColumnFiles(), wbs); - auto right_tail_files = right->delta->checkHeadAndCloneTail(dm_context, merged_range, right_snap->delta->getColumnFiles(), wbs); + auto [left_persisted_files, left_in_memory_files] = left->delta->checkHeadAndCloneTail(dm_context, merged_range, left_snap->delta->getHeadColumnFilesForCheck(), wbs); + auto [right_persisted_files, right_in_memory_files] = right->delta->checkHeadAndCloneTail(dm_context, merged_range, right_snap->delta->getHeadColumnFilesForCheck(), wbs); + // Created references to tail pages' pages in "log" storage, we need to write them down. wbs.writeLogAndData(); /// Make sure saved packs are appended before unsaved packs. 
- ColumnFiles merged_files; - - auto l_first_unsaved - = std::find_if(left_tail_files.begin(), left_tail_files.end(), [](const ColumnFilePtr & p) { return !p->isSaved(); }); - auto r_first_unsaved - = std::find_if(right_tail_files.begin(), right_tail_files.end(), [](const ColumnFilePtr & p) { return !p->isSaved(); }); - - merged_files.insert(merged_files.end(), left_tail_files.begin(), l_first_unsaved); - merged_files.insert(merged_files.end(), right_tail_files.begin(), r_first_unsaved); + ColumnFilePersisteds merged_persisted_column_files = std::move(left_persisted_files); + ColumnFiles merged_in_memory_files = std::move(left_in_memory_files); - merged_files.insert(merged_files.end(), l_first_unsaved, left_tail_files.end()); - merged_files.insert(merged_files.end(), r_first_unsaved, right_tail_files.end()); + merged_persisted_column_files.insert(merged_persisted_column_files.end(), right_persisted_files.begin(), right_persisted_files.end()); + merged_in_memory_files.insert(merged_in_memory_files.end(), right_in_memory_files.begin(), right_in_memory_files.end()); - auto merged_delta = std::make_shared(left->delta->getId(), merged_files); + auto merged_delta = std::make_shared(left->delta->getId(), merged_persisted_column_files, merged_in_memory_files); auto merged = std::make_shared(left->epoch + 1, // merged_range, @@ -1243,10 +1237,10 @@ SegmentPtr Segment::applyMerge(DMContext & dm_context, // merged->stable->saveMeta(wbs.meta); merged->serialize(wbs.meta); - left->delta->recordRemovePacksPages(wbs); + left->delta->recordRemoveColumnFilesPages(wbs); left->stable->recordRemovePacksPages(wbs); - right->delta->recordRemovePacksPages(wbs); + right->delta->recordRemoveColumnFilesPages(wbs); right->stable->recordRemovePacksPages(wbs); wbs.removed_meta.delPage(right->segmentId()); From 39ebd23be1d4459d6f25ee318caff55ce6c260ea Mon Sep 17 00:00:00 2001 From: lidezhu Date: Wed, 26 Jan 2022 14:58:15 +0800 Subject: [PATCH 04/23] do some rename --- 
.../DeltaMerge/ColumnFile/ColumnFile.cpp | 1 - .../ColumnFile/ColumnFileSetReader.cpp | 22 ++++-- .../ColumnFile/ColumnFileSetReader.h | 4 +- .../DeltaMerge/Delta/ColumnFileFlushTask.cpp | 6 +- .../DeltaMerge/Delta/ColumnFileFlushTask.h | 4 +- .../Delta/ColumnFilePersistedSet.cpp | 75 ++++++++++--------- .../DeltaMerge/Delta/ColumnFilePersistedSet.h | 20 ++--- .../DeltaMerge/Delta/DeltaValueSpace.cpp | 4 +- .../DeltaMerge/Delta/DeltaValueSpace.h | 9 ++- .../Storages/DeltaMerge/Delta/MemTableSet.cpp | 9 +-- .../Storages/DeltaMerge/Delta/MemTableSet.h | 2 +- .../Storages/DeltaMerge/Delta/Snapshot.cpp | 4 +- 12 files changed, 83 insertions(+), 77 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp index 4d80e18ef95..9617118d0bd 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp @@ -107,6 +107,5 @@ String columnFilesToString(const T & column_files) template String columnFilesToString(const ColumnFiles & column_files); template String columnFilesToString(const ColumnFilePersisteds & column_files); - } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp index 9e24dcfd10d..b5372f7d4d9 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp @@ -40,9 +40,10 @@ std::pair findColumnFile(const ColumnFiles & column_files, size_ { if (unlikely(deletes_count != deletes_offset)) throw Exception("deletes_count and deletes_offset are expected to be equal. 
pack_index: " + DB::toString(column_file_index) - + ", pack_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) - + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) - + ", deletes_offset: " + DB::toString(deletes_offset)); + + ", pack_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) + + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) + + ", deletes_offset: " + DB::toString(deletes_offset), + ErrorCodes::LOGICAL_ERROR); return {column_file_index, column_file->getRows() - (rows_count - rows_offset)}; } @@ -50,8 +51,9 @@ std::pair findColumnFile(const ColumnFiles & column_files, size_ } if (rows_count != rows_offset || deletes_count != deletes_offset) throw Exception("illegal rows_offset and deletes_offset. pack_size: " + DB::toString(column_files.size()) - + ", rows_count: " + DB::toString(rows_count) + ", rows_offset: " + DB::toString(rows_offset) - + ", deletes_count: " + DB::toString(deletes_count) + ", deletes_offset: " + DB::toString(deletes_offset)); + + ", rows_count: " + DB::toString(rows_count) + ", rows_offset: " + DB::toString(rows_offset) + + ", deletes_count: " + DB::toString(deletes_count) + ", deletes_offset: " + DB::toString(deletes_offset), + ErrorCodes::LOGICAL_ERROR); return {column_file_index, 0}; } @@ -234,7 +236,7 @@ bool ColumnFileSetReader::shouldPlace(const DMContext & context, auto version_column = dpb_reader.getVersionColumn(); auto rkcc = RowKeyColumnContainer(pk_column, context.is_common_handle); - auto & version_col_data = toColumnVectorData(version_column); + const auto & version_col_data = toColumnVectorData(version_column); for (auto i = rows_start_in_pack; i < rows_end_in_pack; ++i) { @@ -242,14 +244,14 @@ bool ColumnFileSetReader::shouldPlace(const DMContext & context, return true; } } - else + else if (column_file->isTinyFile()) { auto & dpb_reader = 
typeid_cast(*column_file_reader); auto pk_column = dpb_reader.getPKColumn(); auto version_column = dpb_reader.getVersionColumn(); auto rkcc = RowKeyColumnContainer(pk_column, context.is_common_handle); - auto & version_col_data = toColumnVectorData(version_column); + const auto & version_col_data = toColumnVectorData(version_column); for (auto i = rows_start_in_pack; i < rows_end_in_pack; ++i) { @@ -257,6 +259,10 @@ bool ColumnFileSetReader::shouldPlace(const DMContext & context, return true; } } + else + { + throw Exception("Unknown column file: " + column_file->toString(), ErrorCodes::LOGICAL_ERROR); + } } return false; diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h index 16b3ce3aa53..5a69fc716dc 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h @@ -17,9 +17,9 @@ class ColumnFileSetReader ColumnDefinesPtr col_defs; RowKeyRange segment_range; - // The row count of each pack. Cache here to speed up checking. + // The row count of each column file. Cache here to speed up checking. std::vector column_file_rows; - // The cumulative rows of packs. Used to fast locate specific packs according to rows offset by binary search. + // The cumulative rows of column files. Used to fast locate specific column files according to rows offset by binary search. 
std::vector column_file_rows_end; std::vector column_file_readers; diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp index f913019e71f..2c137a7cf74 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp @@ -9,10 +9,10 @@ namespace DB { namespace DM { -ColumnFileFlushTask::ColumnFileFlushTask(DMContext & context_, const MemTableSetPtr & mem_table_set_, size_t current_flush_version_) +ColumnFileFlushTask::ColumnFileFlushTask(DMContext & context_, const MemTableSetPtr & mem_table_set_, size_t flush_version_) : context{context_} , mem_table_set{mem_table_set_} - , current_flush_version{current_flush_version_} + , flush_version{flush_version_} {} DeltaIndex::Updates ColumnFileFlushTask::prepare(WriteBatches & wbs) @@ -62,7 +62,7 @@ DeltaIndex::Updates ColumnFileFlushTask::prepare(WriteBatches & wbs) bool ColumnFileFlushTask::commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs) { // update metadata - if (!persisted_file_set->appendColumnStableFilesToLevel0(current_flush_version, results, wbs)) + if (!persisted_file_set->appendColumnStableFilesToLevel0(flush_version, results, wbs)) return false; mem_table_set->removeColumnFilesInFlushTask(*this); diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h index 7389e3d78e2..5ee5c06ac35 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h @@ -49,10 +49,10 @@ class ColumnFileFlushTask ColumnFilePersisteds results; DMContext & context; MemTableSetPtr mem_table_set; - size_t current_flush_version; + size_t flush_version; public: - ColumnFileFlushTask(DMContext & context_, const MemTableSetPtr & mem_table_set_, size_t current_flush_version_); + ColumnFileFlushTask(DMContext & context_, const 
MemTableSetPtr & mem_table_set_, size_t flush_version_); DeltaIndex::Updates prepare(WriteBatches & wbs); diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index a04ac82f8e2..93d9a9dc090 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -13,7 +13,7 @@ namespace DB { namespace DM { -inline void serializeColumnStableFileLevels(WriteBatches & wbs, PageId id, const ColumnFilePersistedSet::ColumnFilePersistedLevels & file_levels) +inline void serializeColumnFilePersistedLevels(WriteBatches & wbs, PageId id, const ColumnFilePersistedSet::ColumnFilePersistedLevels & file_levels) { MemoryWriteBuffer buf(0, COLUMN_FILE_SERIALIZE_BUFFER_SIZE); ColumnFilePersisteds column_files; @@ -31,13 +31,13 @@ inline void serializeColumnStableFileLevels(WriteBatches & wbs, PageId id, const void ColumnFilePersistedSet::updateStats() { - size_t new_stable_files_count = 0; + size_t new_persisted_files_count = 0; size_t new_rows = 0; size_t new_bytes = 0; size_t new_deletes = 0; - for (auto & file_level : stable_files_levels) + for (auto & file_level : persisted_files_levels) { - new_stable_files_count += file_level.size(); + new_persisted_files_count += file_level.size(); for (auto & file : file_level) { new_rows += file->getRows(); @@ -45,7 +45,7 @@ void ColumnFilePersistedSet::updateStats() new_deletes += file->getDeletes(); } } - stable_files_count = new_stable_files_count; + persisted_files_count = new_persisted_files_count; rows = new_rows; bytes = new_bytes; deletes = new_deletes; @@ -71,12 +71,12 @@ void ColumnFilePersistedSet::checkColumnFiles(const ColumnFiles & new_column_fil } } -ColumnFilePersistedSet::ColumnFilePersistedSet(PageId metadata_id_, const ColumnFilePersisteds & column_stable_files) +ColumnFilePersistedSet::ColumnFilePersistedSet(PageId metadata_id_, const ColumnFilePersisteds & 
persisted_column_files) : metadata_id(metadata_id_) , log(&Poco::Logger::get("ColumnStableFileSet")) { // TODO: place column file to different levels - stable_files_levels.push_back(column_stable_files); + persisted_files_levels.push_back(persisted_column_files); updateStats(); } @@ -91,12 +91,12 @@ ColumnFilePersistedSetPtr ColumnFilePersistedSet::restore(DMContext & context, c void ColumnFilePersistedSet::saveMeta(WriteBatches & wbs) const { - serializeColumnStableFileLevels(wbs, metadata_id, stable_files_levels); + serializeColumnFilePersistedLevels(wbs, metadata_id, persisted_files_levels); } void ColumnFilePersistedSet::recordRemoveColumnFilesPages(WriteBatches & wbs) const { - for (const auto & level : stable_files_levels) + for (const auto & level : persisted_files_levels) { for (const auto & file : level) file->removeData(wbs); @@ -112,17 +112,17 @@ ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & c // We check in the direction from the last level to the first level. // In every level, we check from the begin to the last. 
auto it_1 = head_column_files.begin(); - auto level_it = stable_files_levels.rbegin(); + auto level_it = persisted_files_levels.rbegin(); auto it_2 = level_it->begin(); bool check_success = true; - if (likely(head_column_files.size() <= stable_files_count.load())) + if (likely(head_column_files.size() <= persisted_files_count.load())) { - while (it_1 != head_column_files.end() && level_it != stable_files_levels.rend()) + while (it_1 != head_column_files.end() && level_it != persisted_files_levels.rend()) { if (it_2 == level_it->end()) { level_it++; - if (unlikely(level_it == stable_files_levels.rend())) + if (unlikely(level_it == persisted_files_levels.rend())) throw Exception("Delta Check head algorithm broken", ErrorCodes::LOGICAL_ERROR); it_2 = level_it->begin(); continue; @@ -146,12 +146,12 @@ ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & c } ColumnFilePersisteds cloned_tail; - while (level_it != stable_files_levels.rend()) + while (level_it != persisted_files_levels.rend()) { if (it_2 == level_it->end()) { level_it++; - if (level_it == stable_files_levels.rend()) + if (level_it == persisted_files_levels.rend()) break; it_2 = level_it->begin(); } @@ -198,7 +198,7 @@ ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & c size_t ColumnFilePersistedSet::getTotalCacheRows() const { size_t cache_rows = 0; - for (const auto & level : stable_files_levels) + for (const auto & level : persisted_files_levels) { for (const auto & file : level) { @@ -215,7 +215,7 @@ size_t ColumnFilePersistedSet::getTotalCacheRows() const size_t ColumnFilePersistedSet::getTotalCacheBytes() const { size_t cache_bytes = 0; - for (const auto & level : stable_files_levels) + for (const auto & level : persisted_files_levels) { for (const auto & file : level) { @@ -232,7 +232,7 @@ size_t ColumnFilePersistedSet::getTotalCacheBytes() const size_t ColumnFilePersistedSet::getValidCacheRows() const { size_t cache_rows = 0; - for (const auto & 
level : stable_files_levels) + for (const auto & level : persisted_files_levels) { for (const auto & file : level) { @@ -246,6 +246,7 @@ size_t ColumnFilePersistedSet::getValidCacheRows() const return cache_rows; } +// TODO: reset schema if same as before bool ColumnFilePersistedSet::appendColumnStableFilesToLevel0(size_t prev_flush_version, const ColumnFilePersisteds & column_files, WriteBatches & wbs) { if (prev_flush_version != flush_version) @@ -254,25 +255,25 @@ bool ColumnFilePersistedSet::appendColumnStableFilesToLevel0(size_t prev_flush_v return false; } flush_version += 1; - ColumnFilePersistedLevels new_stable_files_levels; - for (auto & level : stable_files_levels) + ColumnFilePersistedLevels new_persisted_files_levels; + for (auto & level : persisted_files_levels) { - auto & new_level = new_stable_files_levels.emplace_back(); + auto & new_level = new_persisted_files_levels.emplace_back(); for (auto & file : level) new_level.push_back(file); } - if (new_stable_files_levels.empty()) - new_stable_files_levels.emplace_back(); - auto & new_level_0 = new_stable_files_levels[0]; + if (new_persisted_files_levels.empty()) + new_persisted_files_levels.emplace_back(); + auto & new_level_0 = new_persisted_files_levels[0]; for (const auto & f : column_files) new_level_0.push_back(f); /// Save the new metadata of column files to disk. - serializeColumnStableFileLevels(wbs, metadata_id, new_stable_files_levels); + serializeColumnFilePersistedLevels(wbs, metadata_id, new_persisted_files_levels); wbs.writeMeta(); /// Commit updates in memory. - stable_files_levels.swap(new_stable_files_levels); + persisted_files_levels.swap(new_persisted_files_levels); updateStats(); return true; @@ -285,13 +286,13 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con // For ColumnDeleteRangeFile and ColumnBigFile, we will simply move them to the next level. 
// And only if there some small `ColumnTinyFile`s which can be combined together we will actually do the compaction. size_t check_level_num = 0; - while (check_level_num < stable_files_levels.size()) + while (check_level_num < persisted_files_levels.size()) { - if (next_compaction_level >= stable_files_levels.size()) + if (next_compaction_level >= persisted_files_levels.size()) next_compaction_level = 0; auto compaction = std::make_shared(next_compaction_level); - auto & level = stable_files_levels[next_compaction_level]; + auto & level = persisted_files_levels[next_compaction_level]; if (!level.empty()) { bool is_all_trivial_move = true; @@ -349,7 +350,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & for (size_t i = 0; i < compaction->compaction_src_level; i++) { auto & new_level = new_stable_files_levels.emplace_back(); - for (const auto & f : stable_files_levels[i]) + for (const auto & f : persisted_files_levels[i]) new_level.push_back(f); } // Create a new empty level for `compaction_src_level` because all the column files is compacted to next level @@ -358,9 +359,9 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & // Add new file to the target level auto target_level = compaction->compaction_src_level + 1; auto & target_level_files = new_stable_files_levels.emplace_back(); - if (stable_files_levels.size() > target_level) + if (persisted_files_levels.size() > target_level) { - for (auto & column_file : stable_files_levels[target_level]) + for (auto & column_file : persisted_files_levels[target_level]) target_level_files.emplace_back(column_file); } for (auto & task : compaction->tasks) @@ -372,19 +373,19 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & } // Append remaining levels - for (size_t i = target_level + 1; i < stable_files_levels.size(); i++) + for (size_t i = target_level + 1; i < persisted_files_levels.size(); i++) { auto & new_level = 
new_stable_files_levels.emplace_back(); - for (const auto & f : stable_files_levels[i]) + for (const auto & f : persisted_files_levels[i]) new_level.push_back(f); } /// Save the new metadata of column files to disk. - serializeColumnStableFileLevels(wbs, metadata_id, new_stable_files_levels); + serializeColumnFilePersistedLevels(wbs, metadata_id, new_stable_files_levels); wbs.writeMeta(); /// Commit updates in memory. - stable_files_levels.swap(new_stable_files_levels); + persisted_files_levels.swap(new_stable_files_levels); updateStats(); return true; @@ -400,7 +401,7 @@ ColumnFileSetSnapshotPtr ColumnFilePersistedSet::createSnapshot(const DMContext size_t total_rows = 0; size_t total_deletes = 0; - for (const auto & level : stable_files_levels) + for (const auto & level : persisted_files_levels) { for (const auto & file : level) { diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h index 6d2fefb4ab7..c5a7d9edbbd 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h @@ -40,8 +40,8 @@ class ColumnFilePersistedSet : public std::enable_shared_from_this stable_files_count; + ColumnFilePersistedLevels persisted_files_levels; + std::atomic persisted_files_count; std::atomic rows = 0; std::atomic bytes = 0; @@ -60,7 +60,7 @@ class ColumnFilePersistedSet : public std::enable_shared_from_thisgetColumnFiles(); } + ColumnFileSetSnapshotPtr getMemTableSetSnapshot() const { return mem_table_snap; } + ColumnFileSetSnapshotPtr getPersistedFileSetSnapshot() const { return persisted_files_snap; } + size_t getColumnFileCount() const { return (mem_table_snap ? mem_table_snap->getColumnFileCount() : 0) + persisted_files_snap->getColumnFileCount(); } size_t getRows() const { return (mem_table_snap ? mem_table_snap->getRows() : 0) + persisted_files_snap->getRows(); } size_t getBytes() const { return (mem_table_snap ? 
mem_table_snap->getBytes() : 0) + persisted_files_snap->getBytes(); } @@ -364,8 +365,8 @@ class DeltaValueInputStream : public IBlockInputStream const DeltaSnapshotPtr & delta_snap_, const ColumnDefinesPtr & col_defs_, const RowKeyRange & segment_range_) - : mem_table_input_stream(context_, delta_snap_->mem_table_snap, col_defs_, segment_range_) - , persisted_files_input_stream(context_, delta_snap_->persisted_files_snap, col_defs_, segment_range_) + : mem_table_input_stream(context_, delta_snap_->getMemTableSetSnapshot(), col_defs_, segment_range_) + , persisted_files_input_stream(context_, delta_snap_->getPersistedFileSetSnapshot(), col_defs_, segment_range_) {} String getName() const override { return "DeltaValue"; } diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp index 04226cfce4a..ffb2f2a4881 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp @@ -8,7 +8,6 @@ namespace DB { namespace DM { -// FIXME: difference from previous implementation: cannot get lastSchema from saved column_files BlockPtr MemTableSet::lastSchema() { for (auto it = column_files.rbegin(); it != column_files.rend(); ++it) @@ -25,7 +24,7 @@ void MemTableSet::appendColumnFileInner(const ColumnFilePtr & column_file) { auto last_schema = lastSchema(); - if (auto m_file = column_file->tryToInMemoryFile(); m_file) + if (auto * m_file = column_file->tryToInMemoryFile(); m_file) { // If this pack's schema is identical to last_schema, then use the last_schema instance, // so that we don't have to serialize my_schema instance. 
@@ -36,9 +35,9 @@ void MemTableSet::appendColumnFileInner(const ColumnFilePtr & column_file) if (!column_files.empty()) { - auto last_column_file = column_files.back(); - if (last_column_file->isInMemoryFile()) - last_column_file->tryToInMemoryFile()->disableAppend(); + auto & last_column_file = column_files.back(); + if (last_column_file->isAppendable()) + last_column_file->disableAppend(); } column_files.push_back(column_file); diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h index 8ea60af69ea..b9a0237f2fb 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h @@ -30,7 +30,7 @@ class MemTableSet : public std::enable_shared_from_this void appendColumnFileInner(const ColumnFilePtr & column_file); public: - MemTableSet(const ColumnFiles & in_memory_files = {}) + explicit MemTableSet(const ColumnFiles & in_memory_files = {}) : column_files(in_memory_files) , log(&Poco::Logger::get("MemTableSet")) { diff --git a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp index bdc82dbd204..eccc285d745 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp @@ -50,8 +50,8 @@ DeltaValueReader::DeltaValueReader( const ColumnDefinesPtr & col_defs_, const RowKeyRange & segment_range_) : delta_snap(delta_snap_) - , mem_table_reader(delta_snap_->mem_table_snap ? std::make_shared(context, delta_snap_->mem_table_snap, col_defs_, segment_range_) : nullptr) - , persisted_files_reader(std::make_shared(context, delta_snap_->persisted_files_snap, col_defs_, segment_range_)) + , mem_table_reader(delta_snap_->getMemTableSetSnapshot() ? 
std::make_shared(context, delta_snap_->getMemTableSetSnapshot(), col_defs_, segment_range_) : nullptr) + , persisted_files_reader(std::make_shared(context, delta_snap_->getPersistedFileSetSnapshot(), col_defs_, segment_range_)) , col_defs(col_defs_) , segment_range(segment_range_) {} From a84be261788f12456b1c61897656cf3dfdaef73b Mon Sep 17 00:00:00 2001 From: lidezhu Date: Wed, 26 Jan 2022 14:59:08 +0800 Subject: [PATCH 05/23] format --- dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp index 9617118d0bd..4d80e18ef95 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile.cpp @@ -107,5 +107,6 @@ String columnFilesToString(const T & column_files) template String columnFilesToString(const ColumnFiles & column_files); template String columnFilesToString(const ColumnFilePersisteds & column_files); + } // namespace DM } // namespace DB From 25b51ed7d23b8b8d31e69c11db31c366c1dd532b Mon Sep 17 00:00:00 2001 From: lidezhu Date: Wed, 26 Jan 2022 19:40:29 +0800 Subject: [PATCH 06/23] refactor flush process --- .../DeltaMerge/Delta/ColumnFileFlushTask.cpp | 103 ++++++++++++------ .../DeltaMerge/Delta/ColumnFileFlushTask.h | 11 +- .../Delta/ColumnFilePersistedSet.cpp | 32 ++++-- .../DeltaMerge/Delta/ColumnFilePersistedSet.h | 10 +- .../DeltaMerge/Delta/DeltaValueSpace.cpp | 4 +- .../DeltaMerge/Delta/DeltaValueSpace.h | 4 +- .../Storages/DeltaMerge/Delta/MemTableSet.cpp | 64 ++++++----- .../Storages/DeltaMerge/Delta/MemTableSet.h | 21 ++-- 8 files changed, 162 insertions(+), 87 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp index 2c137a7cf74..9d11dab4b96 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp +++ 
b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -5,6 +6,14 @@ #include #include +namespace ProfileEvents +{ +extern const Event DMWriteBytes; +extern const Event PSMWriteBytes; +extern const Event WriteBufferFromFileDescriptorWriteBytes; +extern const Event WriteBufferAIOWriteBytes; +} // namespace ProfileEvents + namespace DB { namespace DM @@ -22,50 +31,82 @@ DeltaIndex::Updates ColumnFileFlushTask::prepare(WriteBatches & wbs) for (auto & task : tasks) { if (!task.block_data) - { - results.push_back(std::static_pointer_cast(task.column_file)); - } - else - { - IColumn::Permutation perm; - task.sorted = sortBlockByPk(getExtraHandleColumnDefine(context.is_common_handle), task.block_data, perm); - if (task.sorted) - delta_index_updates.emplace_back(task.deletes_offset, task.rows_offset, perm); + continue; + + IColumn::Permutation perm; + task.sorted = sortBlockByPk(getExtraHandleColumnDefine(context.is_common_handle), task.block_data, perm); + if (task.sorted) + delta_index_updates.emplace_back(task.deletes_offset, task.rows_offset, perm); + + task.data_page = ColumnFileTiny::writeColumnFileData(context, task.block_data, 0, task.block_data.rows(), wbs); + } - auto * mem_file = task.column_file->tryToInMemoryFile(); - ColumnFilePersistedPtr tiny_file; - bool is_small_file = mem_file->getRows() < context.delta_small_pack_rows || mem_file->getBytes() < context.delta_small_pack_bytes; + wbs.writeLogAndData(); + return delta_index_updates; +} + +bool ColumnFileFlushTask::commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs) +{ + if (!persisted_file_set->checkAndUpdateFlushVersion(flush_version)) + return false; + + /// Create new column file instance for ColumnFilePersistedSet + ColumnFilePersisteds new_column_files; + for (auto & task : tasks) + { + ColumnFilePersistedPtr new_column_file; + if (auto * m_file = task.column_file->tryToInMemoryFile(); m_file) + { + bool is_small_file 
= m_file->getRows() < context.delta_small_pack_rows || m_file->getBytes() < context.delta_small_pack_bytes; if (is_small_file) { - tiny_file = std::make_shared(mem_file->getSchema(), - mem_file->getRows(), - mem_file->getBytes(), - task.data_page, - !task.sorted ? mem_file->getCache() : std::make_shared(std::move(task.block_data))); + new_column_file = std::make_shared(m_file->getSchema(), + m_file->getRows(), + m_file->getBytes(), + task.data_page, + !task.sorted ? m_file->getCache() : std::make_shared(std::move(task.block_data))); } else { - tiny_file = std::make_shared(mem_file->getSchema(), - mem_file->getRows(), - mem_file->getBytes(), - task.data_page, - nullptr); + new_column_file = std::make_shared(m_file->getSchema(), + m_file->getRows(), + m_file->getBytes(), + task.data_page, + nullptr); } - results.push_back(tiny_file); } + else if (auto * t_file = task.column_file->tryToTinyFile(); t_file) + { + new_column_file = std::make_shared(*t_file); + } + else if (auto * b_file = task.column_file->tryToBigFile(); b_file) + { + new_column_file = std::make_shared(*b_file); + } + else if (auto * d_file = task.column_file->tryToDeleteRange(); d_file) + { + new_column_file = std::make_shared(*d_file); + } + else + { + throw Exception("Unexpected column file type", ErrorCodes::LOGICAL_ERROR); + } + new_column_files.push_back(new_column_file); } - wbs.writeLogAndData(); - return delta_index_updates; -} - -bool ColumnFileFlushTask::commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs) -{ - // update metadata - if (!persisted_file_set->appendColumnStableFilesToLevel0(flush_version, results, wbs)) + // serialize metadata and update persisted_file_set + if (!persisted_file_set->appendPersistedColumnFilesToLevel0(new_column_files, wbs)) return false; mem_table_set->removeColumnFilesInFlushTask(*this); + + // Also update the write amplification + auto total_write = ProfileEvents::counters[ProfileEvents::DMWriteBytes].load(std::memory_order_relaxed); + 
auto actual_write = ProfileEvents::counters[ProfileEvents::PSMWriteBytes].load(std::memory_order_relaxed) + + ProfileEvents::counters[ProfileEvents::WriteBufferFromFileDescriptorWriteBytes].load(std::memory_order_relaxed) + + ProfileEvents::counters[ProfileEvents::WriteBufferAIOWriteBytes].load(std::memory_order_relaxed); + GET_METRIC(tiflash_storage_write_amplification) + .Set((static_cast(actual_write) / 1024 / 1024) / (static_cast(total_write) / 1024 / 1024)); return true; } } // namespace DM diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h index 5ee5c06ac35..a1a298a3360 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h @@ -23,9 +23,6 @@ using ColumnFileFlushTaskPtr = std::shared_ptr; class ColumnFileFlushTask { - friend class MemTableSet; - friend class ColumnFilePersistedSet; - public: struct Task { @@ -46,7 +43,6 @@ class ColumnFileFlushTask private: Tasks tasks; - ColumnFilePersisteds results; DMContext & context; MemTableSetPtr mem_table_set; size_t flush_version; @@ -54,8 +50,15 @@ class ColumnFileFlushTask public: ColumnFileFlushTask(DMContext & context_, const MemTableSetPtr & mem_table_set_, size_t flush_version_); + inline Task & addColumnFile(ColumnFilePtr column_file) { return tasks.emplace_back(column_file); } + + const Tasks & getAllTasks() const { return tasks; } + + // Persist data in ColumnFileInMemory DeltaIndex::Updates prepare(WriteBatches & wbs); + // Add the flushed column file to ColumnFilePersistedSet and remove the corresponding column file from MemTableSet + // Needs extra synchronization on the DeltaValueSpace bool commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs); }; } // namespace DM diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index 
93d9a9dc090..da121e26d92 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -29,7 +29,7 @@ inline void serializeColumnFilePersistedLevels(WriteBatches & wbs, PageId id, co wbs.meta.putPage(id, 0, buf.tryGetReadBuffer(), data_size); } -void ColumnFilePersistedSet::updateStats() +void ColumnFilePersistedSet::updateColumnFileStats() { size_t new_persisted_files_count = 0; size_t new_rows = 0; @@ -78,7 +78,7 @@ ColumnFilePersistedSet::ColumnFilePersistedSet(PageId metadata_id_, const Column // TODO: place column file to different levels persisted_files_levels.push_back(persisted_column_files); - updateStats(); + updateColumnFileStats(); } ColumnFilePersistedSetPtr ColumnFilePersistedSet::restore(DMContext & context, const RowKeyRange & segment_range, PageId id) @@ -103,6 +103,19 @@ void ColumnFilePersistedSet::recordRemoveColumnFilesPages(WriteBatches & wbs) co } } +BlockPtr ColumnFilePersistedSet::getLastSchema() +{ + for (auto level_it = persisted_files_levels.rend(); level_it != persisted_files_levels.rbegin(); ++level_it) + { + for (auto it = level_it->begin(); it != level_it->end(); ++it) + { + if (auto * t_file = (*it)->tryToTinyFile(); t_file) + return t_file->getSchema(); + } + } + return {}; +} + ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & context, const RowKeyRange & target_range, @@ -246,15 +259,19 @@ size_t ColumnFilePersistedSet::getValidCacheRows() const return cache_rows; } -// TODO: reset schema if same as before -bool ColumnFilePersistedSet::appendColumnStableFilesToLevel0(size_t prev_flush_version, const ColumnFilePersisteds & column_files, WriteBatches & wbs) +bool ColumnFilePersistedSet::checkAndUpdateFlushVersion(size_t task_flush_version) const { - if (prev_flush_version != flush_version) + if (task_flush_version != flush_version) { LOG_DEBUG(log, simpleInfo() << " Stop flush because structure got updated"); return 
false; } flush_version += 1; + return true; +} + +bool ColumnFilePersistedSet::appendPersistedColumnFilesToLevel0(const ColumnFilePersisteds & column_files, WriteBatches & wbs) +{ ColumnFilePersistedLevels new_persisted_files_levels; for (auto & level : persisted_files_levels) { @@ -265,6 +282,7 @@ bool ColumnFilePersistedSet::appendColumnStableFilesToLevel0(size_t prev_flush_v if (new_persisted_files_levels.empty()) new_persisted_files_levels.emplace_back(); auto & new_level_0 = new_persisted_files_levels[0]; + for (const auto & f : column_files) new_level_0.push_back(f); @@ -274,7 +292,7 @@ bool ColumnFilePersistedSet::appendColumnStableFilesToLevel0(size_t prev_flush_v /// Commit updates in memory. persisted_files_levels.swap(new_persisted_files_levels); - updateStats(); + updateColumnFileStats(); return true; } @@ -386,7 +404,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & /// Commit updates in memory. persisted_files_levels.swap(new_stable_files_levels); - updateStats(); + updateColumnFileStats(); return true; } diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h index c5a7d9edbbd..f5cb43afd15 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h @@ -47,15 +47,15 @@ class ColumnFilePersistedSet : public std::enable_shared_from_this bytes = 0; std::atomic deletes = 0; + /// below are just state resides in memory UInt64 flush_version = 0; - size_t next_compaction_level = 0; UInt64 minor_compaction_version = 0; Poco::Logger * log; private: - void updateStats(); + inline void updateColumnFileStats(); void checkColumnFiles(const ColumnFiles & new_column_files); @@ -93,6 +93,8 @@ class ColumnFilePersistedSet : public std::enable_shared_from_this(id_, persisted_files)) - , mem_table_set(std::make_shared(in_memory_files)) + , 
mem_table_set(std::make_shared(persisted_file_set->getLastSchema(), in_memory_files)) , delta_index(std::make_shared()) , log(&Poco::Logger::get("DeltaValueSpace")) {} DeltaValueSpace::DeltaValueSpace(ColumnFilePersistedSetPtr && persisted_file_set_) : persisted_file_set(std::move(persisted_file_set_)) - , mem_table_set(std::make_shared()) + , mem_table_set(std::make_shared(persisted_file_set->getLastSchema())) , delta_index(std::make_shared()) , log(&Poco::Logger::get("DeltaValueSpace")) {} diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h index cd9fe195de3..8764f4e9eaa 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h @@ -212,10 +212,10 @@ class DeltaValueSpace : public std::enable_shared_from_this bool ingestColumnFiles(DMContext & context, const RowKeyRange & range, const ColumnFiles & column_files, bool clear_data_in_range); - /// Flush the data of packs which haven't write to disk yet, and also save the metadata of packs. + /// Flush the data of column files which haven't write to disk yet, and also save the metadata of column files. bool flush(DMContext & context); - /// Compacts fragment packs into bigger one, to save some IOPS during reading. + /// Compacts fragment column files into bigger one, to save some IOPS during reading. bool compact(DMContext & context); /// Create a constant snapshot for read. 
diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp index ffb2f2a4881..3f813a978bd 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp @@ -1,36 +1,38 @@ +#include #include #include #include #include #include +namespace ProfileEvents +{ +extern const Event DMWriteBytes; +} + namespace DB { namespace DM { -BlockPtr MemTableSet::lastSchema() -{ - for (auto it = column_files.rbegin(); it != column_files.rend(); ++it) - { - if (auto * m_file = (*it)->tryToInMemoryFile(); m_file) - return m_file->getSchema(); - else if (auto * t_file = (*it)->tryToTinyFile(); t_file) - return t_file->getSchema(); - } - return {}; -} - void MemTableSet::appendColumnFileInner(const ColumnFilePtr & column_file) { - auto last_schema = lastSchema(); - + // If this column file's schema is identical to last_schema, then use the last_schema instance, + // so that we don't have to serialize my_schema instance. if (auto * m_file = column_file->tryToInMemoryFile(); m_file) { - // If this pack's schema is identical to last_schema, then use the last_schema instance, - // so that we don't have to serialize my_schema instance. auto my_schema = m_file->getSchema(); if (last_schema && my_schema && last_schema != my_schema && isSameSchema(*my_schema, *last_schema)) m_file->resetIdenticalSchema(last_schema); + else + last_schema = my_schema; + } + else if (auto * t_file = column_file->tryToTinyFile(); t_file) + { + auto my_schema = t_file->getSchema(); + if (last_schema && my_schema && last_schema != my_schema && isSameSchema(*my_schema, *last_schema)) + t_file->resetIdenticalSchema(last_schema); + else + last_schema = my_schema; } if (!column_files.empty()) @@ -68,9 +70,10 @@ void MemTableSet::appendToCache(DMContext & context, const Block & block, size_t if (!success) { // Create a new column file. 
- auto last_schema = lastSchema(); auto my_schema = (last_schema && isSameSchema(block, *last_schema)) ? last_schema : std::make_shared(block.cloneEmpty()); auto new_column_file = std::make_shared(my_schema); + // Must append the empty `new_column_file` to `column_files` before appending data to it, + // because `appendColumnFileInner` will update stats related to `column_files` but we will update stats related to `new_column_file` here. appendColumnFileInner(new_column_file); success = new_column_file->append(context, block, offset, limit, append_bytes); if (unlikely(!success)) @@ -86,16 +89,16 @@ void MemTableSet::appendDeleteRange(const RowKeyRange & delete_range) appendColumnFileInner(f); } -void MemTableSet::ingestColumnFiles(const RowKeyRange & range, const ColumnFiles & column_files_, bool clear_data_in_range) +void MemTableSet::ingestColumnFiles(const RowKeyRange & range, const ColumnFiles & new_column_files, bool clear_data_in_range) { - // Prepend a DeleteRange to clean data before applying packs + // Prepend a DeleteRange to clean data before applying column files if (clear_data_in_range) { auto f = std::make_shared(range); appendColumnFileInner(f); } - for (auto & f : column_files_) + for (const auto & f : new_column_files) { appendColumnFileInner(f); } @@ -113,7 +116,8 @@ ColumnFileSetSnapshotPtr MemTableSet::createSnapshot() size_t total_deletes = 0; for (const auto & file : column_files) { - // TODO: check the thread safety of ColumnFile + // All column files in MemTableSet are constant (except append data to the cache object in ColumnFileInMemory), + // and we always create new column file object when flushing, so it's safe to reuse the column file object here.
snap->column_files.push_back(file); total_rows += file->getRows(); total_deletes += file->getDeletes(); @@ -137,23 +141,21 @@ ColumnFileFlushTaskPtr MemTableSet::buildFlushTask(DMContext & context, size_t r size_t cur_rows_offset = rows_offset; size_t cur_deletes_offset = deletes_offset; size_t flush_rows = 0; - size_t flush_bytes = 0; size_t flush_deletes = 0; auto flush_task = std::make_shared(context, this->shared_from_this(), flush_version); for (auto & column_file : column_files) { - auto & task = flush_task->tasks.emplace_back(column_file); - if (auto * mfile = column_file->tryToInMemoryFile(); mfile) + auto & task = flush_task->addColumnFile(column_file); + if (auto * m_file = column_file->tryToInMemoryFile(); m_file) { task.rows_offset = cur_rows_offset; task.deletes_offset = cur_deletes_offset; - task.block_data = mfile->readDataForFlush(); + task.block_data = m_file->readDataForFlush(); } - flush_rows += column_file->getRows(); - flush_bytes += column_file->getBytes(); - flush_deletes += column_file->getDeletes(); cur_rows_offset += column_file->getRows(); cur_deletes_offset += column_file->getDeletes(); + flush_rows += column_file->getRows(); + flush_deletes += column_file->getDeletes(); } if (unlikely(flush_rows != rows || flush_deletes != deletes)) throw Exception("Rows and deletes check failed", ErrorCodes::LOGICAL_ERROR); @@ -163,11 +165,12 @@ ColumnFileFlushTaskPtr MemTableSet::buildFlushTask(DMContext & context, size_t r void MemTableSet::removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush_task) { - auto & tasks = flush_task.tasks; + const auto & tasks = flush_task.getAllTasks(); if (unlikely(tasks.size() > column_files.size())) throw Exception("column_files num check failed", ErrorCodes::LOGICAL_ERROR); ColumnFiles new_column_files; + size_t flush_bytes = 0; auto column_file_iter = column_files.begin(); for (const auto & task : tasks) { @@ -175,6 +178,7 @@ void MemTableSet::removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush 
{ throw Exception("column_files check failed", ErrorCodes::LOGICAL_ERROR); } + flush_bytes += task.column_file->getBytes(); column_file_iter++; } size_t new_rows = 0; @@ -191,6 +195,8 @@ void MemTableSet::removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush rows = new_rows; bytes = new_bytes; deletes = new_deletes; + + ProfileEvents::increment(ProfileEvents::DMWriteBytes, flush_bytes); } diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h index b9a0237f2fb..c010442dcd1 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h @@ -8,7 +8,10 @@ namespace DB { namespace DM { -/// MemTableSet contains column file which data just resides in memory. +class MemTableSet; +using MemTableSetPtr = std::shared_ptr; + +/// MemTableSet contains column file which data just resides in memory and it cannot be restored after restart. /// And the column files will be flushed periodically to ColumnFilePersistedSet. 
/// /// This class is not thread safe, manipulate on it requires acquire extra synchronization @@ -16,6 +19,9 @@ class MemTableSet : public std::enable_shared_from_this , private boost::noncopyable { private: + // to avoid serialize the same schema between continuous ColumnFileInMemory and ColumnFileTiny instance + BlockPtr last_schema; + ColumnFiles column_files; std::atomic rows = 0; @@ -25,13 +31,12 @@ class MemTableSet : public std::enable_shared_from_this Poco::Logger * log; private: - BlockPtr lastSchema(); - void appendColumnFileInner(const ColumnFilePtr & column_file); public: - explicit MemTableSet(const ColumnFiles & in_memory_files = {}) - : column_files(in_memory_files) + explicit MemTableSet(const BlockPtr & last_schema_, const ColumnFiles & in_memory_files = {}) + : last_schema(last_schema_) + , column_files(in_memory_files) , log(&Poco::Logger::get("MemTableSet")) { for (const auto & file : column_files) @@ -67,18 +72,16 @@ class MemTableSet : public std::enable_shared_from_this void appendDeleteRange(const RowKeyRange & delete_range); - void ingestColumnFiles(const RowKeyRange & range, const ColumnFiles & column_files_, bool clear_data_in_range); + void ingestColumnFiles(const RowKeyRange & range, const ColumnFiles & new_column_files, bool clear_data_in_range); /// Create a constant snapshot for read. - /// Returns empty if this instance is abandoned, you should try again. 
ColumnFileSetSnapshotPtr createSnapshot(); + /// Build a flush task which will try to flush all column files in MemTableSet now ColumnFileFlushTaskPtr buildFlushTask(DMContext & context, size_t rows_offset, size_t deletes_offset, size_t flush_version); void removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush_task); }; -using MemTableSetPtr = std::shared_ptr; - } // namespace DM } // namespace DB \ No newline at end of file From c5dfd47188ac824f6287140fd987e87d2893ad33 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Wed, 26 Jan 2022 19:51:42 +0800 Subject: [PATCH 07/23] do some rename --- .../Delta/ColumnFilePersistedSet.cpp | 43 ++++++++++++------- .../DeltaMerge/Delta/ColumnFilePersistedSet.h | 4 +- 2 files changed, 29 insertions(+), 18 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index da121e26d92..749fd92e404 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -13,9 +13,8 @@ namespace DB { namespace DM { -inline void serializeColumnFilePersistedLevels(WriteBatches & wbs, PageId id, const ColumnFilePersistedSet::ColumnFilePersistedLevels & file_levels) +inline ColumnFilePersisteds flattenColumnFileLevels(const ColumnFilePersistedSet::ColumnFilePersistedLevels & file_levels) { - MemoryWriteBuffer buf(0, COLUMN_FILE_SERIALIZE_BUFFER_SIZE); ColumnFilePersisteds column_files; for (const auto & level : file_levels) { @@ -24,6 +23,13 @@ inline void serializeColumnFilePersistedLevels(WriteBatches & wbs, PageId id, co column_files.emplace_back(file); } } + return column_files; +} + +inline void serializeColumnFilePersistedLevels(WriteBatches & wbs, PageId id, const ColumnFilePersistedSet::ColumnFilePersistedLevels & file_levels) +{ + MemoryWriteBuffer buf(0, COLUMN_FILE_SERIALIZE_BUFFER_SIZE); + auto column_files = flattenColumnFileLevels(file_levels); 
serializeSavedColumnFiles(buf, column_files); auto data_size = buf.count(); wbs.meta.putPage(id, 0, buf.tryGetReadBuffer(), data_size); @@ -51,22 +57,25 @@ void ColumnFilePersistedSet::updateColumnFileStats() deletes = new_deletes; } -void ColumnFilePersistedSet::checkColumnFiles(const ColumnFiles & new_column_files) +void ColumnFilePersistedSet::checkColumnFiles(const ColumnFilePersistedLevels & new_column_file_levels) { if constexpr (!DM_RUN_CHECK) return; size_t new_rows = 0; size_t new_deletes = 0; - - for (const auto & file : new_column_files) + for (const auto & level : new_column_file_levels) { - new_rows += file->getRows(); - new_deletes += file->isDeleteRange(); + for (const auto & file : level) + { + new_rows += file->getRows(); + new_deletes += file->isDeleteRange(); + } } + if (unlikely(new_rows != rows || new_deletes != deletes)) { LOG_ERROR(log, - "Rows and deletes check failed. Current packs: " << columnFilesToString(new_column_files) << ", new packs: " << columnFilesToString(new_column_files)); + "Rows and deletes check failed. 
Current packs: " << columnFilesToString(flattenColumnFileLevels(new_column_file_levels)) << ", new packs: " << columnFilesToString(flattenColumnFileLevels(new_column_file_levels))); throw Exception("Rows and deletes check failed.", ErrorCodes::LOGICAL_ERROR); } } @@ -259,7 +268,7 @@ size_t ColumnFilePersistedSet::getValidCacheRows() const return cache_rows; } -bool ColumnFilePersistedSet::checkAndUpdateFlushVersion(size_t task_flush_version) const +bool ColumnFilePersistedSet::checkAndUpdateFlushVersion(size_t task_flush_version) { if (task_flush_version != flush_version) { @@ -364,19 +373,19 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & LOG_WARNING(log, "Structure has been updated during compact"); return false; } - ColumnFilePersistedLevels new_stable_files_levels; + ColumnFilePersistedLevels new_persisted_files_levels; for (size_t i = 0; i < compaction->compaction_src_level; i++) { - auto & new_level = new_stable_files_levels.emplace_back(); + auto & new_level = new_persisted_files_levels.emplace_back(); for (const auto & f : persisted_files_levels[i]) new_level.push_back(f); } // Create a new empty level for `compaction_src_level` because all the column files is compacted to next level - new_stable_files_levels.emplace_back(); + new_persisted_files_levels.emplace_back(); // Add new file to the target level auto target_level = compaction->compaction_src_level + 1; - auto & target_level_files = new_stable_files_levels.emplace_back(); + auto & target_level_files = new_persisted_files_levels.emplace_back(); if (persisted_files_levels.size() > target_level) { for (auto & column_file : persisted_files_levels[target_level]) @@ -393,17 +402,19 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & // Append remaining levels for (size_t i = target_level + 1; i < persisted_files_levels.size(); i++) { - auto & new_level = new_stable_files_levels.emplace_back(); + auto & new_level = 
new_persisted_files_levels.emplace_back(); for (const auto & f : persisted_files_levels[i]) new_level.push_back(f); } + checkColumnFiles(new_persisted_files_levels); + /// Save the new metadata of column files to disk. - serializeColumnFilePersistedLevels(wbs, metadata_id, new_stable_files_levels); + serializeColumnFilePersistedLevels(wbs, metadata_id, new_persisted_files_levels); wbs.writeMeta(); /// Commit updates in memory. - persisted_files_levels.swap(new_stable_files_levels); + persisted_files_levels.swap(new_persisted_files_levels); updateColumnFileStats(); return true; diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h index f5cb43afd15..2a804b0c740 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h @@ -57,7 +57,7 @@ class ColumnFilePersistedSet : public std::enable_shared_from_this Date: Wed, 26 Jan 2022 20:28:40 +0800 Subject: [PATCH 08/23] fix getLastSchema --- .../DeltaMerge/Delta/ColumnFileFlushTask.cpp | 24 +++++++------------ .../Delta/ColumnFilePersistedSet.cpp | 8 +++---- 2 files changed, 13 insertions(+), 19 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp index 9d11dab4b96..0305c61e904 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp @@ -57,23 +57,17 @@ bool ColumnFileFlushTask::commit(ColumnFilePersistedSetPtr & persisted_file_set, ColumnFilePersistedPtr new_column_file; if (auto * m_file = task.column_file->tryToInMemoryFile(); m_file) { - bool is_small_file = m_file->getRows() < context.delta_small_pack_rows || m_file->getBytes() < context.delta_small_pack_bytes; - if (is_small_file) + // Just keep cache for really small column file + ColumnFile::CachePtr column_file_cache = nullptr; + if 
(m_file->getRows() < context.delta_small_pack_rows || m_file->getBytes() < context.delta_small_pack_bytes) { - new_column_file = std::make_shared(m_file->getSchema(), - m_file->getRows(), - m_file->getBytes(), - task.data_page, - !task.sorted ? m_file->getCache() : std::make_shared(std::move(task.block_data))); - } - else - { - new_column_file = std::make_shared(m_file->getSchema(), - m_file->getRows(), - m_file->getBytes(), - task.data_page, - nullptr); + column_file_cache = !task.sorted ? m_file->getCache() : std::make_shared(std::move(task.block_data)); } + new_column_file = std::make_shared(m_file->getSchema(), + m_file->getRows(), + m_file->getBytes(), + task.data_page, + column_file_cache); } else if (auto * t_file = task.column_file->tryToTinyFile(); t_file) { diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index 749fd92e404..2049ed9fead 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -82,9 +82,9 @@ void ColumnFilePersistedSet::checkColumnFiles(const ColumnFilePersistedLevels & ColumnFilePersistedSet::ColumnFilePersistedSet(PageId metadata_id_, const ColumnFilePersisteds & persisted_column_files) : metadata_id(metadata_id_) - , log(&Poco::Logger::get("ColumnStableFileSet")) + , log(&Poco::Logger::get("ColumnFilePersistedSet")) { - // TODO: place column file to different levels + // TODO: place column file to different levels, but it seems no need to do it currently because we only do minor compaction on really small files? 
persisted_files_levels.push_back(persisted_column_files); updateColumnFileStats(); @@ -114,9 +114,9 @@ void ColumnFilePersistedSet::recordRemoveColumnFilesPages(WriteBatches & wbs) co BlockPtr ColumnFilePersistedSet::getLastSchema() { - for (auto level_it = persisted_files_levels.rend(); level_it != persisted_files_levels.rbegin(); ++level_it) + for (const auto & level : persisted_files_levels) { - for (auto it = level_it->begin(); it != level_it->end(); ++it) + for (auto it = level.rbegin(); it != level.rend(); ++it) { if (auto * t_file = (*it)->tryToTinyFile(); t_file) return t_file->getSchema(); From 67e11a5827ce3952160ff55f185c720ac28c6c3c Mon Sep 17 00:00:00 2001 From: lidezhu Date: Thu, 27 Jan 2022 15:10:39 +0800 Subject: [PATCH 09/23] fix minor compaction --- .../Delta/ColumnFilePersistedSet.cpp | 28 +++++++++++++------ .../DeltaMerge/Delta/DeltaValueSpace.cpp | 2 +- .../DeltaMerge/Delta/MinorCompaction.cpp | 5 ++-- .../DeltaMerge/Delta/MinorCompaction.h | 7 ++--- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index 2049ed9fead..1a2765f1767 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -309,12 +309,13 @@ bool ColumnFilePersistedSet::appendPersistedColumnFilesToLevel0(const ColumnFile MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & context) { // Every time we try to compact all column files in a specific level. - // For ColumnTinyFile, we will try to combine small `ColumnTinyFile`s to a bigger one. - // For ColumnDeleteRangeFile and ColumnBigFile, we will simply move them to the next level. - // And only if there some small `ColumnTinyFile`s which can be combined together we will actually do the compaction. 
+ // For ColumnFileTiny, we will try to combine small `ColumnFileTiny`s to a bigger one. + // For ColumnFileDeleteRange and ColumnFileBig, we will simply move them to the next level. + // And only if there exists some small `ColumnFileTiny`s which can be combined together, we will actually do the compaction. size_t check_level_num = 0; while (check_level_num < persisted_files_levels.size()) { + check_level_num += 1; if (next_compaction_level >= persisted_files_levels.size()) next_compaction_level = 0; @@ -326,7 +327,7 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con MinorCompaction::Task cur_task; for (auto & file : level) { - auto packup_cur_task = [&]() { + auto pack_up_cur_task = [&]() { bool is_trivial_move = compaction->packUpTask(std::move(cur_task)); is_all_trivial_move = is_all_trivial_move && is_trivial_move; cur_task = {}; @@ -336,8 +337,8 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con { bool cur_task_full = cur_task.total_rows >= context.delta_small_pack_rows; bool small_column_file = t_file->getRows() < context.delta_small_pack_rows; - bool schema_ok - = cur_task.to_compact.empty(); + bool schema_ok = cur_task.to_compact.empty(); + if (!schema_ok) { if (auto * last_t_file = cur_task.to_compact.back()->tryToTinyFile(); last_t_file) @@ -345,13 +346,13 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con } if (cur_task_full || !small_column_file || !schema_ok) - packup_cur_task(); + pack_up_cur_task(); cur_task.addColumnFile(file); } else { - packup_cur_task(); + pack_up_cur_task(); cur_task.addColumnFile(file); } } @@ -434,7 +435,16 @@ ColumnFileSetSnapshotPtr ColumnFilePersistedSet::createSnapshot(const DMContext { for (const auto & file : level) { - snap->column_files.push_back(file); + if (auto * t = file->tryToTinyFile(); (t && t->getCache())) + { + // Compact threads could update the value of ColumnTinyFile::cache, + // and since ColumnFile is not 
multi-thread safe, we should create a new column file object. + snap->column_files.push_back(std::make_shared(*t)); + } + else + { + snap->column_files.push_back(file); + } total_rows += file->getRows(); total_deletes += file->getDeletes(); } diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp index 9ebf58df26c..202b59bb77f 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp @@ -237,7 +237,7 @@ bool DeltaValueSpace::compact(DMContext & context) LOG_DEBUG(log, simpleInfo() << " Stop compact because abandoned"); return false; } - if (!compaction_task->commit(wbs)) + if (!compaction_task->commit(persisted_file_set, wbs)) { LOG_WARNING(log, "Structure has been updated during compact"); wbs.rollbackWrittenLogAndData(); diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp index 78c8502dc36..f53fdaf0c57 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp @@ -1,9 +1,8 @@ -#include "MinorCompaction.h" - #include #include #include #include +#include #include #include @@ -51,7 +50,7 @@ void MinorCompaction::prepare(DMContext & context, WriteBatches & wbs, const Pag } } -bool MinorCompaction::commit(WriteBatches & wbs) +bool MinorCompaction::commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs) { return persisted_file_set->installCompactionResults(shared_from_this(), wbs); } diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h index 35b4a2ffddc..37a27d810cf 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h @@ -42,7 +42,6 @@ class MinorCompaction : public std::enable_shared_from_this size_t compaction_src_level; size_t
current_compaction_version; - ColumnFilePersistedSetPtr persisted_file_set; size_t total_compact_files = 0; size_t total_compact_rows = 0; @@ -53,8 +52,8 @@ class MinorCompaction : public std::enable_shared_from_this // return whether this task is a trivial move inline bool packUpTask(Task && task) { - if (unlikely(task.to_compact.empty())) - throw Exception("task shouldn't be empty", ErrorCodes::LOGICAL_ERROR); + if (task.to_compact.empty()) + return true; bool is_trivial_move = false; if (task.to_compact.size() == 1) @@ -76,7 +75,7 @@ class MinorCompaction : public std::enable_shared_from_this void prepare(DMContext & context, WriteBatches & wbs, const PageReader & reader); - bool commit(WriteBatches & wbs); + bool commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs); String info() const; }; From 944e293975c6ab958eb7b8616b404f685b93b809 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Thu, 27 Jan 2022 15:22:55 +0800 Subject: [PATCH 10/23] add minor compaction version check --- .../Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp | 3 ++- dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp | 3 ++- dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h | 6 ++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index 1a2765f1767..a1ddd07094c 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -319,7 +319,7 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con if (next_compaction_level >= persisted_files_levels.size()) next_compaction_level = 0; - auto compaction = std::make_shared(next_compaction_level); + auto compaction = std::make_shared(next_compaction_level, minor_compaction_version); auto & level = persisted_files_levels[next_compaction_level]; if (!level.empty()) { @@ -374,6 
+374,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & LOG_WARNING(log, "Structure has been updated during compact"); return false; } + minor_compaction_version += 1; ColumnFilePersistedLevels new_persisted_files_levels; for (size_t i = 0; i < compaction->compaction_src_level; i++) { diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp index f53fdaf0c57..2ca1d2219c0 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp @@ -10,8 +10,9 @@ namespace DB { namespace DM { -MinorCompaction::MinorCompaction(size_t compaction_src_level_) +MinorCompaction::MinorCompaction(size_t compaction_src_level_, size_t current_compaction_version_) : compaction_src_level{compaction_src_level_} + , current_compaction_version{current_compaction_version_} {} void MinorCompaction::prepare(DMContext & context, WriteBatches & wbs, const PageReader & reader) diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h index 37a27d810cf..852fbecc8de 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h @@ -12,6 +12,8 @@ using ColumnFilePersistedSetPtr = std::shared_ptr; class MinorCompaction; using MinorCompactionPtr = std::shared_ptr; +/// Combine small `ColumnFileTiny`s to a bigger one and move it to the next level. +/// For `ColumnFileBig` and `ColumnFileDeleteRange`, it just moves it to the next level. 
class MinorCompaction : public std::enable_shared_from_this { friend class ColumnFilePersistedSet; @@ -47,7 +49,7 @@ class MinorCompaction : public std::enable_shared_from_this size_t total_compact_rows = 0; public: - explicit MinorCompaction(size_t compaction_src_level_); + explicit MinorCompaction(size_t compaction_src_level_, size_t current_compaction_version_); // return whether this task is a trivial move inline bool packUpTask(Task && task) @@ -58,7 +60,7 @@ class MinorCompaction : public std::enable_shared_from_this bool is_trivial_move = false; if (task.to_compact.size() == 1) { - // Maybe this column file is small, but it cannot be merged with other packs, so also remove it's cache if possible. + // Maybe this column file is small, but it cannot be merged with other column files, so also remove it's cache if possible. for (auto & f : task.to_compact) { if (auto * t_file = f->tryToTinyFile(); t_file) From 60789efda04f2b8221d375b9330c2d7d54b0931b Mon Sep 17 00:00:00 2001 From: lidezhu Date: Fri, 28 Jan 2022 18:18:14 +0800 Subject: [PATCH 11/23] more rename --- .../DeltaMerge/Delta/ColumnFileFlushTask.cpp | 2 +- .../Delta/ColumnFilePersistedSet.cpp | 28 +++++++++++-------- .../DeltaMerge/Delta/ColumnFilePersistedSet.h | 6 ++-- .../DeltaMerge/Delta/DeltaValueSpace.h | 2 +- .../Storages/DeltaMerge/Delta/MemTableSet.h | 2 +- .../DeltaMerge/Delta/MinorCompaction.h | 2 +- 6 files changed, 24 insertions(+), 18 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp index 0305c61e904..7c994e8cc58 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp @@ -47,7 +47,7 @@ DeltaIndex::Updates ColumnFileFlushTask::prepare(WriteBatches & wbs) bool ColumnFileFlushTask::commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs) { - if 
(!persisted_file_set->checkAndUpdateFlushVersion(flush_version)) + if (!persisted_file_set->checkAndIncreaseFlushVersion(flush_version)) return false; /// Create new column file instance for ColumnFilePersistedSet diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index a1ddd07094c..2b127408886 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -158,6 +158,10 @@ ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & c it_2++; } } + else + { + check_success = false; + } if (unlikely(!check_success)) { @@ -176,35 +180,36 @@ ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & c if (level_it == persisted_files_levels.rend()) break; it_2 = level_it->begin(); + continue; } const auto & column_file = *it_2; - if (auto * cr = column_file->tryToDeleteRange(); cr) + if (auto * d_file = column_file->tryToDeleteRange(); d_file) { - auto new_dr = cr->getDeleteRange().shrink(target_range); + auto new_dr = d_file->getDeleteRange().shrink(target_range); if (!new_dr.none()) { // Only use the available delete_range pack. - cloned_tail.push_back(cr->cloneWith(new_dr)); + cloned_tail.push_back(d_file->cloneWith(new_dr)); } } - else if (auto * tf = column_file->tryToTinyFile(); tf) + else if (auto * t_file = column_file->tryToTinyFile(); t_file) { // Use a newly created page_id to reference the data page_id of current column file. 
PageId new_data_page_id = context.storage_pool.newLogPageId(); - wbs.log.putRefPage(new_data_page_id, tf->getDataPageId()); - auto new_column_file = tf->cloneWith(new_data_page_id); + wbs.log.putRefPage(new_data_page_id, t_file->getDataPageId()); + auto new_column_file = t_file->cloneWith(new_data_page_id); cloned_tail.push_back(new_column_file); } - else if (auto * f = column_file->tryToBigFile(); f) + else if (auto * b_file = column_file->tryToBigFile(); b_file) { auto delegator = context.path_pool.getStableDiskDelegator(); auto new_ref_id = context.storage_pool.newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); - auto file_id = f->getFile()->fileId(); + auto file_id = b_file->getFile()->fileId(); wbs.data.putRefPage(new_ref_id, file_id); auto file_parent_path = delegator.getDTFilePath(file_id); auto new_file = DMFile::restore(context.db_context.getFileProvider(), file_id, /* ref_id= */ new_ref_id, file_parent_path, DMFile::ReadMetaMode::all()); - auto new_big_file = f->cloneWith(context, new_file, target_range); + auto new_big_file = b_file->cloneWith(context, new_file, target_range); cloned_tail.push_back(new_big_file); } else @@ -268,7 +273,7 @@ size_t ColumnFilePersistedSet::getValidCacheRows() const return cache_rows; } -bool ColumnFilePersistedSet::checkAndUpdateFlushVersion(size_t task_flush_version) +bool ColumnFilePersistedSet::checkAndIncreaseFlushVersion(size_t task_flush_version) { if (task_flush_version != flush_version) { @@ -388,6 +393,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & // Add new file to the target level auto target_level = compaction->compaction_src_level + 1; auto & target_level_files = new_persisted_files_levels.emplace_back(); + // clone the old column files in the target level first if (persisted_files_levels.size() > target_level) { for (auto & column_file : persisted_files_levels[target_level]) @@ -439,7 +445,7 @@ ColumnFileSetSnapshotPtr ColumnFilePersistedSet::createSnapshot(const 
DMContext if (auto * t = file->tryToTinyFile(); (t && t->getCache())) { // Compact threads could update the value of ColumnTinyFile::cache, - // and since ColumnFile is not mult-threads safe, we should create a new column file object. + // and since ColumnFile is not multi-threads safe, we should create a new column file object. snap->column_files.push_back(std::make_shared(*t)); } else diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h index 2a804b0c740..0af64127848 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h @@ -30,7 +30,7 @@ namespace DM class ColumnFilePersistedSet; using ColumnFilePersistedSetPtr = std::shared_ptr; -/// This class is not thread safe, manipulate on it requires acquire extra synchronization +/// This class is not thread safe, manipulate on it requires acquire extra synchronization on the DeltaValueSpace class ColumnFilePersistedSet : public std::enable_shared_from_this , private boost::noncopyable { @@ -66,7 +66,7 @@ class ColumnFilePersistedSet : public std::enable_shared_from_this using Lock = std::unique_lock; private: - /// column files in `stable_file_set` are all persisted in disks and can be restored after restart. + /// column files in `persisted_file_set` are all persisted in disks and can be restored after restart. /// column files in `mem_table_set` just resides in memory. 
/// /// Note that `persisted_file_set` and `mem_table_set` also forms a one-dimensional space diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h index c010442dcd1..79fe4cbdf22 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h @@ -14,7 +14,7 @@ using MemTableSetPtr = std::shared_ptr; /// MemTableSet contains column file which data just resides in memory and it cannot be restored after restart. /// And the column files will be flushed periodically to ColumnFilePersistedSet. /// -/// This class is not thread safe, manipulate on it requires acquire extra synchronization +/// This class is not thread safe, manipulate on it requires acquire extra synchronization on the DeltaValueSpace class MemTableSet : public std::enable_shared_from_this , private boost::noncopyable { diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h index 852fbecc8de..8b2a4143a7e 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h @@ -49,7 +49,7 @@ class MinorCompaction : public std::enable_shared_from_this size_t total_compact_rows = 0; public: - explicit MinorCompaction(size_t compaction_src_level_, size_t current_compaction_version_); + MinorCompaction(size_t compaction_src_level_, size_t current_compaction_version_); // return whether this task is a trivial move inline bool packUpTask(Task && task) From b91fd4f248dc0d1778c7d3f98fd35f7393e0a12a Mon Sep 17 00:00:00 2001 From: lidezhu Date: Fri, 28 Jan 2022 20:49:48 +0800 Subject: [PATCH 12/23] improve log --- .../DeltaMerge/Delta/ColumnFileFlushTask.h | 14 +++++++++++++- .../Storages/DeltaMerge/Delta/DeltaValueSpace.cpp | 4 ++++ dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp | 6 +----- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git 
a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h index a1a298a3360..aaf4a21eb87 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.h @@ -47,13 +47,25 @@ class ColumnFileFlushTask MemTableSetPtr mem_table_set; size_t flush_version; + size_t flush_rows = 0; + size_t flush_deletes = 0; + public: ColumnFileFlushTask(DMContext & context_, const MemTableSetPtr & mem_table_set_, size_t flush_version_); - inline Task & addColumnFile(ColumnFilePtr column_file) { return tasks.emplace_back(column_file); } + inline Task & addColumnFile(ColumnFilePtr column_file) + { + flush_rows += column_file->getRows(); + flush_deletes += column_file->getDeletes(); + return tasks.emplace_back(column_file); + } const Tasks & getAllTasks() const { return tasks; } + size_t getTaskNum() const { return tasks.size(); } + size_t getFlushRows() const { return flush_rows; } + size_t getFlushDeletes() const { return flush_deletes; } + // Persist data in ColumnFileInMemory DeltaIndex::Updates prepare(WriteBatches & wbs); diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp index 202b59bb77f..7500b4dc56c 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.cpp @@ -185,6 +185,10 @@ bool DeltaValueSpace::flush(DMContext & context) /// Update delta tree if (new_delta_index) delta_index = new_delta_index; + + LOG_DEBUG(log, + simpleInfo() << " Flush end. 
Flushed " << flush_task->getTaskNum() << " column files, " << flush_task->getFlushRows() << " rows and " << flush_task->getFlushDeletes() + << " deletes."); } return true; } diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp index 3f813a978bd..eb6cf9947db 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp @@ -140,8 +140,6 @@ ColumnFileFlushTaskPtr MemTableSet::buildFlushTask(DMContext & context, size_t r size_t cur_rows_offset = rows_offset; size_t cur_deletes_offset = deletes_offset; - size_t flush_rows = 0; - size_t flush_deletes = 0; auto flush_task = std::make_shared(context, this->shared_from_this(), flush_version); for (auto & column_file : column_files) { @@ -154,10 +152,8 @@ ColumnFileFlushTaskPtr MemTableSet::buildFlushTask(DMContext & context, size_t r } cur_rows_offset += column_file->getRows(); cur_deletes_offset += column_file->getDeletes(); - flush_rows += column_file->getRows(); - flush_deletes += column_file->getDeletes(); } - if (unlikely(flush_rows != rows || flush_deletes != deletes)) + if (unlikely(flush_task->getFlushRows() != rows || flush_task->getFlushDeletes() != deletes)) throw Exception("Rows and deletes check failed", ErrorCodes::LOGICAL_ERROR); return flush_task; From e5197afcd53a446295b543b3037c6efe0c444830 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Mon, 7 Feb 2022 18:47:07 +0800 Subject: [PATCH 13/23] Rename and fix static analysis error --- dbms/src/DataStreams/ConcatBlockInputStream.h | 11 ++- dbms/src/Interpreters/Settings.h | 4 +- .../DeltaMerge/ColumnFile/ColumnFileBig.cpp | 2 +- .../ColumnFile/ColumnFileInMemory.cpp | 4 +- .../ColumnFile/ColumnFilePersisted.cpp | 6 +- .../ColumnFile/ColumnFilePersisted.h | 5 +- .../ColumnFile/ColumnFileSetReader.cpp | 92 ++++++++++--------- .../ColumnFile/ColumnFileSetReader.h | 12 +-- .../ColumnFile/ColumnFileSetSnapshot.cpp | 5 +- 
.../DeltaMerge/ColumnFile/ColumnFileTiny.cpp | 12 +-- .../DeltaMerge/ColumnFile/ColumnFile_V2.cpp | 12 +-- dbms/src/Storages/DeltaMerge/DMContext.h | 12 +-- .../DeltaMerge/DMSegmentThreadInputStream.h | 6 +- .../DeltaMerge/Delta/ColumnFileFlushTask.cpp | 2 +- .../Delta/ColumnFilePersistedSet.cpp | 34 +++---- .../DeltaMerge/Delta/ColumnFilePersistedSet.h | 12 ++- .../DeltaMerge/Delta/DeltaValueSpace.cpp | 38 ++++---- .../DeltaMerge/Delta/DeltaValueSpace.h | 10 +- .../Storages/DeltaMerge/Delta/MemTableSet.cpp | 64 ++++++++++++- .../Storages/DeltaMerge/Delta/MemTableSet.h | 4 +- .../DeltaMerge/Delta/MinorCompaction.cpp | 2 +- .../DeltaMerge/Delta/MinorCompaction.h | 4 +- .../Storages/DeltaMerge/Delta/Snapshot.cpp | 15 ++- .../Storages/DeltaMerge/DeltaMergeStore.cpp | 73 +++++++-------- .../src/Storages/DeltaMerge/DeltaMergeStore.h | 4 +- dbms/src/Storages/DeltaMerge/Segment.cpp | 20 ++-- 26 files changed, 272 insertions(+), 193 deletions(-) diff --git a/dbms/src/DataStreams/ConcatBlockInputStream.h b/dbms/src/DataStreams/ConcatBlockInputStream.h index 1c18a985949..11bba3a0427 100644 --- a/dbms/src/DataStreams/ConcatBlockInputStream.h +++ b/dbms/src/DataStreams/ConcatBlockInputStream.h @@ -14,21 +14,24 @@ class ConcatBlockInputStream : public IProfilingBlockInputStream { public: ConcatBlockInputStream(BlockInputStreams inputs_, const LogWithPrefixPtr & log_) - : log(getMPPTaskLog(log_, getName())) + : log(getMPPTaskLog(log_, getNameImpl())) { children.insert(children.end(), inputs_.begin(), inputs_.end()); current_stream = children.begin(); } - String getName() const override { return "Concat"; } + String getName() const override { return getNameImpl(); } + + // Add this function because static analysis forbids calling virtual function in constructor + inline String getNameImpl() const { return "Concat"; } Block getHeader() const override { return children.at(0)->getHeader(); } protected: Block readImpl() override { - FilterPtr filter_; - return readImpl(filter_, 
false); + FilterPtr filter; + return readImpl(filter, false); } Block readImpl(FilterPtr & res_filter, bool return_filter) override diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index e82866eba39..fa24f3099b9 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -253,8 +253,8 @@ struct Settings M(SettingUInt64, dt_segment_stop_write_delta_size, 2147483648, "Delta size before stop new writes. 2 GB by default.") \ M(SettingUInt64, dt_segment_delta_cache_limit_rows, 4096, "Max rows of cache in segment delta in DeltaTree Engine.") \ M(SettingUInt64, dt_segment_delta_cache_limit_size, 4194304, "Max size of cache in segment delta in DeltaTree Engine. 4 MB by default.") \ - M(SettingUInt64, dt_segment_delta_small_pack_rows, 2048, "Determine whether a pack in delta is small or not.") \ - M(SettingUInt64, dt_segment_delta_small_pack_size, 8388608, "Determine whether a pack in delta is small or not. 8MB by default.") \ + M(SettingUInt64, dt_segment_delta_small_column_file_rows, 2048, "Determine whether a column file in delta is small or not.") \ + M(SettingUInt64, dt_segment_delta_small_column_file_size, 8388608, "Determine whether a column file in delta is small or not. 
8MB by default.") \ M(SettingUInt64, dt_segment_stable_pack_rows, DEFAULT_MERGE_BLOCK_SIZE, "Expected stable pack rows in DeltaTree Engine.") \ M(SettingFloat, dt_segment_wait_duration_factor, 1, "The factor of wait duration in a write stall.") \ M(SettingUInt64, dt_bg_gc_check_interval, 600, "Background gc thread check interval, the unit is second.") \ diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp index d8cfd75c5f9..537eacc3d0f 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileBig.cpp @@ -56,7 +56,7 @@ ColumnFilePersistedPtr ColumnFileBig::deserializeMetadata(DMContext & context, / auto dmfile = DMFile::restore(context.db_context.getFileProvider(), file_id, file_ref_id, file_parent_path, DMFile::ReadMetaMode::all()); - auto dp_file = new ColumnFileBig(dmfile, valid_rows, valid_bytes, segment_range); + auto * dp_file = new ColumnFileBig(dmfile, valid_rows, valid_bytes, segment_range); return std::shared_ptr(dp_file); } diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.cpp index 3804fd88c92..99a6165f9c5 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileInMemory.cpp @@ -62,8 +62,8 @@ bool ColumnFileInMemory::append(DMContext & context, const Block & data, size_t for (size_t i = 0; i < cache->block.columns(); ++i) { - auto & col = data.getByPosition(i).column; - auto & cache_col = *cache->block.getByPosition(i).column; + const auto & col = data.getByPosition(i).column; + const auto & cache_col = *cache->block.getByPosition(i).column; auto * mutable_cache_col = const_cast(&cache_col); mutable_cache_col->insertRangeFrom(*col, offset, limit); } diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp 
b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp index 017583207c5..0d7f044ad38 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.cpp @@ -15,7 +15,7 @@ void serializeSchema(WriteBuffer & buf, const BlockPtr & schema) { if (schema) { - writeIntBinary((UInt32)schema->columns(), buf); + writeIntBinary(static_cast(schema->columns()), buf); for (auto & col : *schema) { writeIntBinary(col.column_id, buf); @@ -25,7 +25,7 @@ void serializeSchema(WriteBuffer & buf, const BlockPtr & schema) } else { - writeIntBinary((UInt32)0, buf); + writeIntBinary(static_cast(0), buf); } } @@ -70,7 +70,7 @@ void deserializeColumn(IColumn & column, const DataTypePtr & type, const ByteBuf type->deserializeBinaryBulkWithMultipleStreams(column, // [&](const IDataType::SubstreamPath &) { return &compressed; }, rows, - (double)(data_buf.size()) / rows, + static_cast(data_buf.size()) / rows, true, {}); } diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.h index 75a45717a02..a3f92f9af84 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFilePersisted.h @@ -27,10 +27,9 @@ BlockPtr deserializeSchema(ReadBuffer & buf); void serializeColumn(MemoryWriteBuffer & buf, const IColumn & column, const DataTypePtr & type, size_t offset, size_t limit, bool compress); void deserializeColumn(IColumn & column, const DataTypePtr & type, const ByteBuffer & data_buf, size_t rows); -/// Serialize those packs' metadata into buf. -/// Note that this method stop at the first unsaved pack. +/// Serialize those column files' metadata into buf. void serializeSavedColumnFiles(WriteBuffer & buf, const ColumnFilePersisteds & column_files); -/// Recreate pack instances from buf. +/// Recreate column file instances from buf. 
ColumnFilePersisteds deserializeSavedColumnFiles(DMContext & context, const RowKeyRange & segment_range, ReadBuffer & buf); void serializeSavedColumnFilesInV2Format(WriteBuffer & buf, const ColumnFilePersisteds & column_files); diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp index b5372f7d4d9..8676705474f 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp @@ -18,15 +18,15 @@ std::pair findColumnFile(const ColumnFiles & column_files, size_ { if (rows_count == rows_offset && deletes_count == deletes_offset) return {column_file_index, 0}; - auto & column_file = column_files[column_file_index]; + const auto & column_file = column_files[column_file_index]; if (column_file->isDeleteRange()) { if (deletes_count == deletes_offset) { if (unlikely(rows_count != rows_offset)) - throw Exception("rows_count and rows_offset are expected to be equal. pack_index: " + DB::toString(column_file_index) - + ", pack_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) + throw Exception("rows_count and rows_offset are expected to be equal. column_file_index: " + DB::toString(column_file_index) + + ", column_file_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) + ", deletes_offset: " + DB::toString(deletes_offset)); return {column_file_index, 0}; @@ -39,8 +39,8 @@ std::pair findColumnFile(const ColumnFiles & column_files, size_ if (rows_count > rows_offset) { if (unlikely(deletes_count != deletes_offset)) - throw Exception("deletes_count and deletes_offset are expected to be equal. 
pack_index: " + DB::toString(column_file_index) - + ", pack_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) + throw Exception("deletes_count and deletes_offset are expected to be equal. column_file_index: " + DB::toString(column_file_index) + + ", column_file_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) + ", deletes_offset: " + DB::toString(deletes_offset), ErrorCodes::LOGICAL_ERROR); @@ -50,7 +50,7 @@ std::pair findColumnFile(const ColumnFiles & column_files, size_ } } if (rows_count != rows_offset || deletes_count != deletes_offset) - throw Exception("illegal rows_offset and deletes_offset. pack_size: " + DB::toString(column_files.size()) + throw Exception("illegal rows_offset and deletes_offset. column_file_size: " + DB::toString(column_files.size()) + ", rows_count: " + DB::toString(rows_count) + ", rows_offset: " + DB::toString(rows_offset) + ", deletes_count: " + DB::toString(deletes_count) + ", deletes_offset: " + DB::toString(deletes_offset), ErrorCodes::LOGICAL_ERROR); @@ -127,75 +127,79 @@ size_t ColumnFileSetReader::readRows(MutableColumns & output_columns, size_t off if (end == start) return 0; - auto [start_pack_index, rows_start_in_start_pack] = locatePosByAccumulation(column_file_rows_end, start); - auto [end_pack_index, rows_end_in_end_pack] = locatePosByAccumulation(column_file_rows_end, end); + auto [start_file_index, rows_start_in_start_file] = locatePosByAccumulation(column_file_rows_end, start); + auto [end_file_index, rows_end_in_end_file] = locatePosByAccumulation(column_file_rows_end, end); size_t actual_read = 0; - for (size_t pack_index = start_pack_index; pack_index <= end_pack_index; ++pack_index) + for (size_t file_index = start_file_index; file_index <= end_file_index; ++file_index) { - size_t rows_start_in_pack = pack_index == start_pack_index ? 
rows_start_in_start_pack : 0; - size_t rows_end_in_pack = pack_index == end_pack_index ? rows_end_in_end_pack : column_file_rows[pack_index]; - size_t rows_in_pack_limit = rows_end_in_pack - rows_start_in_pack; + size_t rows_start_in_file = file_index == start_file_index ? rows_start_in_start_file : 0; + size_t rows_end_in_file = file_index == end_file_index ? rows_end_in_end_file : column_file_rows[file_index]; + size_t rows_in_file_limit = rows_end_in_file - rows_start_in_file; // Nothing to read. - if (rows_start_in_pack == rows_end_in_pack) + if (rows_in_file_limit == 0) continue; - auto & column_file_reader = column_file_readers[pack_index]; - actual_read += column_file_reader->readRows(output_columns, rows_start_in_pack, rows_in_pack_limit, range); + auto & column_file_reader = column_file_readers[file_index]; + actual_read += column_file_reader->readRows(output_columns, rows_start_in_file, rows_in_file_limit, range); } return actual_read; } void ColumnFileSetReader::getPlaceItems(BlockOrDeletes & place_items, size_t rows_begin, size_t deletes_begin, size_t rows_end, size_t deletes_end, size_t place_rows_offset) { - /// Note that we merge the consecutive DeltaPackBlock together, which are seperated in groups by DeltaPackDelete and DeltePackFile. + /// Note that we merge the consecutive ColumnFileInMemory or ColumnFileTiny together, which are seperated in groups by ColumnFileDeleteRange and ColumnFileBig. 
auto & column_files = snapshot->getColumnFiles(); - auto [start_pack_index, rows_start_in_start_pack] = findColumnFile(column_files, rows_begin, deletes_begin); - auto [end_pack_index, rows_end_in_end_pack] = findColumnFile(column_files, rows_end, deletes_end); + auto [start_file_index, rows_start_in_start_file] = findColumnFile(column_files, rows_begin, deletes_begin); + auto [end_file_index, rows_end_in_end_file] = findColumnFile(column_files, rows_end, deletes_end); size_t block_rows_start = rows_begin; size_t block_rows_end = rows_begin; - for (size_t pack_index = start_pack_index; pack_index < column_files.size() && pack_index <= end_pack_index; ++pack_index) + for (size_t file_index = start_file_index; file_index < column_files.size() && file_index <= end_file_index; ++file_index) { - auto & pack = *column_files[pack_index]; + auto & column_file = *column_files[file_index]; - if (pack.isDeleteRange() || pack.isBigFile()) + if (column_file.isDeleteRange() || column_file.isBigFile()) { - // First, compact the DeltaPackBlocks before this pack into one block. + // First, compact the ColumnFileInMemory or ColumnFileTiny before this column file into one block. if (block_rows_end != block_rows_start) { auto block = readPKVersion(block_rows_start, block_rows_end - block_rows_start); place_items.emplace_back(std::move(block), block_rows_start + place_rows_offset); } - // Second, take current pack. - if (auto * pack_delete = pack.tryToDeleteRange(); pack_delete) + // Second, take current column file. 
+ if (auto * dr = column_file.tryToDeleteRange(); dr) { - place_items.emplace_back(pack_delete->getDeleteRange()); + place_items.emplace_back(dr->getDeleteRange()); } - else if (pack.isBigFile() && pack.getRows()) + else if (column_file.isBigFile() && column_file.getRows()) { - auto block = readPKVersion(block_rows_end, pack.getRows()); + auto block = readPKVersion(block_rows_end, column_file.getRows()); place_items.emplace_back(std::move(block), block_rows_end + place_rows_offset); } + else + { + throw Exception("Unknown column file type", ErrorCodes::LOGICAL_ERROR); + } - block_rows_end += pack.getRows(); + block_rows_end += column_file.getRows(); block_rows_start = block_rows_end; } else { - // It is a DeltaPackBlock. - size_t rows_start_in_pack = pack_index == start_pack_index ? rows_start_in_start_pack : 0; - size_t rows_end_in_pack = pack_index == end_pack_index ? rows_end_in_end_pack : pack.getRows(); + // It is a ColumnFileInMemory or ColumnFileTiny. + size_t rows_start_in_file = file_index == start_file_index ? rows_start_in_start_file : 0; + size_t rows_end_in_file = file_index == end_file_index ? rows_end_in_end_file : column_file.getRows(); - block_rows_end += rows_end_in_pack - rows_start_in_pack; + block_rows_end += rows_end_in_file - rows_start_in_file; - if (pack_index == column_files.size() - 1 || pack_index == end_pack_index) + if (file_index == column_files.size() - 1 || file_index == end_file_index) { - // It is the last pack. + // It is the last column file. 
if (block_rows_end != block_rows_start) { auto block = readPKVersion(block_rows_start, block_rows_end - block_rows_start); @@ -213,22 +217,22 @@ bool ColumnFileSetReader::shouldPlace(const DMContext & context, size_t placed_rows) { auto & column_files = snapshot->getColumnFiles(); - auto [start_pack_index, rows_start_in_start_pack] = locatePosByAccumulation(column_file_rows_end, placed_rows); + auto [start_file_index, rows_start_in_start_file] = locatePosByAccumulation(column_file_rows_end, placed_rows); - for (size_t pack_index = start_pack_index; pack_index < snapshot->getColumnFileCount(); ++pack_index) + for (size_t file_index = start_file_index; file_index < snapshot->getColumnFileCount(); ++file_index) { - auto & column_file = column_files[pack_index]; + auto & column_file = column_files[file_index]; - // Always do place index if DeltaPackFile exists. + // Always do place index if ColumnFileBig exists. if (column_file->isBigFile()) return true; if (unlikely(column_file->isDeleteRange())) - throw Exception("pack is delete range", ErrorCodes::LOGICAL_ERROR); + throw Exception("column file is delete range", ErrorCodes::LOGICAL_ERROR); - size_t rows_start_in_pack = pack_index == start_pack_index ? rows_start_in_start_pack : 0; - size_t rows_end_in_pack = column_file_rows[pack_index]; + size_t rows_start_in_file = file_index == start_file_index ? 
rows_start_in_start_file : 0; + size_t rows_end_in_file = column_file_rows[file_index]; - auto & column_file_reader = column_file_readers[pack_index]; + auto & column_file_reader = column_file_readers[file_index]; if (column_file->isInMemoryFile()) { auto & dpb_reader = typeid_cast(*column_file_reader); @@ -238,7 +242,7 @@ bool ColumnFileSetReader::shouldPlace(const DMContext & context, auto rkcc = RowKeyColumnContainer(pk_column, context.is_common_handle); const auto & version_col_data = toColumnVectorData(version_column); - for (auto i = rows_start_in_pack; i < rows_end_in_pack; ++i) + for (auto i = rows_start_in_file; i < rows_end_in_file; ++i) { if (version_col_data[i] <= max_version && relevant_range.check(rkcc.getRowKeyValue(i))) return true; @@ -253,7 +257,7 @@ bool ColumnFileSetReader::shouldPlace(const DMContext & context, auto rkcc = RowKeyColumnContainer(pk_column, context.is_common_handle); const auto & version_col_data = toColumnVectorData(version_column); - for (auto i = rows_start_in_pack; i < rows_end_in_pack; ++i) + for (auto i = rows_start_in_file; i < rows_end_in_file; ++i) { if (version_col_data[i] <= max_version && relevant_range.check(rkcc.getRowKeyValue(i))) return true; diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h index 5a69fc716dc..340a62bb4ee 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h @@ -59,7 +59,7 @@ class ColumnFileSetInputStream : public IBlockInputStream size_t column_files_count; ColumnFileReaderPtr cur_column_file_reader = {}; - size_t next_pack_index = 0; + size_t next_file_index = 0; public: ColumnFileSetInputStream(const DMContext & context_, @@ -76,19 +76,19 @@ class ColumnFileSetInputStream : public IBlockInputStream Block read() override { - while (cur_column_file_reader || next_pack_index < column_files_count) + while 
(cur_column_file_reader || next_file_index < column_files_count) { if (!cur_column_file_reader) { - if (column_files[next_pack_index]->isDeleteRange()) + if (column_files[next_file_index]->isDeleteRange()) { - ++next_pack_index; + ++next_file_index; continue; } else { - cur_column_file_reader = reader.column_file_readers[next_pack_index]; - ++next_pack_index; + cur_column_file_reader = reader.column_file_readers[next_file_index]; + ++next_file_index; } } Block block = cur_column_file_reader->readNextBlock(); diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp index 25df32745f7..f34be590485 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetSnapshot.cpp @@ -9,10 +9,9 @@ namespace DM RowKeyRange ColumnFileSetSnapshot::getSquashDeleteRange() const { RowKeyRange squashed_delete_range = RowKeyRange::newNone(is_common_handle, rowkey_column_size); - for (auto iter = column_files.cbegin(); iter != column_files.cend(); ++iter) + for (const auto & column_file : column_files) { - const auto & column_file = *iter; - if (auto f_delete = column_file->tryToDeleteRange(); f_delete) + if (auto * f_delete = column_file->tryToDeleteRange(); f_delete) squashed_delete_range = squashed_delete_range.merge(f_delete->getDeleteRange()); } return squashed_delete_range; diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp index a5d950f2921..3a4adf37963 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileTiny.cpp @@ -150,30 +150,30 @@ Block ColumnFileTiny::readBlockForMinorCompaction(const PageReader & page_reader } else { - auto & schema_ = *schema; + const auto & schema_ref = *schema; PageStorage::PageReadFields fields; fields.first = data_page_id; - for 
(size_t i = 0; i < schema_.columns(); ++i) + for (size_t i = 0; i < schema_ref.columns(); ++i) fields.second.push_back(i); auto page_map = page_reader.read({fields}); auto page = page_map[data_page_id]; - auto columns = schema_.cloneEmptyColumns(); + auto columns = schema_ref.cloneEmptyColumns(); if (unlikely(columns.size() != page.fieldSize())) throw Exception("Column size and field size not the same"); - for (size_t index = 0; index < schema_.columns(); ++index) + for (size_t index = 0; index < schema_ref.columns(); ++index) { auto data_buf = page.getFieldData(index); - auto & type = schema_.getByPosition(index).type; + const auto & type = schema_ref.getByPosition(index).type; auto & column = columns[index]; deserializeColumn(*column, type, data_buf, rows); } - return schema_.cloneWithColumns(std::move(columns)); + return schema_ref.cloneWithColumns(std::move(columns)); } } diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp index 37a8334ff04..36a95a89d0c 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp @@ -6,7 +6,7 @@ namespace DB { namespace DM { -struct ColumnFile_V2 +struct ColumnFileV2 { UInt64 rows = 0; UInt64 bytes = 0; @@ -16,7 +16,7 @@ struct ColumnFile_V2 bool isDeleteRange() const { return !delete_range.none(); } }; -using ColumnFile_V2Ptr = std::shared_ptr; +using ColumnFile_V2Ptr = std::shared_ptr; using ColumnFiles_V2 = std::vector; inline ColumnFilePersisteds transform_V2_to_V3(const ColumnFiles_V2 & column_files_v2) @@ -40,7 +40,7 @@ inline ColumnFiles_V2 transformSaved_V3_to_V2(const ColumnFilePersisteds & colum ColumnFiles_V2 column_files_v2; for (const auto & f : column_files_v3) { - auto * f_v2 = new ColumnFile_V2(); + auto * f_v2 = new ColumnFileV2(); if (auto * f_delete = f->tryToDeleteRange(); f_delete) { @@ -58,12 +58,12 @@ inline ColumnFiles_V2 transformSaved_V3_to_V2(const 
ColumnFilePersisteds & colum throw Exception("Unexpected column file type", ErrorCodes::LOGICAL_ERROR); } - column_files_v2.push_back(std::shared_ptr(f_v2)); + column_files_v2.push_back(std::shared_ptr(f_v2)); } return column_files_v2; } -inline void serializeColumnFile_V2(const ColumnFile_V2 & column_file, const BlockPtr & schema, WriteBuffer & buf) +inline void serializeColumnFile_V2(const ColumnFileV2 & column_file, const BlockPtr & schema, WriteBuffer & buf) { writeIntBinary(column_file.rows, buf); writeIntBinary(column_file.bytes, buf); @@ -118,7 +118,7 @@ void serializeSavedColumnFilesInV2Format(WriteBuffer & buf, const ColumnFilePers inline ColumnFile_V2Ptr deserializeColumnFile_V2(ReadBuffer & buf, UInt64 version) { - auto column_file = std::make_shared(); + auto column_file = std::make_shared(); readIntBinary(column_file->rows, buf); readIntBinary(column_file->bytes, buf); switch (version) diff --git a/dbms/src/Storages/DeltaMerge/DMContext.h b/dbms/src/Storages/DeltaMerge/DMContext.h index 52c754dd7a1..24eb0ed18be 100644 --- a/dbms/src/Storages/DeltaMerge/DMContext.h +++ b/dbms/src/Storages/DeltaMerge/DMContext.h @@ -51,10 +51,10 @@ struct DMContext : private boost::noncopyable const size_t delta_cache_limit_rows; // The size threshold of cache in delta. const size_t delta_cache_limit_bytes; - // Determine whether a pack is small or not in rows. - const size_t delta_small_pack_rows; - // Determine whether a pack is small or not in bytes. - const size_t delta_small_pack_bytes; + // Determine whether a column file is small or not in rows. + const size_t delta_small_column_file_rows; + // Determine whether a column file is small or not in bytes. + const size_t delta_small_column_file_bytes; // The expected stable pack rows. 
const size_t stable_pack_rows; @@ -95,8 +95,8 @@ struct DMContext : private boost::noncopyable , delta_limit_bytes(settings.dt_segment_delta_limit_size) , delta_cache_limit_rows(settings.dt_segment_delta_cache_limit_rows) , delta_cache_limit_bytes(settings.dt_segment_delta_cache_limit_size) - , delta_small_pack_rows(settings.dt_segment_delta_small_pack_rows) - , delta_small_pack_bytes(settings.dt_segment_delta_small_pack_size) + , delta_small_column_file_rows(settings.dt_segment_delta_small_column_file_rows) + , delta_small_column_file_bytes(settings.dt_segment_delta_small_column_file_size) , stable_pack_rows(settings.dt_segment_stable_pack_rows) , enable_logical_split(settings.dt_enable_logical_split) , read_delta_only(settings.dt_read_delta_only) diff --git a/dbms/src/Storages/DeltaMerge/DMSegmentThreadInputStream.h b/dbms/src/Storages/DeltaMerge/DMSegmentThreadInputStream.h index c08edca263f..f4379a2c871 100644 --- a/dbms/src/Storages/DeltaMerge/DMSegmentThreadInputStream.h +++ b/dbms/src/Storages/DeltaMerge/DMSegmentThreadInputStream.h @@ -44,11 +44,13 @@ class DMSegmentThreadInputStream : public IProfilingBlockInputStream , expected_block_size(expected_block_size_) , is_raw(is_raw_) , do_range_filter_for_raw(do_range_filter_for_raw_) - , log(getMPPTaskLog(log_, getName())) + , log(getMPPTaskLog(log_, getNameImpl())) { } - String getName() const override { return "DeltaMergeSegmentThread"; } + String getName() const override { return getNameImpl(); } + // Add this function because static analysis forbids calling virtual function in constructor + inline String getNameImpl() const { return "DeltaMergeSegmentThread"; } Block getHeader() const override { return header; } protected: diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp index 7c994e8cc58..9a77a5f2b74 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp +++ 
b/dbms/src/Storages/DeltaMerge/Delta/ColumnFileFlushTask.cpp @@ -59,7 +59,7 @@ bool ColumnFileFlushTask::commit(ColumnFilePersistedSetPtr & persisted_file_set, { // Just keep cache for really small column file ColumnFile::CachePtr column_file_cache = nullptr; - if (m_file->getRows() < context.delta_small_pack_rows || m_file->getBytes() < context.delta_small_pack_bytes) + if (m_file->getRows() < context.delta_small_column_file_rows || m_file->getBytes() < context.delta_small_column_file_bytes) { column_file_cache = !task.sorted ? m_file->getCache() : std::make_shared(std::move(task.block_data)); } diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index 2b127408886..9cf0fa232f0 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -74,8 +74,7 @@ void ColumnFilePersistedSet::checkColumnFiles(const ColumnFilePersistedLevels & if (unlikely(new_rows != rows || new_deletes != deletes)) { - LOG_ERROR(log, - "Rows and deletes check failed. Current packs: " << columnFilesToString(flattenColumnFileLevels(new_column_file_levels)) << ", new packs: " << columnFilesToString(flattenColumnFileLevels(new_column_file_levels))); + LOG_FMT_ERROR(log, "Rows and deletes check failed. Actual: rows[{}], deletes[{}]. Expected: rows[{}], deletes[{}]. Current column files: {}, new column files: {}.", new_rows, new_deletes, rows.load(), deletes.load(), columnFilesToString(flattenColumnFileLevels(persisted_files_levels)), columnFilesToString(flattenColumnFileLevels(new_column_file_levels))); throw Exception("Rows and deletes check failed.", ErrorCodes::LOGICAL_ERROR); } } @@ -165,9 +164,7 @@ ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & c if (unlikely(!check_success)) { - LOG_ERROR(log, - info() << ", Delta Check head failed, unexpected size. 
head column files: " << columnFilesToString(head_column_files) - << ", level details: " << levelsInfo()); + LOG_FMT_ERROR(log, "{}, Delta Check head failed, unexpected size. head column files: {}, level details: {}", info(), columnFilesToString(head_column_files), levelsInfo()); throw Exception("Check head failed, unexpected size", ErrorCodes::LOGICAL_ERROR); } @@ -188,7 +185,7 @@ ColumnFilePersisteds ColumnFilePersistedSet::checkHeadAndCloneTail(DMContext & c auto new_dr = d_file->getDeleteRange().shrink(target_range); if (!new_dr.none()) { - // Only use the available delete_range pack. + // Only use the available delete_range column file. cloned_tail.push_back(d_file->cloneWith(new_dr)); } } @@ -340,8 +337,8 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con if (auto * t_file = file->tryToTinyFile(); t_file) { - bool cur_task_full = cur_task.total_rows >= context.delta_small_pack_rows; - bool small_column_file = t_file->getRows() < context.delta_small_pack_rows; + bool cur_task_full = cur_task.total_rows >= context.delta_small_column_file_rows; + bool small_column_file = t_file->getRows() < context.delta_small_column_file_rows; bool schema_ok = cur_task.to_compact.empty(); if (!schema_ok) @@ -376,11 +373,12 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & { if (compaction->current_compaction_version != minor_compaction_version) { - LOG_WARNING(log, "Structure has been updated during compact"); + LOG_FMT_WARNING(log, "Structure has been updated during compact"); return false; } minor_compaction_version += 1; ColumnFilePersistedLevels new_persisted_files_levels; + // Copy column files in level range [0, compaction->compaction_src_level) for (size_t i = 0; i < compaction->compaction_src_level; i++) { auto & new_level = new_persisted_files_levels.emplace_back(); @@ -389,16 +387,16 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & } // Create a new empty level for 
`compaction_src_level` because all the column files is compacted to next level new_persisted_files_levels.emplace_back(); - // Add new file to the target level auto target_level = compaction->compaction_src_level + 1; auto & target_level_files = new_persisted_files_levels.emplace_back(); - // clone the old column files in the target level first + // Copy the old column files in the target level first if exists if (persisted_files_levels.size() > target_level) { for (auto & column_file : persisted_files_levels[target_level]) target_level_files.emplace_back(column_file); } + // Add the compaction result to new target level for (auto & task : compaction->tasks) { if (task.is_trivial_move) @@ -406,8 +404,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & else target_level_files.push_back(task.result); } - - // Append remaining levels + // Copy column files in level range [compaction->compaction_src_level + 1, +inf) if exists for (size_t i = target_level + 1; i < persisted_files_levels.size(); i++) { auto & new_level = new_persisted_files_levels.emplace_back(); @@ -438,9 +435,11 @@ ColumnFileSetSnapshotPtr ColumnFilePersistedSet::createSnapshot(const DMContext size_t total_rows = 0; size_t total_deletes = 0; - for (const auto & level : persisted_files_levels) + // The read direction is from the last level to the first level, + // and in each level we read from the begin to the end. + for (auto level_it = persisted_files_levels.rbegin(); level_it != persisted_files_levels.rend(); level_it++) { - for (const auto & file : level) + for (const auto & file : *level_it) { if (auto * t = file->tryToTinyFile(); (t && t->getCache())) { @@ -458,7 +457,10 @@ ColumnFileSetSnapshotPtr ColumnFilePersistedSet::createSnapshot(const DMContext } if (unlikely(total_rows != rows || total_deletes != deletes)) - throw Exception("Rows and deletes check failed!", ErrorCodes::LOGICAL_ERROR); + { + LOG_FMT_ERROR(log, "Rows and deletes check failed. 
Actual: rows[{}], deletes[{}]. Expected: rows[{}], deletes[{}].", total_rows, total_deletes, rows.load(), deletes.load()); + throw Exception("Rows and deletes check failed.", ErrorCodes::LOGICAL_ERROR); + } return snap; } diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h index 0af64127848..eb5ec2df915 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h @@ -71,9 +71,13 @@ class ColumnFilePersistedSet : public std::enable_shared_from_this(std::move(persisted_file_set_)); + auto persisted_file_set = ColumnFilePersistedSet::restore(context, segment_range, id); + return std::make_shared(std::move(persisted_file_set)); } void DeltaValueSpace::saveMeta(WriteBatches & wbs) const @@ -58,7 +58,7 @@ DeltaValueSpace::checkHeadAndCloneTail(DMContext & context, WriteBatches & wbs) const { auto tail_persisted_files = persisted_file_set->checkHeadAndCloneTail(context, target_range, head_column_files, wbs); - auto memory_files = mem_table_set->cloneColumnFiles(); + auto memory_files = mem_table_set->cloneColumnFiles(context, target_range, wbs); return std::make_pair(std::move(tail_persisted_files), std::move(memory_files)); } @@ -127,7 +127,7 @@ bool DeltaValueSpace::ingestColumnFiles(DMContext & /*context*/, const RowKeyRan bool DeltaValueSpace::flush(DMContext & context) { - LOG_DEBUG(log, info() << ", Flush start"); + LOG_FMT_DEBUG(log, "{}, Flush start", info()); /// We have two types of data needed to flush to disk: /// 1. 
The cache data in ColumnFileInMemory @@ -141,7 +141,7 @@ bool DeltaValueSpace::flush(DMContext & context) std::scoped_lock lock(mutex); if (abandoned.load(std::memory_order_relaxed)) { - LOG_DEBUG(log, simpleInfo() << "Flush stop because abandoned"); + LOG_FMT_DEBUG(log, "{} Flush stop because abandoned", simpleInfo()); return false; } flush_task = mem_table_set->buildFlushTask(context, persisted_file_set->getRows(), persisted_file_set->getDeletes(), persisted_file_set->getCurrentFlushVersion()); @@ -151,7 +151,7 @@ bool DeltaValueSpace::flush(DMContext & context) // No update, return successfully. if (!flush_task) { - LOG_DEBUG(log, simpleInfo() << " Nothing to flush"); + LOG_FMT_DEBUG(log, "{} Nothing to flush", simpleInfo()); return true; } @@ -160,9 +160,9 @@ bool DeltaValueSpace::flush(DMContext & context) DeltaIndexPtr new_delta_index; if (!delta_index_updates.empty()) { - LOG_DEBUG(log, simpleInfo() << " Update index start"); + LOG_FMT_DEBUG(log, "{} Update index start", simpleInfo()); new_delta_index = cur_delta_index->cloneWithUpdates(delta_index_updates); - LOG_DEBUG(log, simpleInfo() << " Update index done"); + LOG_FMT_DEBUG(log, "{} Update index done", simpleInfo()); } { @@ -172,23 +172,21 @@ bool DeltaValueSpace::flush(DMContext & context) { // Delete written data. wbs.setRollback(); - LOG_DEBUG(log, simpleInfo() << " Flush stop because abandoned"); + LOG_FMT_DEBUG(log, "{} Flush stop because abandoned", simpleInfo()); return false; } if (!flush_task->commit(persisted_file_set, wbs)) { wbs.rollbackWrittenLogAndData(); - LOG_DEBUG(log, simpleInfo() << " Stop flush because structure got updated"); + LOG_FMT_DEBUG(log, "{} Stop flush because structure got updated", simpleInfo()); } /// Update delta tree if (new_delta_index) delta_index = new_delta_index; - LOG_DEBUG(log, - simpleInfo() << " Flush end. 
Flushed " << flush_task->getTaskNum() << " column files, " << flush_task->getFlushRows() << " rows and " << flush_task->getFlushDeletes() - << " deletes."); + LOG_FMT_DEBUG(log, "{} Flush end. Flushed {} column files, {} rows and {} deletes.", info(), flush_task->getTaskNum(), flush_task->getFlushRows(), flush_task->getFlushDeletes()); } return true; } @@ -199,7 +197,7 @@ bool DeltaValueSpace::compact(DMContext & context) // Other thread is doing structure update, just return. if (!is_updating.compare_exchange_strong(v, true)) { - LOG_DEBUG(log, simpleInfo() << " Compact stop because updating"); + LOG_FMT_DEBUG(log, "{} Compact stop because updating", simpleInfo()); return true; } SCOPE_EXIT({ @@ -214,13 +212,13 @@ bool DeltaValueSpace::compact(DMContext & context) std::scoped_lock lock(mutex); if (abandoned.load(std::memory_order_relaxed)) { - LOG_DEBUG(log, simpleInfo() << " Compact stop because abandoned"); + LOG_FMT_DEBUG(log, "{} Compact stop because abandoned", simpleInfo()); return false; } compaction_task = persisted_file_set->pickUpMinorCompaction(context); if (!compaction_task) { - LOG_DEBUG(log, simpleInfo() << " Nothing to compact"); + LOG_FMT_DEBUG(log, "{} Nothing to compact", simpleInfo()); return true; } log_storage_snap = context.storage_pool.log()->getSnapshot(); @@ -238,18 +236,18 @@ bool DeltaValueSpace::compact(DMContext & context) if (abandoned.load(std::memory_order_relaxed)) { wbs.rollbackWrittenLogAndData(); - LOG_DEBUG(log, simpleInfo() << " Stop compact because abandoned"); + LOG_FMT_DEBUG(log, "{} Stop compact because abandoned", simpleInfo()); return false; } if (!compaction_task->commit(persisted_file_set, wbs)) { - LOG_WARNING(log, "Structure has been updated during compact"); + LOG_FMT_WARNING(log, "Structure has been updated during compact"); wbs.rollbackWrittenLogAndData(); - LOG_DEBUG(log, simpleInfo() << " Compact stop because structure got updated"); + LOG_FMT_DEBUG(log, "{} Compact stop because structure got updated", 
simpleInfo()); return false; } - LOG_DEBUG(log, simpleInfo() << compaction_task->info()); + LOG_FMT_DEBUG(log, "{} {}", simpleInfo(), compaction_task->info()); } wbs.writeRemoves(); diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h index b26e33392a6..9b91a8d5aa8 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h @@ -64,7 +64,7 @@ class DeltaValueSpace : public std::enable_shared_from_this std::atomic last_try_flush_rows = 0; std::atomic last_try_flush_bytes = 0; - std::atomic last_try_compact_packs = 0; + std::atomic last_try_compact_column_files = 0; std::atomic last_try_merge_delta_rows = 0; std::atomic last_try_merge_delta_bytes = 0; std::atomic last_try_split_rows = 0; @@ -90,7 +90,7 @@ class DeltaValueSpace : public std::enable_shared_from_this String simpleInfo() const { return "Delta [" + DB::toString(persisted_file_set->getId()) + "]"; } String info() const { - return fmt::format("{}, {}", mem_table_set->info(), persisted_file_set->info()); + return fmt::format("{}. 
{}", mem_table_set->info(), persisted_file_set->info()); } bool getLock(Lock & lock) const @@ -168,7 +168,7 @@ class DeltaValueSpace : public std::enable_shared_from_this std::atomic & getLastTryFlushRows() { return last_try_flush_rows; } std::atomic & getLastTryFlushBytes() { return last_try_flush_bytes; } - std::atomic & getLastTryCompactPacks() { return last_try_compact_packs; } + std::atomic & getLastTryCompactColumnFiles() { return last_try_compact_column_files; } std::atomic & getLastTryMergeDeltaRows() { return last_try_merge_delta_rows; } std::atomic & getLastTryMergeDeltaBytes() { return last_try_merge_delta_bytes; } std::atomic & getLastTrySplitRows() { return last_try_split_rows; } @@ -275,7 +275,7 @@ class DeltaValueSnapshot : public std::enable_shared_from_this #include #include +#include #include +#include +#include namespace ProfileEvents { @@ -49,6 +52,54 @@ void MemTableSet::appendColumnFileInner(const ColumnFilePtr & column_file) deletes += column_file->getDeletes(); } +ColumnFiles MemTableSet::cloneColumnFiles(DMContext & context, const RowKeyRange & target_range, WriteBatches & wbs) +{ + ColumnFiles cloned_column_files; + for (const auto & column_file : column_files) + { + if (auto * dr = column_file->tryToDeleteRange(); dr) + { + auto new_dr = dr->getDeleteRange().shrink(target_range); + if (!new_dr.none()) + { + // Only use the available delete_range column file. + cloned_column_files.push_back(dr->cloneWith(new_dr)); + } + } + else if (auto * b = column_file->tryToInMemoryFile(); b) + { + auto new_column_file = b->clone(); + + // No matter or what, don't append to column files which cloned from old column file again. + // Because they could shared the same cache. And the cache can NOT be inserted from different column files in different delta. 
+ new_column_file->disableAppend(); + cloned_column_files.push_back(new_column_file); + } + else if (auto * t = column_file->tryToTinyFile(); t) + { + // Use a newly created page_id to reference the data page_id of current column file. + PageId new_data_page_id = context.storage_pool.newLogPageId(); + wbs.log.putRefPage(new_data_page_id, t->getDataPageId()); + auto new_column_file = t->cloneWith(new_data_page_id); + + cloned_column_files.push_back(new_column_file); + } + else if (auto * f = column_file->tryToBigFile(); f) + { + auto delegator = context.path_pool.getStableDiskDelegator(); + auto new_ref_id = context.storage_pool.newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); + auto file_id = f->getFile()->fileId(); + wbs.data.putRefPage(new_ref_id, file_id); + auto file_parent_path = delegator.getDTFilePath(file_id); + auto new_file = DMFile::restore(context.db_context.getFileProvider(), file_id, /* ref_id= */ new_ref_id, file_parent_path, DMFile::ReadMetaMode::all()); + + auto new_column_file = f->cloneWith(context, new_file, target_range); + cloned_column_files.push_back(new_column_file); + } + } + return cloned_column_files; +} + void MemTableSet::appendColumnFile(const ColumnFilePtr & column_file) { appendColumnFileInner(column_file); @@ -124,7 +175,10 @@ ColumnFileSetSnapshotPtr MemTableSet::createSnapshot() } if (unlikely(total_rows != rows || total_deletes != deletes)) - throw Exception("Rows and deletes check failed!", ErrorCodes::LOGICAL_ERROR); + { + LOG_FMT_ERROR(log, "Rows and deletes check failed. Actual: rows[{}], deletes[{}]. 
Expected: rows[{}], deletes[{}].", total_rows, total_deletes, rows.load(), deletes.load()); + throw Exception("Rows and deletes check failed.", ErrorCodes::LOGICAL_ERROR); + } return snap; } @@ -154,7 +208,10 @@ ColumnFileFlushTaskPtr MemTableSet::buildFlushTask(DMContext & context, size_t r cur_deletes_offset += column_file->getDeletes(); } if (unlikely(flush_task->getFlushRows() != rows || flush_task->getFlushDeletes() != deletes)) - throw Exception("Rows and deletes check failed", ErrorCodes::LOGICAL_ERROR); + { + LOG_FMT_ERROR(log, "Rows and deletes check failed. Actual: rows[{}], deletes[{}]. Expected: rows[{}], deletes[{}]. Column Files: {}", flush_task->getFlushRows(), flush_task->getFlushDeletes(), rows.load(), deletes.load(), columnFilesToString(column_files)); + throw Exception("Rows and deletes check failed.", ErrorCodes::LOGICAL_ERROR); + } return flush_task; } @@ -165,7 +222,6 @@ void MemTableSet::removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush if (unlikely(tasks.size() > column_files.size())) throw Exception("column_files num check failed", ErrorCodes::LOGICAL_ERROR); - ColumnFiles new_column_files; size_t flush_bytes = 0; auto column_file_iter = column_files.begin(); for (const auto & task : tasks) @@ -177,6 +233,7 @@ void MemTableSet::removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush flush_bytes += task.column_file->getBytes(); column_file_iter++; } + ColumnFiles new_column_files; size_t new_rows = 0; size_t new_bytes = 0; size_t new_deletes = 0; @@ -186,6 +243,7 @@ void MemTableSet::removeColumnFilesInFlushTask(const ColumnFileFlushTask & flush new_rows += (*column_file_iter)->getRows(); new_bytes += (*column_file_iter)->getBytes(); new_deletes += (*column_file_iter)->getDeletes(); + column_file_iter++; } column_files.swap(new_column_files); rows = new_rows; diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h index 79fe4cbdf22..f02616caa67 100644 --- 
a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.h @@ -19,7 +19,7 @@ class MemTableSet : public std::enable_shared_from_this , private boost::noncopyable { private: - // to avoid serialize the same schema between continuous ColumnFileInMemory and ColumnFileTiny instance + /// To avoid serialize the same schema between continuous ColumnFileInMemory and ColumnFileTiny instance. BlockPtr last_schema; ColumnFiles column_files; @@ -56,7 +56,7 @@ class MemTableSet : public std::enable_shared_from_this deletes.load()); } - ColumnFiles cloneColumnFiles() { return column_files; } + ColumnFiles cloneColumnFiles(DMContext & context, const RowKeyRange & target_range, WriteBatches & wbs); size_t getColumnFileCount() const { return column_files.size(); } size_t getRows() const { return rows; } diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp index 2ca1d2219c0..ad824b3f919 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp @@ -30,7 +30,7 @@ void MinorCompaction::prepare(DMContext & context, WriteBatches & wbs, const Pag if (unlikely(!t_file)) throw Exception("The compact candidate is not a ColumnTinyFile", ErrorCodes::LOGICAL_ERROR); - // We ensure schema of all packs are the same + // We ensure schema of all column files are the same Block block = t_file->readBlockForMinorCompaction(reader); size_t block_rows = block.rows(); for (size_t i = 0; i < schema.columns(); ++i) diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h index 8b2a4143a7e..3e92d634c8f 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h @@ -51,7 +51,7 @@ class MinorCompaction : public std::enable_shared_from_this public: MinorCompaction(size_t compaction_src_level_, 
size_t current_compaction_version_); - // return whether this task is a trivial move + // Add new task and return whether this task is a trivial move inline bool packUpTask(Task && task) { if (task.to_compact.empty()) @@ -75,8 +75,10 @@ class MinorCompaction : public std::enable_shared_from_this return is_trivial_move; } + /// Create new column file by combining several small `ColumnFileTiny`s void prepare(DMContext & context, WriteBatches & wbs, const PageReader & reader); + /// Add new column files and remove old column files in `ColumnFilePersistedSet` bool commit(ColumnFilePersistedSetPtr & persisted_file_set, WriteBatches & wbs); String info() const; diff --git a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp index eccc285d745..853f6c59735 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp @@ -35,9 +35,16 @@ DeltaSnapshotPtr DeltaValueSpace::createSnapshot(const DMContext & context, bool RowKeyRange DeltaValueSnapshot::getSquashDeleteRange() const { - auto delete_range1 = mem_table_snap->getSquashDeleteRange(); - auto delete_range2 = persisted_files_snap->getSquashDeleteRange(); - return delete_range1.merge(delete_range2); + if (mem_table_snap) + { + auto delete_range1 = mem_table_snap->getSquashDeleteRange(); + auto delete_range2 = persisted_files_snap->getSquashDeleteRange(); + return delete_range1.merge(delete_range2); + } + else + { + return persisted_files_snap->getSquashDeleteRange(); + } } // ================================================ @@ -58,7 +65,7 @@ DeltaValueReader::DeltaValueReader( DeltaValueReaderPtr DeltaValueReader::createNewReader(const ColumnDefinesPtr & new_col_defs) { - auto new_reader = new DeltaValueReader(); + auto * new_reader = new DeltaValueReader(); new_reader->delta_snap = delta_snap; new_reader->_compacted_delta_index = _compacted_delta_index; new_reader->persisted_files_reader = 
persisted_files_reader->createNewReader(new_col_defs); diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp index 127629f270e..4742afd041b 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.cpp @@ -278,7 +278,7 @@ void DeltaMergeStore::setUpBackgroundTask(const DMContextPtr & dm_context) }; auto dmfile_remover = [&](const PageStorage::PathAndIdsVec & path_and_ids_vec, const std::set & valid_ids) { auto delegate = path_pool.getStableDiskDelegator(); - for (auto & [path, ids] : path_and_ids_vec) + for (const auto & [path, ids] : path_and_ids_vec) { for (auto id : ids) { @@ -511,7 +511,7 @@ void DeltaMergeStore::write(const Context & db_context, const DB::Settings & db_ if (segment->hasAbandoned()) continue; - auto & rowkey_range = segment->getRowKeyRange(); + const auto & rowkey_range = segment->getRowKeyRange(); auto [cur_offset, cur_limit] = rowkey_range.getPosRange(handle_column, offset, rows - offset); if (unlikely(cur_offset != offset)) @@ -521,7 +521,7 @@ void DeltaMergeStore::write(const Context & db_context, const DB::Settings & db_ auto alloc_bytes = block.bytes(offset, limit); bool is_small = limit < dm_context->delta_cache_limit_rows / 4 && alloc_bytes < dm_context->delta_cache_limit_bytes / 4; - // Small column fies are appended to Delta Cache, the flushed later. + // Small column fies are appended to Delta Cache, then flushed later. // While large column fies are directly written to PageStorage. if (is_small) { @@ -655,7 +655,7 @@ void DeltaMergeStore::ingestFiles( // TODO: If tiflash crash during the middle of ingesting, we may leave some DTFiles on disk and // they can not be deleted. We should find a way to cleanup those files. 
WriteBatches ingest_wbs(storage_pool, dm_context->getWriteLimiter()); - if (files.size() > 0) + if (!files.empty()) { for (const auto & file : files) { @@ -701,7 +701,7 @@ void DeltaMergeStore::ingestFiles( { /// Generate DMFile instance with a new ref_id pointed to the file_id. auto file_id = file->fileId(); - auto & file_parent_path = file->parentPath(); + const auto & file_parent_path = file->parentPath(); auto ref_id = storage_pool.newDataPageIdForDTFile(delegate, __PRETTY_FUNCTION__); auto ref_file = DMFile::restore(file_provider, file_id, ref_id, file_parent_path, DMFile::ReadMetaMode::all()); @@ -1069,7 +1069,7 @@ void DeltaMergeStore::waitForWrite(const DMContextPtr & dm_context, const Segmen size_t segment_bytes = segment->getEstimatedBytes(); // The speed of delta merge in a very bad situation we assume. It should be a very conservative value. - size_t _10MB = 10 << 20; + constexpr size_t ten_mb = 10 << 20; size_t stop_write_delta_rows = dm_context->db_context.getSettingsRef().dt_segment_stop_write_delta_rows; size_t stop_write_delta_bytes = dm_context->db_context.getSettingsRef().dt_segment_stop_write_delta_size; @@ -1079,7 +1079,7 @@ void DeltaMergeStore::waitForWrite(const DMContextPtr & dm_context, const Segmen if (delta_rows >= stop_write_delta_rows || delta_bytes >= stop_write_delta_bytes) sleep_ms = std::numeric_limits::max(); else - sleep_ms = (double)segment_bytes / _10MB * 1000 * wait_duration_factor; + sleep_ms = static_cast(segment_bytes) / ten_mb * 1000 * wait_duration_factor; // checkSegmentUpdate could do foreground merge delta, so call it before sleep. 
checkSegmentUpdate(dm_context, segment, ThreadType::Write); @@ -1105,7 +1105,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const { if (segment->hasAbandoned()) return; - auto & delta = segment->getDelta(); + const auto & delta = segment->getDelta(); size_t delta_saved_rows = delta->getRows(/* use_unsaved */ false); size_t delta_saved_bytes = delta->getBytes(/* use_unsaved */ false); @@ -1121,13 +1121,13 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const size_t delta_bytes = delta_saved_bytes + unsaved_bytes; size_t segment_rows = segment->getEstimatedRows(); size_t segment_bytes = segment->getEstimatedBytes(); - size_t pack_count = delta->getColumnFileCount(); + size_t column_file_count = delta->getColumnFileCount(); size_t placed_delta_rows = delta->getPlacedDeltaRows(); auto & delta_last_try_flush_rows = delta->getLastTryFlushRows(); auto & delta_last_try_flush_bytes = delta->getLastTryFlushBytes(); - auto & delta_last_try_compact_packs = delta->getLastTryCompactPacks(); + auto & delta_last_try_compact_column_files = delta->getLastTryCompactColumnFiles(); auto & delta_last_try_merge_delta_rows = delta->getLastTryMergeDeltaRows(); auto & delta_last_try_merge_delta_bytes = delta->getLastTryMergeDeltaBytes(); auto & delta_last_try_split_rows = delta->getLastTrySplitRows(); @@ -1162,7 +1162,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const bool should_merge = segment_rows < segment_limit_rows / 3 && segment_bytes < segment_limit_bytes / 3; // Don't do compact on starting up. - bool should_compact = (thread_type != ThreadType::Init) && std::max((Int64)pack_count - delta_last_try_compact_packs, 0) >= 10; + bool should_compact = (thread_type != ThreadType::Init) && std::max(static_cast(column_file_count) - delta_last_try_compact_column_files, 0) >= 10; // Don't do background place index if we limit DeltaIndex cache. 
bool should_place_delta_index = !dm_context->db_context.isDeltaIndexLimited() @@ -1211,7 +1211,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const /// Now start trying structure update. - auto getMergeSibling = [&]() -> SegmentPtr { + auto get_merge_sibling = [&]() -> SegmentPtr { /// For complexity reason, currently we only try to merge with next segment. Normally it is good enough. // The last segment cannot be merged. @@ -1282,16 +1282,13 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const auto my_should_split = my_segment_size >= dm_context->segment_force_split_bytes; if (my_should_split && !my_segment->isSplitForbidden()) { - if (segmentSplit(*dm_context, my_segment, true).first) - return true; - else - return false; + return static_cast(segmentSplit(*dm_context, my_segment, true).first); } return false; }; auto try_bg_merge = [&]() { SegmentPtr merge_sibling; - if (should_merge && (merge_sibling = getMergeSibling())) + if (should_merge && (merge_sibling = get_merge_sibling())) { try_add_background_task(BackgroundTask{TaskType::Merge, dm_context, segment, merge_sibling}); return true; @@ -1301,7 +1298,7 @@ void DeltaMergeStore::checkSegmentUpdate(const DMContextPtr & dm_context, const auto try_bg_compact = [&]() { if (should_compact) { - delta_last_try_compact_packs = pack_count; + delta_last_try_compact_column_files = column_file_count; try_add_background_task(BackgroundTask{TaskType::Compact, dm_context, segment, {}}); return true; } @@ -1458,7 +1455,7 @@ bool shouldCompactStable(const SegmentPtr & seg, DB::Timestamp gc_safepoint, dou if (ratio_threshold < 1.0) return true; - auto & property = seg->getStable()->getStableProperty(); + const auto & property = seg->getStable()->getStableProperty(); LOG_FMT_TRACE(log, "{} {}", __PRETTY_FUNCTION__, property.toDebugString()); // No data older than safe_point to GC. 
if (property.gc_hint_version > gc_safepoint) @@ -1669,8 +1666,8 @@ SegmentPair DeltaMergeStore::segmentSplit(DMContext & dm_context, const SegmentP } // Not counting the early give up action. - auto delta_bytes = (Int64)segment_snap->delta->getBytes(); - auto delta_rows = (Int64)segment_snap->delta->getRows(); + auto delta_bytes = static_cast(segment_snap->delta->getBytes()); + auto delta_rows = static_cast(segment_snap->delta->getRows()); size_t duplicated_bytes = 0; size_t duplicated_rows = 0; @@ -1801,8 +1798,8 @@ void DeltaMergeStore::segmentMerge(DMContext & dm_context, const SegmentPtr & le } // Not counting the early give up action. - auto delta_bytes = (Int64)left_snap->delta->getBytes() + right_snap->getBytes(); - auto delta_rows = (Int64)left_snap->delta->getRows() + right_snap->getRows(); + auto delta_bytes = static_cast(left_snap->delta->getBytes()) + right_snap->getBytes(); + auto delta_rows = static_cast(left_snap->delta->getRows()) + right_snap->getRows(); CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_SegmentMerge}; GET_METRIC(tiflash_storage_subtask_count, type_seg_merge).Increment(); @@ -1895,12 +1892,12 @@ SegmentPtr DeltaMergeStore::segmentMergeDelta( } // Not counting the early give up action. 
- auto delta_bytes = (Int64)segment_snap->delta->getBytes(); - auto delta_rows = (Int64)segment_snap->delta->getRows(); + auto delta_bytes = static_cast(segment_snap->delta->getBytes()); + auto delta_rows = static_cast(segment_snap->delta->getRows()); CurrentMetrics::Increment cur_dm_segments{CurrentMetrics::DT_DeltaMerge}; - CurrentMetrics::Increment cur_dm_total_bytes{CurrentMetrics::DT_DeltaMergeTotalBytes, (Int64)segment_snap->getBytes()}; - CurrentMetrics::Increment cur_dm_total_rows{CurrentMetrics::DT_DeltaMergeTotalRows, (Int64)segment_snap->getRows()}; + CurrentMetrics::Increment cur_dm_total_bytes{CurrentMetrics::DT_DeltaMergeTotalBytes, static_cast(segment_snap->getBytes())}; + CurrentMetrics::Increment cur_dm_total_rows{CurrentMetrics::DT_DeltaMergeTotalRows, static_cast(segment_snap->getRows())}; switch (run_thread) { @@ -2135,7 +2132,7 @@ void DeltaMergeStore::restoreStableFiles() auto path_delegate = path_pool.getStableDiskDelegator(); for (const auto & root_path : path_delegate.listPaths()) { - for (auto & file_id : DMFile::listAllInPath(file_provider, root_path, options)) + for (const auto & file_id : DMFile::listAllInPath(file_provider, root_path, options)) { auto dmfile = DMFile::restore(file_provider, file_id, /* ref_id= */ 0, root_path, DMFile::ReadMetaMode::diskSizeOnly()); path_delegate.addDTFile(file_id, dmfile->getBytesOnDisk(), root_path); @@ -2157,10 +2154,10 @@ DeltaMergeStoreStat DeltaMergeStore::getStat() stat.segment_count = segments.size(); - long total_placed_rows = 0; - long total_delta_cache_rows = 0; + size_t total_placed_rows = 0; + size_t total_delta_cache_rows = 0; Float64 total_delta_cache_size = 0; - long total_delta_valid_cache_rows = 0; + size_t total_delta_valid_cache_rows = 0; for (const auto & [handle, segment] : segments) { (void)handle; @@ -2278,8 +2275,8 @@ SegmentStats DeltaMergeStore::getSegmentStats() (void)handle; SegmentStat stat; - auto & delta = segment->getDelta(); - auto & stable = segment->getStable(); + 
const auto & delta = segment->getDelta(); + const auto & stable = segment->getStable(); stat.segment_id = segment->segmentId(); stat.range = segment->getRowKeyRange(); @@ -2293,10 +2290,10 @@ SegmentStats DeltaMergeStore::getSegmentStats() stat.delta_pack_count = delta->getColumnFileCount(); stat.stable_pack_count = stable->getPacks(); - stat.avg_delta_pack_rows = (Float64)delta->getRows() / stat.delta_pack_count; - stat.avg_stable_pack_rows = (Float64)stable->getRows() / stat.stable_pack_count; + stat.avg_delta_pack_rows = static_cast(delta->getRows()) / stat.delta_pack_count; + stat.avg_stable_pack_rows = static_cast(stable->getRows()) / stat.stable_pack_count; - stat.delta_rate = (Float64)delta->getRows() / stat.rows; + stat.delta_rate = static_cast(delta->getRows()) / stat.rows; stat.delta_cache_size = delta->getTotalCacheBytes(); stat.delta_index_size = delta->getDeltaIndexBytes(); @@ -2328,8 +2325,8 @@ SegmentReadTasks DeltaMergeStore::getReadTasksByRanges( while (range_it != sorted_ranges.end() && seg_it != segments.end()) { - auto & req_range = *range_it; - auto & seg_range = seg_it->second->getRowKeyRange(); + const auto & req_range = *range_it; + const auto & seg_range = seg_it->second->getRowKeyRange(); if (req_range.intersect(seg_range) && (read_segments.empty() || read_segments.count(seg_it->second->segmentId()))) { if (tasks.empty() || tasks.back()->segment != seg_it->second) diff --git a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h index f5b69e3e630..9f344c3f768 100644 --- a/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h +++ b/dbms/src/Storages/DeltaMerge/DeltaMergeStore.h @@ -275,7 +275,7 @@ class DeltaMergeStore : private boost::noncopyable DeltaMergeStore(Context & db_context, // bool data_path_contains_database_name, const String & db_name, - const String & tbl_name, + const String & table_name_, const ColumnDefines & columns, const ColumnDefine & handle, bool is_common_handle_, @@ -323,7 +323,7 
@@ class DeltaMergeStore : private boost::noncopyable /// Read all rows without MVCC filtering BlockInputStreams readRaw(const Context & db_context, const DB::Settings & db_settings, - const ColumnDefines & column_defines, + const ColumnDefines & columns_to_read, size_t num_streams, const SegmentIdSet & read_segments = {}); diff --git a/dbms/src/Storages/DeltaMerge/Segment.cpp b/dbms/src/Storages/DeltaMerge/Segment.cpp index 08dba3c6694..0774a5dec72 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.cpp +++ b/dbms/src/Storages/DeltaMerge/Segment.cpp @@ -445,7 +445,7 @@ BlockInputStreamPtr Segment::getInputStreamForDataExport(const DMContext & dm_co const SegmentSnapshotPtr & segment_snap, const RowKeyRange & data_range, size_t expected_block_size, - bool reorgnize_block) const + bool reorganize_block) const { RowKeyRanges data_ranges{data_range}; auto read_info = getReadInfo(dm_context, columns_to_read, segment_snap, data_ranges); @@ -462,7 +462,7 @@ BlockInputStreamPtr Segment::getInputStreamForDataExport(const DMContext & dm_co data_stream = std::make_shared>(data_stream, data_ranges, 0); - if (reorgnize_block) + if (reorganize_block) { data_stream = std::make_shared>(data_stream, EXTRA_HANDLE_COLUMN_ID, is_common_handle); } @@ -574,7 +574,7 @@ StableValueSpacePtr Segment::prepareMergeDelta(DMContext & dm_context, WriteBatches & wbs) const { LOG_FMT_INFO(log, - "Segment [{}] prepare merge delta start. delta packs: {}, delta total rows: {}, delta total size: {}", + "Segment [{}] prepare merge delta start. 
delta column files: {}, delta total rows: {}, delta total size: {}", segment_id, segment_snap->delta->getColumnFileCount(), segment_snap->delta->getRows(), @@ -604,7 +604,7 @@ SegmentPtr Segment::applyMergeDelta(DMContext & context, { LOG_FMT_INFO(log, "Before apply merge delta: {}", info()); - auto [persisted_column_files, in_memory_files] = delta->checkHeadAndCloneTail(context, rowkey_range, segment_snap->delta->getHeadColumnFilesForCheck(), wbs); + auto [persisted_column_files, in_memory_files] = delta->checkHeadAndCloneTail(context, rowkey_range, segment_snap->delta->getColumnFilesInSnapshot(), wbs); // Created references to tail pages' pages in "log" storage, we need to write them down. wbs.writeLogAndData(); @@ -898,7 +898,9 @@ std::optional Segment::prepareSplitLogical(DMContext & dm_co return {}; } - GenPageId log_gen_page_id = std::bind(&StoragePool::newLogPageId, &storage_pool); + GenPageId log_gen_page_id = [&]() { + return storage_pool.newLogPageId(); + }; DMFiles my_stable_files; DMFiles other_stable_files; @@ -1065,7 +1067,7 @@ SegmentPair Segment::applySplit(DMContext & dm_context, // RowKeyRange my_range(rowkey_range.start, split_info.split_point, is_common_handle, rowkey_column_size); RowKeyRange other_range(split_info.split_point, rowkey_range.end, is_common_handle, rowkey_column_size); ColumnFiles empty_files; - ColumnFiles * head_files = split_info.is_logical ? &empty_files : &segment_snap->delta->getHeadColumnFilesForCheck(); + ColumnFiles * head_files = split_info.is_logical ? 
&empty_files : &segment_snap->delta->getColumnFilesInSnapshot(); auto [my_persisted_files, my_in_memory_files] = delta->checkHeadAndCloneTail(dm_context, my_range, *head_files, wbs); auto [other_persisted_files, other_in_memory_files] = delta->checkHeadAndCloneTail(dm_context, other_range, *head_files, wbs); @@ -1209,14 +1211,12 @@ SegmentPtr Segment::applyMerge(DMContext & dm_context, // RowKeyRange merged_range(left->rowkey_range.start, right->rowkey_range.end, left->is_common_handle, left->rowkey_column_size); - auto [left_persisted_files, left_in_memory_files] = left->delta->checkHeadAndCloneTail(dm_context, merged_range, left_snap->delta->getHeadColumnFilesForCheck(), wbs); - auto [right_persisted_files, right_in_memory_files] = right->delta->checkHeadAndCloneTail(dm_context, merged_range, right_snap->delta->getHeadColumnFilesForCheck(), wbs); - + auto [left_persisted_files, left_in_memory_files] = left->delta->checkHeadAndCloneTail(dm_context, merged_range, left_snap->delta->getColumnFilesInSnapshot(), wbs); + auto [right_persisted_files, right_in_memory_files] = right->delta->checkHeadAndCloneTail(dm_context, merged_range, right_snap->delta->getColumnFilesInSnapshot(), wbs); // Created references to tail pages' pages in "log" storage, we need to write them down. wbs.writeLogAndData(); - /// Make sure saved packs are appended before unsaved packs. 
ColumnFilePersisteds merged_persisted_column_files = std::move(left_persisted_files); ColumnFiles merged_in_memory_files = std::move(left_in_memory_files); From aca8b971f59f8c4e53765ef417cde9d1e942fe32 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Mon, 7 Feb 2022 22:42:20 +0800 Subject: [PATCH 14/23] Fix gtests --- dbms/src/Server/tests/gtest_server_config.cpp | 4 ++-- .../Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/dbms/src/Server/tests/gtest_server_config.cpp b/dbms/src/Server/tests/gtest_server_config.cpp index 686e4ed1acd..72e292ffa4a 100644 --- a/dbms/src/Server/tests/gtest_server_config.cpp +++ b/dbms/src/Server/tests/gtest_server_config.cpp @@ -175,8 +175,8 @@ dt_storage_pool_data_gc_max_valid_rate = 0.5 ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_file_num, 8); ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_legacy_num, 2); ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_bytes, 256); - ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_pack_size, 8388608); - ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_pack_rows, 2048); + ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_column_file_size, 8388608); + ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_column_file_rows, 2048); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_limit_size, 536870912); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_limit_size, 42991616); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_force_merge_delta_size, 1073741824); diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp index 8676705474f..b786e46b26d 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp @@ -181,10 +181,6 @@ void 
ColumnFileSetReader::getPlaceItems(BlockOrDeletes & place_items, size_t row auto block = readPKVersion(block_rows_end, column_file.getRows()); place_items.emplace_back(std::move(block), block_rows_end + place_rows_offset); } - else - { - throw Exception("Unknown column file type", ErrorCodes::LOGICAL_ERROR); - } block_rows_end += column_file.getRows(); block_rows_start = block_rows_end; From 9c654537073c634b49f8689c893a7d56b68da038 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Tue, 8 Feb 2022 13:39:14 +0800 Subject: [PATCH 15/23] Add more log and check for minor compaction --- .../Delta/ColumnFilePersistedSet.cpp | 35 ++++++++++++++++--- .../DeltaMerge/Delta/MinorCompaction.cpp | 3 +- .../DeltaMerge/Delta/MinorCompaction.h | 1 + 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index 9cf0fa232f0..ac4c64b0dfb 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -16,9 +16,10 @@ namespace DM inline ColumnFilePersisteds flattenColumnFileLevels(const ColumnFilePersistedSet::ColumnFilePersistedLevels & file_levels) { ColumnFilePersisteds column_files; - for (const auto & level : file_levels) + // Last level first + for (auto level_it = file_levels.rbegin(); level_it != file_levels.rend(); ++level_it) { - for (const auto & file : level) + for (const auto & file : *level_it) { column_files.emplace_back(file); } @@ -377,6 +378,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & return false; } minor_compaction_version += 1; + LOG_FMT_DEBUG(log, "Before commit compaction, level summary: {}", info()); ColumnFilePersistedLevels new_persisted_files_levels; // Copy column files in level range [0, compaction->compaction_src_level) for (size_t i = 0; i < compaction->compaction_src_level; i++) @@ -385,8 +387,30 @@ bool 
ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & for (const auto & f : persisted_files_levels[i]) new_level.push_back(f); } - // Create a new empty level for `compaction_src_level` because all the column files is compacted to next level - new_persisted_files_levels.emplace_back(); + // Copy the files in source level that is not in the compaction task. + // Actually, just level 0 may contain file that is not in the compaction task, because flush and compaction can happen concurrently. + // For other levels, we always compact all the files in the level. + // And because compaction is a single threaded process, so there will be no new files compacted to the source level at the same time. + const auto & old_src_level_files = persisted_files_levels[compaction->compaction_src_level]; + auto old_src_level_files_iter = old_src_level_files.begin(); + for (auto & task : compaction->tasks) + { + for (auto & file : task.to_compact) + { + if (unlikely(old_src_level_files_iter == old_src_level_files.end() + || (file->getId() != (*old_src_level_files_iter)->getId()) + || (file->getRows() != (*old_src_level_files_iter)->getRows()))) + { + throw Exception("Compaction algorithm broken", ErrorCodes::LOGICAL_ERROR); + } + old_src_level_files_iter++; + } + } + auto & src_level_files = new_persisted_files_levels.emplace_back(); + while (old_src_level_files_iter != old_src_level_files.end()) + { + src_level_files.emplace_back(*old_src_level_files_iter); + } // Add new file to the target level auto target_level = compaction->compaction_src_level + 1; auto & target_level_files = new_persisted_files_levels.emplace_back(); @@ -404,7 +428,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & else target_level_files.push_back(task.result); } - // Copy column files in level range [compaction->compaction_src_level + 1, +inf) if exists + // Copy column files in level range [target_level + 1, +inf) if exists for (size_t i = target_level + 
1; i < persisted_files_levels.size(); i++) { auto & new_level = new_persisted_files_levels.emplace_back(); @@ -421,6 +445,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & /// Commit updates in memory. persisted_files_levels.swap(new_persisted_files_levels); updateColumnFileStats(); + LOG_FMT_DEBUG(log, "After commit compaction, level summary: {}", info()); return true; } diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp index ad824b3f919..9b5365972f3 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.cpp @@ -48,6 +48,7 @@ void MinorCompaction::prepare(DMContext & context, WriteBatches & wbs, const Pag total_compact_files += task.to_compact.size(); total_compact_rows += compact_rows; + result_compact_files += 1; } } @@ -58,7 +59,7 @@ bool MinorCompaction::commit(ColumnFilePersistedSetPtr & persisted_file_set, Wri String MinorCompaction::info() const { - return fmt::format("Compacted {} column files into {} column files, total {} rows.", total_compact_files, tasks.size(), total_compact_rows); + return fmt::format("Compacted {} column files into {} column files, total {} rows.", total_compact_files, result_compact_files, total_compact_rows); } } // namespace DM } // namespace DB diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h index 3e92d634c8f..7f81cd5f462 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h @@ -47,6 +47,7 @@ class MinorCompaction : public std::enable_shared_from_this size_t total_compact_files = 0; size_t total_compact_rows = 0; + size_t result_compact_files = 0; public: MinorCompaction(size_t compaction_src_level_, size_t current_compaction_version_); From de4800c605aab6f68e01e41ed8e286200380627b Mon Sep 17 00:00:00 2001 
From: lidezhu Date: Thu, 10 Feb 2022 14:57:54 +0800 Subject: [PATCH 16/23] add gtest for delta value space --- dbms/src/DataStreams/ConcatBlockInputStream.h | 4 +- .../ColumnFile/ColumnFileSetReader.cpp | 5 +- .../ColumnFile/ColumnFileSetReader.h | 2 +- .../DeltaMerge/DMSegmentThreadInputStream.h | 9 +- .../Delta/ColumnFilePersistedSet.cpp | 18 +- .../DeltaMerge/Delta/ColumnFilePersistedSet.h | 1 + .../DeltaMerge/Delta/DeltaValueSpace.h | 6 + .../Storages/DeltaMerge/Delta/MemTableSet.cpp | 15 +- .../DeltaMerge/Delta/MinorCompaction.h | 7 +- .../Storages/DeltaMerge/Delta/Snapshot.cpp | 7 + dbms/src/Storages/DeltaMerge/Segment.cpp | 20 +- .../Storages/DeltaMerge/tests/CMakeLists.txt | 3 + .../tests/gtest_dm_delta_value_space.cpp | 649 ++++++++++++++++++ 13 files changed, 722 insertions(+), 24 deletions(-) create mode 100644 dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_value_space.cpp diff --git a/dbms/src/DataStreams/ConcatBlockInputStream.h b/dbms/src/DataStreams/ConcatBlockInputStream.h index 0ed8a34df60..270daeb6c08 100644 --- a/dbms/src/DataStreams/ConcatBlockInputStream.h +++ b/dbms/src/DataStreams/ConcatBlockInputStream.h @@ -29,8 +29,8 @@ class ConcatBlockInputStream : public IProfilingBlockInputStream protected: Block readImpl() override { - FilterPtr filter; - return readImpl(filter, false); + FilterPtr filter_; + return readImpl(filter_, false); } Block readImpl(FilterPtr & res_filter, bool return_filter) override diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp index b786e46b26d..310f5be68a7 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.cpp @@ -35,7 +35,8 @@ std::pair findColumnFile(const ColumnFiles & column_files, size_ } else { - rows_count += column_file->getRows(); + size_t column_file_rows = column_file->getRows(); + rows_count += column_file_rows; if 
(rows_count > rows_offset) { if (unlikely(deletes_count != deletes_offset)) @@ -45,7 +46,7 @@ std::pair findColumnFile(const ColumnFiles & column_files, size_ + ", deletes_offset: " + DB::toString(deletes_offset), ErrorCodes::LOGICAL_ERROR); - return {column_file_index, column_file->getRows() - (rows_count - rows_offset)}; + return {column_file_index, column_file_rows - (rows_count - rows_offset)}; } } } diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h index 340a62bb4ee..1ace097ccbf 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h @@ -35,7 +35,7 @@ class ColumnFileSetReader const ColumnDefinesPtr & col_defs_, const RowKeyRange & segment_range_); - // If we need to read columns besides pk and version, a DeltaValueReader can NOT be used more than once. + // If we need to read columns besides pk and version, a ColumnFileSetReader can NOT be used more than once. // This method create a new reader based on then current one. It will reuse some caches in the current reader. 
ColumnFileSetReaderPtr createNewReader(const ColumnDefinesPtr & new_col_defs); diff --git a/dbms/src/Storages/DeltaMerge/DMSegmentThreadInputStream.h b/dbms/src/Storages/DeltaMerge/DMSegmentThreadInputStream.h index f4379a2c871..f31826103d3 100644 --- a/dbms/src/Storages/DeltaMerge/DMSegmentThreadInputStream.h +++ b/dbms/src/Storages/DeltaMerge/DMSegmentThreadInputStream.h @@ -44,13 +44,14 @@ class DMSegmentThreadInputStream : public IProfilingBlockInputStream , expected_block_size(expected_block_size_) , is_raw(is_raw_) , do_range_filter_for_raw(do_range_filter_for_raw_) - , log(getMPPTaskLog(log_, getNameImpl())) + , log(getMPPTaskLog(log_, name)) { } - String getName() const override { return getNameImpl(); } - // Add this function because static analysis forbids calling virtual function in constructor - inline String getNameImpl() const { return "DeltaMergeSegmentThread"; } + static constexpr auto name = "DeltaMergeSegmentThread"; + + String getName() const override { return name; } + Block getHeader() const override { return header; } protected: diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp index ac4c64b0dfb..acd76319900 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.cpp @@ -372,7 +372,7 @@ MinorCompactionPtr ColumnFilePersistedSet::pickUpMinorCompaction(DMContext & con bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & compaction, WriteBatches & wbs) { - if (compaction->current_compaction_version != minor_compaction_version) + if (compaction->getCompactionVersion() != minor_compaction_version) { LOG_FMT_WARNING(log, "Structure has been updated during compact"); return false; @@ -380,8 +380,9 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & minor_compaction_version += 1; LOG_FMT_DEBUG(log, "Before commit 
compaction, level summary: {}", info()); ColumnFilePersistedLevels new_persisted_files_levels; - // Copy column files in level range [0, compaction->compaction_src_level) - for (size_t i = 0; i < compaction->compaction_src_level; i++) + auto compaction_src_level = compaction->getCompactionSourceLevel(); + // Copy column files in level range [0, compaction_src_level) + for (size_t i = 0; i < compaction_src_level; i++) { auto & new_level = new_persisted_files_levels.emplace_back(); for (const auto & f : persisted_files_levels[i]) @@ -391,11 +392,11 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & // Actually, just level 0 may contain file that is not in the compaction task, because flush and compaction can happen concurrently. // For other levels, we always compact all the files in the level. // And because compaction is a single threaded process, so there will be no new files compacted to the source level at the same time. - const auto & old_src_level_files = persisted_files_levels[compaction->compaction_src_level]; + const auto & old_src_level_files = persisted_files_levels[compaction_src_level]; auto old_src_level_files_iter = old_src_level_files.begin(); - for (auto & task : compaction->tasks) + for (const auto & task : compaction->getTasks()) { - for (auto & file : task.to_compact) + for (const auto & file : task.to_compact) { if (unlikely(old_src_level_files_iter == old_src_level_files.end() || (file->getId() != (*old_src_level_files_iter)->getId()) @@ -410,9 +411,10 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & while (old_src_level_files_iter != old_src_level_files.end()) { src_level_files.emplace_back(*old_src_level_files_iter); + old_src_level_files_iter++; } // Add new file to the target level - auto target_level = compaction->compaction_src_level + 1; + auto target_level = compaction_src_level + 1; auto & target_level_files = new_persisted_files_levels.emplace_back(); // Copy the old 
column files in the target level first if exists if (persisted_files_levels.size() > target_level) @@ -421,7 +423,7 @@ bool ColumnFilePersistedSet::installCompactionResults(const MinorCompactionPtr & target_level_files.emplace_back(column_file); } // Add the compaction result to new target level - for (auto & task : compaction->tasks) + for (const auto & task : compaction->getTasks()) { if (task.is_trivial_move) target_level_files.push_back(task.to_compact[0]); diff --git a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h index eb5ec2df915..ee3fca3c01d 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h +++ b/dbms/src/Storages/DeltaMerge/Delta/ColumnFilePersistedSet.h @@ -105,6 +105,7 @@ class ColumnFilePersistedSet : public std::enable_shared_from_this /// Only called after reboot. static DeltaValueSpacePtr restore(DMContext & context, const RowKeyRange & segment_range, PageId id); + /// The following two methods are just for test purposes + MemTableSetPtr getMemTableSet() const { return mem_table_set; } + ColumnFilePersistedSetPtr getPersistedFileSet() const { return persisted_file_set; } + String simpleInfo() const { return "Delta [" + DB::toString(persisted_file_set->getId()) + "]"; } String info() const { @@ -299,6 +303,8 @@ class DeltaValueSnapshot : public std::enable_shared_from_thisgetStorageSnapshot(); } const auto & getSharedDeltaIndex() { return shared_delta_index; } + + String toString() const; }; class DeltaValueReader diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp index 51f667a9c78..276e04301a0 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp @@ -167,9 +167,18 @@ ColumnFileSetSnapshotPtr MemTableSet::createSnapshot() size_t total_deletes = 0; for (const auto & file : column_files) { - // All column files in MemTableSet 
is constant(except append data to the cache object in ColumnFileInMemory), - // and we always create new column file object when flushing, so it's safe to reuse the column file object here. - snap->column_files.push_back(file); + // ColumnFileInMemory object may still be appendable after creating this snapshot, and it will change some of its internal state. + // So it's safe to create a new object and just share the cache object with the original ColumnFileInMemory object instance. + if (auto * m = file->tryToInMemoryFile(); m) + { + // Compact threads could update the value of ColumnTinyFile::cache, + // and since ColumnFile is not multi-threads safe, we should create a new column file object. + snap->column_files.push_back(std::make_shared(*m)); + } + else + { + snap->column_files.push_back(file); + } total_rows += file->getRows(); total_deletes += file->getDeletes(); } diff --git a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h index 7f81cd5f462..fa4e872eb3b 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h +++ b/dbms/src/Storages/DeltaMerge/Delta/MinorCompaction.h @@ -16,8 +16,6 @@ using MinorCompactionPtr = std::shared_ptr; /// For `ColumnFileBig` and `ColumnFileDeleteRange`, it just moves it to the next level. 
class MinorCompaction : public std::enable_shared_from_this { - friend class ColumnFilePersistedSet; - public: struct Task { @@ -76,6 +74,11 @@ class MinorCompaction : public std::enable_shared_from_this return is_trivial_move; } + const Tasks & getTasks() const { return tasks; } + + size_t getCompactionSourceLevel() const { return compaction_src_level; } + size_t getCompactionVersion() const { return current_compaction_version; } + /// Create new column file by combining several small `ColumnFileTiny`s void prepare(DMContext & context, WriteBatches & wbs, const PageReader & reader); diff --git a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp index 853f6c59735..dafb74dd39d 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp @@ -46,6 +46,13 @@ RowKeyRange DeltaValueSnapshot::getSquashDeleteRange() const return persisted_files_snap->getSquashDeleteRange(); } } +String DeltaValueSnapshot::toString() const +{ + String info = columnFilesToString(persisted_files_snap->getColumnFiles()); + if (mem_table_snap) + return info += columnFilesToString(mem_table_snap->getColumnFiles()); + return info; +} // ================================================ // DeltaValueReader diff --git a/dbms/src/Storages/DeltaMerge/Segment.cpp b/dbms/src/Storages/DeltaMerge/Segment.cpp index 0774a5dec72..0c1b35ef419 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.cpp +++ b/dbms/src/Storages/DeltaMerge/Segment.cpp @@ -1485,12 +1485,28 @@ std::pair Segment::ensurePlace(const DMContext & dm_context } if (unlikely(my_placed_rows != delta_snap->getRows() || my_placed_deletes != delta_snap->getDeletes())) + { + String items_info; + for (auto & item : items) + { + if (item.isBlock()) + { + items_info += "block " + toString(item.getBlockOffset()) + " rows " + toString(item.getBlock().rows()) + " "; + } + else + { + items_info += "delete "; + } + } throw Exception( - fmt::format("Placed 
status not match! Expected place rows:{}, deletes:{}, but actually placed rows:{}, deletes:{}", + fmt::format("Placed status not match! Expected place rows:{}, deletes:{}, but actually placed rows:{}, deletes:{}, items detail:{}, snap detail:{}", delta_snap->getRows(), delta_snap->getDeletes(), my_placed_rows, - my_placed_deletes)); + my_placed_deletes, + items_info, + delta_snap->toString())); + } my_delta_index->update(my_delta_tree, my_placed_rows, my_placed_deletes); diff --git a/dbms/src/Storages/DeltaMerge/tests/CMakeLists.txt b/dbms/src/Storages/DeltaMerge/tests/CMakeLists.txt index cb97d7c9043..b1f961e9c95 100644 --- a/dbms/src/Storages/DeltaMerge/tests/CMakeLists.txt +++ b/dbms/src/Storages/DeltaMerge/tests/CMakeLists.txt @@ -43,6 +43,9 @@ target_link_libraries(dm_test_minmax_index dbms gtest_main clickhouse_functions) add_executable(dm_test_delta_index_manager EXCLUDE_FROM_ALL gtest_dm_delta_index_manager.cpp) target_link_libraries(dm_test_delta_index_manager dbms gtest_main clickhouse_functions) +add_executable(dm_test_delta_value_space EXCLUDE_FROM_ALL gtest_dm_delta_value_space.cpp) +target_link_libraries(dm_test_delta_value_space dbms gtest_main clickhouse_functions) + add_subdirectory (bank EXCLUDE_FROM_ALL) add_subdirectory (stress EXCLUDE_FROM_ALL) diff --git a/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_value_space.cpp b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_value_space.cpp new file mode 100644 index 00000000000..8b63e9c1485 --- /dev/null +++ b/dbms/src/Storages/DeltaMerge/tests/gtest_dm_delta_value_space.cpp @@ -0,0 +1,649 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "dm_basic_include.h" + +namespace CurrentMetrics +{ +extern const Metric DT_SnapshotOfRead; +} // namespace CurrentMetrics +namespace DB +{ +namespace DM +{ +extern DMFilePtr writeIntoNewDMFile(DMContext & dm_context, // + const ColumnDefinesPtr & schema_snap, + const BlockInputStreamPtr & 
input_stream, + UInt64 file_id, + const String & parent_path, + DMFileBlockOutputStream::Flags flags); +namespace tests +{ +void assertBlocksEqual(const Blocks & blocks1, const Blocks & blocks2) +{ + ASSERT_EQ(blocks1.size(), blocks2.size()); + for (size_t i = 0; i < blocks1.size(); ++i) + ASSERT_EQ(blocks1[i].rows(), blocks2[i].rows()); + + // use hash to check the read results + SipHash hash1; + for (const auto & block : blocks1) + block.updateHash(hash1); + + SipHash hash2; + for (const auto & block : blocks2) + block.updateHash(hash2); + + ASSERT_EQ(hash1.get64(), hash2.get64()); +} + +class DeltaValueSpaceTest : public DB::base::TiFlashStorageTestBasic +{ +public: + static void SetUpTestCase() {} + + void SetUp() override + { + TiFlashStorageTestBasic::SetUp(); + table_columns = std::make_shared(); + + delta = reload(); + ASSERT_EQ(delta->getId(), delta_id); + } + +protected: + DeltaValueSpacePtr reload(const ColumnDefinesPtr & pre_define_columns = {}, DB::Settings && db_settings = DB::Settings()) + { + TiFlashStorageTestBasic::reload(std::move(db_settings)); + storage_path_pool = std::make_unique(db_context->getPathPool().withTable("test", "t1", false)); + storage_pool = std::make_unique("test.t1", *storage_path_pool, *db_context, db_context->getSettingsRef()); + storage_pool->restore(); + ColumnDefinesPtr cols = (!pre_define_columns) ? 
DMTestEnv::getDefaultColumns() : pre_define_columns; + setColumns(cols); + + return std::make_unique(delta_id); + } + + // setColumns should update dm_context at the same time + void setColumns(const ColumnDefinesPtr & columns) + { + *table_columns = *columns; + + dm_context = std::make_unique(*db_context, + *storage_path_pool, + *storage_pool, + 0, + /*min_version_*/ 0, + settings.not_compress_columns, + false, + 1, + db_context->getSettingsRef()); + } + + const ColumnDefinesPtr & tableColumns() const { return table_columns; } + + DMContext & dmContext() { return *dm_context; } + +protected: + /// all these var lives as ref in dm_context + std::unique_ptr storage_path_pool; + std::unique_ptr storage_pool; + ColumnDefinesPtr table_columns; + DM::DeltaMergeStore::Settings settings; + /// dm_context + std::unique_ptr dm_context; + + // the delta we are going to test + DeltaValueSpacePtr delta; + + static constexpr PageId delta_id = 1; + static constexpr size_t num_rows_write_per_batch = 100; +}; + +void appendBlockToDeltaValueSpace(DMContext & context, DeltaValueSpacePtr delta, size_t rows_start, size_t rows_num, UInt64 tso = 2) +{ + Block block = DMTestEnv::prepareSimpleWriteBlock(rows_start, rows_start + rows_num, false, tso); + delta->appendToCache(context, block, 0, block.rows()); +} + +void appendColumnFileTinyToDeltaValueSpace(DMContext & context, DeltaValueSpacePtr delta, size_t rows_start, size_t rows_num, WriteBatches & wbs, UInt64 tso = 2) +{ + Block block = DMTestEnv::prepareSimpleWriteBlock(rows_start, rows_start + rows_num, false, tso); + auto tiny_file = ColumnFileTiny::writeColumnFile(context, block, 0, block.rows(), wbs); + wbs.writeLogAndData(); + delta->appendColumnFile(context, tiny_file); +} + +void appendColumnFileBigToDeltaValueSpace(DMContext & context, ColumnDefines column_defines, DeltaValueSpacePtr delta, size_t rows_start, size_t rows_num, WriteBatches & wbs, UInt64 tso = 2) +{ + Block block = DMTestEnv::prepareSimpleWriteBlock(rows_start, 
rows_start + rows_num, false, tso); + auto delegator = context.path_pool.getStableDiskDelegator(); + auto file_id = context.storage_pool.newDataPageIdForDTFile(delegator, __PRETTY_FUNCTION__); + auto input_stream = std::make_shared(block); + auto store_path = delegator.choosePath(); + auto dmfile + = writeIntoNewDMFile(context, std::make_shared(column_defines), input_stream, file_id, store_path, {}); + delegator.addDTFile(file_id, dmfile->getBytesOnDisk(), store_path); + + auto & pk_column = block.getByPosition(0).column; + auto min_pk = pk_column->getInt(0); + auto max_pk = pk_column->getInt(block.rows() - 1); + HandleRange range(min_pk, max_pk + 1); + + auto column_file = std::make_shared(context, dmfile, RowKeyRange::fromHandleRange(range)); + wbs.data.putExternal(file_id, 0); + wbs.writeLogAndData(); + delta->ingestColumnFiles(context, RowKeyRange::fromHandleRange(range), {column_file}, false); +} + +TEST_F(DeltaValueSpaceTest, WriteRead) +{ + Blocks write_blocks; + size_t total_rows_write = 0; + // write data to memory and read it + { + Block block = DMTestEnv::prepareSimpleWriteBlock(0, num_rows_write_per_batch, false); + write_blocks.push_back(block); + delta->appendToCache(dmContext(), block, 0, block.rows()); + total_rows_write += num_rows_write_per_batch; + // read + auto snapshot = delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + auto rows = snapshot->getRows(); + ASSERT_EQ(rows, total_rows_write); + auto reader = std::make_shared( + dmContext(), + snapshot, + table_columns, + RowKeyRange::newAll(false, 1)); + { + auto columns = block.cloneEmptyColumns(); + ASSERT_EQ(reader->readRows(columns, 0, total_rows_write, nullptr), total_rows_write); + Blocks result_blocks; + result_blocks.push_back(block.cloneWithColumns(std::move(columns))); + assertBlocksEqual(write_blocks, result_blocks); + } + // read with a specific range + { + auto columns = block.cloneEmptyColumns(); + RowKeyRange read_range = 
RowKeyRange::fromHandleRange(HandleRange(0, num_rows_write_per_batch / 2)); + ASSERT_EQ(reader->readRows(columns, 0, total_rows_write, &read_range), num_rows_write_per_batch / 2); + } + } + + // flush data to disk and read again + { + ASSERT_EQ(delta->getUnsavedRows(), total_rows_write); + delta->flush(dmContext()); + ASSERT_EQ(delta->getUnsavedRows(), 0); + auto snapshot = delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + auto rows = snapshot->getRows(); + ASSERT_EQ(rows, total_rows_write); + auto reader = std::make_shared( + dmContext(), + snapshot, + table_columns, + RowKeyRange::newAll(false, 1)); + { + auto columns = write_blocks[0].cloneEmptyColumns(); + ASSERT_EQ(reader->readRows(columns, 0, total_rows_write, nullptr), total_rows_write); + Blocks result_blocks; + result_blocks.push_back(write_blocks[0].cloneWithColumns(std::move(columns))); + assertBlocksEqual(write_blocks, result_blocks); + } + // read with a specific range + { + auto columns = write_blocks[0].cloneEmptyColumns(); + RowKeyRange read_range = RowKeyRange::fromHandleRange(HandleRange(0, num_rows_write_per_batch / 2)); + ASSERT_EQ(reader->readRows(columns, 0, total_rows_write, &read_range), num_rows_write_per_batch / 2); + } + } + + // write more data to memory and read again + { + Block block = DMTestEnv::prepareSimpleWriteBlock(total_rows_write, total_rows_write + num_rows_write_per_batch, false); + write_blocks.push_back(block); + delta->appendToCache(dmContext(), block, 0, block.rows()); + total_rows_write += num_rows_write_per_batch; + // read + auto snapshot = delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + auto rows = snapshot->getRows(); + ASSERT_EQ(rows, total_rows_write); + auto reader = std::make_shared( + dmContext(), + snapshot, + table_columns, + RowKeyRange::newAll(false, 1)); + { + size_t total_read_rows = 0; + Blocks result_blocks; + while (total_read_rows < total_rows_write) + { + auto columns = 
block.cloneEmptyColumns(); + size_t read_rows = reader->readRows(columns, total_read_rows, num_rows_write_per_batch, nullptr); + ASSERT_EQ(read_rows, num_rows_write_per_batch); + total_read_rows += read_rows; + result_blocks.push_back(block.cloneWithColumns(std::move(columns))); + } + ASSERT_EQ(total_read_rows, total_rows_write); + assertBlocksEqual(write_blocks, result_blocks); + } + // read with a specific range + { + auto columns = block.cloneEmptyColumns(); + RowKeyRange read_range = RowKeyRange::fromHandleRange(HandleRange(0, num_rows_write_per_batch + num_rows_write_per_batch / 2)); + ASSERT_EQ(reader->readRows(columns, 0, total_rows_write, &read_range), num_rows_write_per_batch + num_rows_write_per_batch / 2); + } + } + + // flush to disk, write a delete range and write more data + { + ASSERT_EQ(delta->getUnsavedRows(), num_rows_write_per_batch); + delta->flush(dmContext()); + ASSERT_EQ(delta->getUnsavedRows(), 0); + // the actual delete range value doesn't matter + delta->appendDeleteRange(dmContext(), RowKeyRange::fromHandleRange(HandleRange(0, num_rows_write_per_batch))); + + Block block = DMTestEnv::prepareSimpleWriteBlock(total_rows_write, total_rows_write + num_rows_write_per_batch, false); + write_blocks.push_back(block); + delta->appendToCache(dmContext(), block, 0, block.rows()); + total_rows_write += num_rows_write_per_batch; + // read + auto snapshot = delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + auto rows = snapshot->getRows(); + ASSERT_EQ(rows, total_rows_write); + auto reader = std::make_shared( + dmContext(), + snapshot, + table_columns, + RowKeyRange::newAll(false, 1)); + { + size_t total_read_rows = 0; + Blocks result_blocks; + while (total_read_rows < total_rows_write) + { + auto columns = block.cloneEmptyColumns(); + size_t read_rows = reader->readRows(columns, total_read_rows, num_rows_write_per_batch, nullptr); + ASSERT_EQ(read_rows, num_rows_write_per_batch); + total_read_rows += read_rows; + 
result_blocks.push_back(block.cloneWithColumns(std::move(columns))); + } + ASSERT_EQ(total_read_rows, total_rows_write); + assertBlocksEqual(write_blocks, result_blocks); + } + // read with a specific range + { + auto columns = block.cloneEmptyColumns(); + RowKeyRange read_range = RowKeyRange::fromHandleRange(HandleRange(0, 2 * num_rows_write_per_batch + num_rows_write_per_batch / 2)); + ASSERT_EQ(reader->readRows(columns, 0, total_rows_write, &read_range), 2 * num_rows_write_per_batch + num_rows_write_per_batch / 2); + } + } +} + +// Write data to MemTableSet when do flush at the same time +TEST_F(DeltaValueSpaceTest, Flush) +{ + auto mem_table_set = delta->getMemTableSet(); + auto persisted_file_set = delta->getPersistedFileSet(); + WriteBatches wbs(dmContext().storage_pool, dmContext().getWriteLimiter()); + size_t total_rows_write = 0; + // write some column_file + { + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + { + delta->appendDeleteRange(dmContext(), RowKeyRange::fromHandleRange(HandleRange(0, num_rows_write_per_batch))); + } + { + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + } + } + // build flush task and finish prepare stage + ColumnFileFlushTaskPtr flush_task; + { + flush_task = mem_table_set->buildFlushTask(dmContext(), persisted_file_set->getRows(), persisted_file_set->getDeletes(), persisted_file_set->getCurrentFlushVersion()); + ASSERT_EQ(flush_task->getTaskNum(), 3); + ASSERT_EQ(flush_task->getFlushRows(), 2 * num_rows_write_per_batch); + ASSERT_EQ(flush_task->getFlushDeletes(), 1); + flush_task->prepare(wbs); + } + // another thread write more data to the delta value space + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + // 
commit the flush task and check the status after flush + { + ASSERT_TRUE(flush_task->commit(persisted_file_set, wbs)); + ASSERT_EQ(persisted_file_set->getRows(), 2 * num_rows_write_per_batch); + ASSERT_EQ(persisted_file_set->getDeletes(), 1); + ASSERT_EQ(mem_table_set->getRows(), total_rows_write - persisted_file_set->getRows()); + } +} + +TEST_F(DeltaValueSpaceTest, MinorCompaction) +{ + auto persisted_file_set = delta->getPersistedFileSet(); + WriteBatches wbs(dmContext().storage_pool, dmContext().getWriteLimiter()); + size_t total_rows_write = 0; + // write some column_file and flush + { + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + { + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + } + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + { + delta->appendDeleteRange(dmContext(), RowKeyRange::fromHandleRange(HandleRange(0, num_rows_write_per_batch))); + } + delta->flush(dmContext()); + } + // build compaction task and finish prepare stage + MinorCompactionPtr compaction_task; + { + PageStorage::SnapshotPtr log_storage_snap = dmContext().storage_pool.log()->getSnapshot(); + PageReader reader(dmContext().storage_pool.log(), std::move(log_storage_snap), dmContext().getReadLimiter()); + compaction_task = persisted_file_set->pickUpMinorCompaction(dmContext()); + ASSERT_EQ(compaction_task->getCompactionSourceLevel(), 0); + // There should be two compaction sub_tasks. + // The first task try to compact the first three column files to a larger one, + // and the second task is just a trivial move for the last column file which is a delete range. 
+ const auto & tasks = compaction_task->getTasks(); + ASSERT_EQ(tasks.size(), 2); + ASSERT_EQ(tasks[0].to_compact.size(), 3); + ASSERT_EQ(tasks[0].is_trivial_move, false); + ASSERT_EQ(tasks[1].to_compact.size(), 1); + ASSERT_EQ(tasks[1].is_trivial_move, true); + compaction_task->prepare(dmContext(), wbs, reader); + } + // another thread write more data to the delta value space and flush it + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + delta->flush(dmContext()); + ASSERT_EQ(delta->getUnsavedRows(), 0); + ASSERT_EQ(persisted_file_set->getRows(), total_rows_write); + ASSERT_EQ(persisted_file_set->getDeletes(), 1); + ASSERT_EQ(persisted_file_set->getColumnFileCount(), 5); + } + // commit the compaction task and check the status + { + ASSERT_TRUE(compaction_task->commit(persisted_file_set, wbs)); + ASSERT_EQ(persisted_file_set->getRows(), total_rows_write); + ASSERT_EQ(persisted_file_set->getDeletes(), 1); + ASSERT_EQ(persisted_file_set->getColumnFileCount(), 3); + } + // after compaction, the column file in persisted_file_set should be like the following: + // level 0: T_100 + // level 1: T_300, D_0_100 + // so there is no compaction task to do + { + compaction_task = persisted_file_set->pickUpMinorCompaction(dmContext()); + ASSERT_TRUE(!compaction_task); + } + // do a lot of minor compaction and check the status + { + for (size_t i = 0; i < 20; i++) + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + delta->flush(dmContext()); + while (true) + { + PageStorage::SnapshotPtr log_storage_snap = dmContext().storage_pool.log()->getSnapshot(); + PageReader reader(dmContext().storage_pool.log(), std::move(log_storage_snap), dmContext().getReadLimiter()); + auto minor_compaction_task = persisted_file_set->pickUpMinorCompaction(dmContext()); + if (!minor_compaction_task) + break; 
+ minor_compaction_task->prepare(dmContext(), wbs, reader); + minor_compaction_task->commit(persisted_file_set, wbs); + } + wbs.writeRemoves(); + ASSERT_EQ(persisted_file_set->getRows(), total_rows_write); + ASSERT_EQ(persisted_file_set->getDeletes(), 1); + } + } +} + +TEST_F(DeltaValueSpaceTest, Restore) +{ + auto persisted_file_set = delta->getPersistedFileSet(); + size_t total_rows_write = 0; + // write some column_file, flush and compact it + { + WriteBatches wbs(dmContext().storage_pool, dmContext().getWriteLimiter()); + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + { + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + } + { + delta->appendDeleteRange(dmContext(), RowKeyRange::fromHandleRange(HandleRange(0, num_rows_write_per_batch))); + } + delta->flush(dmContext()); + delta->compact(dmContext()); + // after compaction, the two ColumnFileTiny must be compacted to a large column file, so there are just two column files left. 
+ ASSERT_EQ(delta->getColumnFileCount(), 2); + } + // write more data and flush it, and then there are two levels in the persisted_file_set + { + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + delta->flush(dmContext()); + ASSERT_EQ(persisted_file_set->getColumnFileLevelCount(), 2); + ASSERT_EQ(delta->getColumnFileCount(), 3); + ASSERT_EQ(delta->getRows(), total_rows_write); + } + // check the column file order remain the same after restore + { + Blocks old_delta_blocks; + { + auto old_delta_snapshot = delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + DeltaValueInputStream old_delta_stream(dmContext(), old_delta_snapshot, table_columns, RowKeyRange::newAll(false, 1)); + old_delta_stream.readPrefix(); + while (true) + { + auto block = old_delta_stream.read(); + if (!block) + break; + old_delta_blocks.push_back(std::move(block)); + } + old_delta_stream.readSuffix(); + } + Blocks new_delta_blocks; + { + auto new_delta = delta->restore(dmContext(), RowKeyRange::newAll(false, 1), delta_id); + auto new_delta_snapshot = new_delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + DeltaValueInputStream new_delta_stream(dmContext(), new_delta_snapshot, table_columns, RowKeyRange::newAll(false, 1)); + new_delta_stream.readPrefix(); + while (true) + { + auto block = new_delta_stream.read(); + if (!block) + break; + new_delta_blocks.push_back(std::move(block)); + } + new_delta_stream.readSuffix(); + } + assertBlocksEqual(old_delta_blocks, new_delta_blocks); + } +} + +TEST_F(DeltaValueSpaceTest, CheckHeadAndCloneTail) +{ + auto persisted_file_set = delta->getPersistedFileSet(); + size_t total_rows_write = 0; + WriteBatches wbs(dmContext().storage_pool, dmContext().getWriteLimiter()); + // create three levels in persisted_file_set + { + // one column file in level 1 + { + appendBlockToDeltaValueSpace(dmContext(), delta, 
total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + { + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + } + delta->flush(dmContext()); + delta->compact(dmContext()); + ASSERT_EQ(delta->getColumnFileCount(), 1); + ASSERT_EQ(persisted_file_set->getColumnFileLevelCount(), 2); + // one column files in level 2 + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + { + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + } + delta->flush(dmContext()); + // compact two level 0 files to level 1 + delta->compact(dmContext()); + // compact two level 1 files to level 2 + delta->compact(dmContext()); + ASSERT_EQ(delta->getColumnFileCount(), 1); + ASSERT_EQ(persisted_file_set->getColumnFileLevelCount(), 3); + // one column files in level 1 and one column files in level 2 + { + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + } + { + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + } + delta->flush(dmContext()); + delta->compact(dmContext()); + ASSERT_EQ(delta->getColumnFileCount(), 2); + ASSERT_EQ(persisted_file_set->getColumnFileLevelCount(), 3); + } + { + auto snapshot = delta->createSnapshot(dmContext(), true, CurrentMetrics::DT_SnapshotOfRead); + auto snapshot_rows = snapshot->getRows(); + ASSERT_EQ(snapshot_rows, total_rows_write); + // write some more column file to persisted_file_set and memory_table_set + for (size_t i = 0; i < 2; i++) + { + // ColumnFileInMemory + appendBlockToDeltaValueSpace(dmContext(), delta, 
total_rows_write, num_rows_write_per_batch); + total_rows_write += num_rows_write_per_batch; + // ColumnFileDeleteRange + delta->appendDeleteRange(dmContext(), RowKeyRange::newAll(false, 1)); + // ColumnFileTiny + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + // ColumnFileBig + appendColumnFileBigToDeltaValueSpace(dmContext(), *table_columns, delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + if (i == 0) + delta->flush(dmContext()); + } + auto [persisted_column_files, in_memory_files] = delta->checkHeadAndCloneTail(dmContext(), RowKeyRange::newAll(false, 1), snapshot->getColumnFilesInSnapshot(), wbs); + wbs.writeLogAndData(); + ASSERT_EQ(persisted_column_files.size(), 4); + ASSERT_EQ(in_memory_files.size(), 4); + size_t tail_rows = 0; + for (const auto & file : persisted_column_files) + tail_rows += file->getRows(); + for (const auto & file : in_memory_files) + tail_rows += file->getRows(); + ASSERT_EQ(snapshot_rows + tail_rows, total_rows_write); + } +} + +TEST_F(DeltaValueSpaceTest, GetPlaceItems) +{ + size_t total_rows_write = 0; + WriteBatches wbs(dmContext().storage_pool, dmContext().getWriteLimiter()); + // write some data to persisted_file_set and mem_table_set + { + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + appendColumnFileTinyToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch, wbs); + total_rows_write += num_rows_write_per_batch; + delta->flush(dmContext()); + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + total_rows_write += 
num_rows_write_per_batch; + } + // read + { + auto snapshot = delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + auto rows = snapshot->getRows(); + ASSERT_EQ(rows, total_rows_write); + // write some more data after create snapshot + appendBlockToDeltaValueSpace(dmContext(), delta, total_rows_write, num_rows_write_per_batch); + ASSERT_EQ(delta->getRows(true), total_rows_write + num_rows_write_per_batch); + auto reader = std::make_shared( + dmContext(), + snapshot, + table_columns, + RowKeyRange::newAll(false, 1)); + auto place_items = reader->getPlaceItems(0, 0, snapshot->getRows(), snapshot->getDeletes()); + ASSERT_EQ(place_items.size(), 2); + size_t total_place_rows = 0; + for (auto & item : place_items) + { + ASSERT_EQ(item.isBlock(), true); + auto block = item.getBlock(); + total_place_rows += block.rows(); + } + ASSERT_EQ(total_place_rows, total_rows_write); + } +} +TEST_F(DeltaValueSpaceTest, ShouldPlace) +{ + size_t tso = 100; + WriteBatches wbs(dmContext().storage_pool, dmContext().getWriteLimiter()); + appendBlockToDeltaValueSpace(dmContext(), delta, 0, num_rows_write_per_batch, tso); + { + auto snapshot = delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + auto reader = std::make_shared( + dmContext(), + snapshot, + table_columns, + RowKeyRange::newAll(false, 1)); + ASSERT_TRUE(reader->shouldPlace(dmContext(), snapshot->getSharedDeltaIndex(), RowKeyRange::newAll(false, 1), RowKeyRange::fromHandleRange(HandleRange(0, 100)), tso + 1)); + ASSERT_FALSE(reader->shouldPlace(dmContext(), snapshot->getSharedDeltaIndex(), RowKeyRange::newAll(false, 1), RowKeyRange::fromHandleRange(HandleRange(0, 100)), tso - 1)); + } + { + delta->flush(dmContext()); + auto snapshot = delta->createSnapshot(dmContext(), false, CurrentMetrics::DT_SnapshotOfRead); + auto reader = std::make_shared( + dmContext(), + snapshot, + table_columns, + RowKeyRange::newAll(false, 1)); + ASSERT_TRUE(reader->shouldPlace(dmContext(), 
snapshot->getSharedDeltaIndex(), RowKeyRange::newAll(false, 1), RowKeyRange::fromHandleRange(HandleRange(0, 100)), tso + 1)); + ASSERT_FALSE(reader->shouldPlace(dmContext(), snapshot->getSharedDeltaIndex(), RowKeyRange::newAll(false, 1), RowKeyRange::fromHandleRange(HandleRange(0, 100)), tso - 1)); + } +} +} // namespace tests +} // namespace DM +} // namespace DB From 0ef3617fef604da8ef3651658964965eaf9c546f Mon Sep 17 00:00:00 2001 From: lidezhu Date: Thu, 10 Feb 2022 15:09:29 +0800 Subject: [PATCH 17/23] small improvement on code comment --- dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp index 276e04301a0..2881f13fbf2 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/MemTableSet.cpp @@ -167,8 +167,8 @@ ColumnFileSetSnapshotPtr MemTableSet::createSnapshot() size_t total_deletes = 0; for (const auto & file : column_files) { - // ColumnFileInMemory object may still be appendable after creating this snapshot, and it will change some of its internal state. - // So it's safe to create a new object and just share the cache object with the original ColumnFileInMemory object instance. + // ColumnFile is not a thread-safe object, but only ColumnFileInMemory may be appendable after its creation. + // So we only clone the instance of ColumnFileInMemory here. 
if (auto * m = file->tryToInMemoryFile(); m) { // Compact threads could update the value of ColumnTinyFile::cache, From ec499d7ad80ca31d3fbd1ab3a7d0b42d73f2baa2 Mon Sep 17 00:00:00 2001 From: lidezhu Date: Thu, 10 Feb 2022 17:18:28 +0800 Subject: [PATCH 18/23] remove some debug log --- .../DeltaMerge/Delta/DeltaValueSpace.h | 2 -- .../src/Storages/DeltaMerge/Delta/Snapshot.cpp | 7 ------- dbms/src/Storages/DeltaMerge/Segment.cpp | 18 ++---------------- 3 files changed, 2 insertions(+), 25 deletions(-) diff --git a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h index 8ced001fdde..d245f5ec9d7 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h +++ b/dbms/src/Storages/DeltaMerge/Delta/DeltaValueSpace.h @@ -303,8 +303,6 @@ class DeltaValueSnapshot : public std::enable_shared_from_thisgetStorageSnapshot(); } const auto & getSharedDeltaIndex() { return shared_delta_index; } - - String toString() const; }; class DeltaValueReader diff --git a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp index dafb74dd39d..853f6c59735 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp @@ -46,13 +46,6 @@ RowKeyRange DeltaValueSnapshot::getSquashDeleteRange() const return persisted_files_snap->getSquashDeleteRange(); } } -String DeltaValueSnapshot::toString() const -{ - String info = columnFilesToString(persisted_files_snap->getColumnFiles()); - if (mem_table_snap) - return info += columnFilesToString(mem_table_snap->getColumnFiles()); - return info; -} // ================================================ // DeltaValueReader diff --git a/dbms/src/Storages/DeltaMerge/Segment.cpp b/dbms/src/Storages/DeltaMerge/Segment.cpp index 0c1b35ef419..0d92e81bbf5 100644 --- a/dbms/src/Storages/DeltaMerge/Segment.cpp +++ b/dbms/src/Storages/DeltaMerge/Segment.cpp @@ -1486,26 +1486,12 @@ std::pair 
Segment::ensurePlace(const DMContext & dm_context if (unlikely(my_placed_rows != delta_snap->getRows() || my_placed_deletes != delta_snap->getDeletes())) { - String items_info; - for (auto & item : items) - { - if (item.isBlock()) - { - items_info += "block " + toString(item.getBlockOffset()) + " rows " + toString(item.getBlock().rows()) + " "; - } - else - { - items_info += "delete "; - } - } throw Exception( - fmt::format("Placed status not match! Expected place rows:{}, deletes:{}, but actually placed rows:{}, deletes:{}, items detail:{}, snap detail:{}", + fmt::format("Placed status not match! Expected place rows:{}, deletes:{}, but actually placed rows:{}, deletes:{}", delta_snap->getRows(), delta_snap->getDeletes(), my_placed_rows, - my_placed_deletes, - items_info, - delta_snap->toString())); + my_placed_deletes)); } my_delta_index->update(my_delta_tree, my_placed_rows, my_placed_deletes); From 78258d41316e1a7f922a523f5a8decdd3cd3b97e Mon Sep 17 00:00:00 2001 From: lidezhu Date: Mon, 14 Feb 2022 17:51:13 +0800 Subject: [PATCH 19/23] do some more rename --- dbms/src/Interpreters/Settings.h | 4 ++-- .../DeltaMerge/ColumnFile/ColumnFile_V2.cpp | 16 ++++++++-------- dbms/src/Storages/DeltaMerge/DMContext.h | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index 30000203313..53bffbe1465 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -253,8 +253,8 @@ struct Settings M(SettingUInt64, dt_segment_stop_write_delta_size, 2147483648, "Delta size before stop new writes. 2 GB by default.") \ M(SettingUInt64, dt_segment_delta_cache_limit_rows, 4096, "Max rows of cache in segment delta in DeltaTree Engine.") \ M(SettingUInt64, dt_segment_delta_cache_limit_size, 4194304, "Max size of cache in segment delta in DeltaTree Engine. 
4 MB by default.") \ - M(SettingUInt64, dt_segment_delta_small_column_file_rows, 2048, "Determine whether a column file in delta is small or not.") \ - M(SettingUInt64, dt_segment_delta_small_column_file_size, 8388608, "Determine whether a column file in delta is small or not. 8MB by default.") \ + M(SettingUInt64, dt_segment_delta_small_pack_rows, 2048, "Determine whether a column file in delta is small or not.") \ + M(SettingUInt64, dt_segment_delta_small_pack_size, 8388608, "Determine whether a column file in delta is small or not. 8MB by default.") \ M(SettingUInt64, dt_segment_stable_pack_rows, DEFAULT_MERGE_BLOCK_SIZE, "Expected stable pack rows in DeltaTree Engine.") \ M(SettingFloat, dt_segment_wait_duration_factor, 1, "The factor of wait duration in a write stall.") \ M(SettingUInt64, dt_bg_gc_check_interval, 600, "Background gc thread check interval, the unit is second.") \ diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp index 36a95a89d0c..29ff0b15642 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFile_V2.cpp @@ -16,10 +16,10 @@ struct ColumnFileV2 bool isDeleteRange() const { return !delete_range.none(); } }; -using ColumnFile_V2Ptr = std::shared_ptr; -using ColumnFiles_V2 = std::vector; +using ColumnFileV2Ptr = std::shared_ptr; +using ColumnFileV2s = std::vector; -inline ColumnFilePersisteds transform_V2_to_V3(const ColumnFiles_V2 & column_files_v2) +inline ColumnFilePersisteds transform_V2_to_V3(const ColumnFileV2s & column_files_v2) { ColumnFilePersisteds column_files_v3; for (const auto & f : column_files_v2) @@ -35,9 +35,9 @@ inline ColumnFilePersisteds transform_V2_to_V3(const ColumnFiles_V2 & column_fil return column_files_v3; } -inline ColumnFiles_V2 transformSaved_V3_to_V2(const ColumnFilePersisteds & column_files_v3) +inline ColumnFileV2s transformSaved_V3_to_V2(const ColumnFilePersisteds 
& column_files_v3) { - ColumnFiles_V2 column_files_v2; + ColumnFileV2s column_files_v2; for (const auto & f : column_files_v3) { auto * f_v2 = new ColumnFileV2(); @@ -85,7 +85,7 @@ inline void serializeColumnFile_V2(const ColumnFileV2 & column_file, const Block } } -void serializeSavedColumnFiles_V2(WriteBuffer & buf, const ColumnFiles_V2 & column_files) +void serializeSavedColumnFiles_V2(WriteBuffer & buf, const ColumnFileV2s & column_files) { writeIntBinary(column_files.size(), buf); BlockPtr last_schema; @@ -116,7 +116,7 @@ void serializeSavedColumnFilesInV2Format(WriteBuffer & buf, const ColumnFilePers serializeSavedColumnFiles_V2(buf, transformSaved_V3_to_V2(column_files)); } -inline ColumnFile_V2Ptr deserializeColumnFile_V2(ReadBuffer & buf, UInt64 version) +inline ColumnFileV2Ptr deserializeColumnFile_V2(ReadBuffer & buf, UInt64 version) { auto column_file = std::make_shared(); readIntBinary(column_file->rows, buf); @@ -147,7 +147,7 @@ ColumnFilePersisteds deserializeSavedColumnFilesInV2Format(ReadBuffer & buf, UIn { size_t size; readIntBinary(size, buf); - ColumnFiles_V2 column_files; + ColumnFileV2s column_files; BlockPtr last_schema; for (size_t i = 0; i < size; ++i) { diff --git a/dbms/src/Storages/DeltaMerge/DMContext.h b/dbms/src/Storages/DeltaMerge/DMContext.h index 24eb0ed18be..9e98cee174c 100644 --- a/dbms/src/Storages/DeltaMerge/DMContext.h +++ b/dbms/src/Storages/DeltaMerge/DMContext.h @@ -95,8 +95,8 @@ struct DMContext : private boost::noncopyable , delta_limit_bytes(settings.dt_segment_delta_limit_size) , delta_cache_limit_rows(settings.dt_segment_delta_cache_limit_rows) , delta_cache_limit_bytes(settings.dt_segment_delta_cache_limit_size) - , delta_small_column_file_rows(settings.dt_segment_delta_small_column_file_rows) - , delta_small_column_file_bytes(settings.dt_segment_delta_small_column_file_size) + , delta_small_column_file_rows(settings.dt_segment_delta_small_pack_rows) // still use `pack` instead of `column_file` in Settings.h for the 
compatibility of configure file + , delta_small_column_file_bytes(settings.dt_segment_delta_small_pack_size) , stable_pack_rows(settings.dt_segment_stable_pack_rows) , enable_logical_split(settings.dt_enable_logical_split) , read_delta_only(settings.dt_read_delta_only) From 471148923ebdb16d8fac99eafeaec64884111faf Mon Sep 17 00:00:00 2001 From: lidezhu Date: Tue, 15 Feb 2022 11:27:55 +0800 Subject: [PATCH 20/23] fix gtest for config rename --- dbms/src/Server/tests/gtest_server_config.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dbms/src/Server/tests/gtest_server_config.cpp b/dbms/src/Server/tests/gtest_server_config.cpp index 72e292ffa4a..686e4ed1acd 100644 --- a/dbms/src/Server/tests/gtest_server_config.cpp +++ b/dbms/src/Server/tests/gtest_server_config.cpp @@ -175,8 +175,8 @@ dt_storage_pool_data_gc_max_valid_rate = 0.5 ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_file_num, 8); ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_legacy_num, 2); ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_bytes, 256); - ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_column_file_size, 8388608); - ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_column_file_rows, 2048); + ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_pack_size, 8388608); + ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_pack_rows, 2048); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_limit_size, 536870912); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_limit_size, 42991616); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_force_merge_delta_size, 1073741824); From 5d00c12b37cbe6bc3ecad5ce0d4317020a66186e Mon Sep 17 00:00:00 2001 From: lidezhu Date: Wed, 16 Feb 2022 19:26:03 +0800 Subject: [PATCH 21/23] fix DeltaValueReader::shouldPlace --- dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp index 853f6c59735..a8a61ea93ce 100644 --- a/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp +++ b/dbms/src/Storages/DeltaMerge/Delta/Snapshot.cpp @@ -151,8 +151,9 @@ bool DeltaValueReader::shouldPlace(const DMContext & context, || placed_delete_ranges != delta_snap->getDeletes()) return true; + size_t rows_in_persisted_file_snap = delta_snap->getMemTableSetRowsOffset(); return persisted_files_reader->shouldPlace(context, relevant_range, max_version, placed_rows) - || (mem_table_reader && mem_table_reader->shouldPlace(context, relevant_range, max_version, placed_rows)); + || (mem_table_reader && mem_table_reader->shouldPlace(context, relevant_range, max_version, placed_rows <= rows_in_persisted_file_snap ? 0 : placed_rows - rows_in_persisted_file_snap)); } } // namespace DB::DM From 5e3e40c71bbc809f41122f86ddcdd036900da1a2 Mon Sep 17 00:00:00 2001 From: lidezhu <47731263+lidezhu@users.noreply.github.com> Date: Wed, 16 Feb 2022 19:26:46 +0800 Subject: [PATCH 22/23] Update dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h Co-authored-by: Flowyi --- dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h index 1ace097ccbf..8b32111939b 100644 --- a/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h +++ b/dbms/src/Storages/DeltaMerge/ColumnFile/ColumnFileSetReader.h @@ -36,7 +36,7 @@ class ColumnFileSetReader const RowKeyRange & segment_range_); // If we need to read columns besides pk and version, a ColumnFileSetReader can NOT be used more than once. - // This method create a new reader based on then current one. It will reuse some caches in the current reader. + // This method create a new reader based on the current one. 
It will reuse some caches in the current reader. ColumnFileSetReaderPtr createNewReader(const ColumnDefinesPtr & new_col_defs); // Use for DeltaMergeBlockInputStream to read rows from MemTableSet to do full compaction with other layer. From dc960cd202a123e4614b5fa3792f91cf4a5ee1ce Mon Sep 17 00:00:00 2001 From: lidezhu Date: Wed, 16 Feb 2022 19:50:07 +0800 Subject: [PATCH 23/23] use `column_file` in config instead of `pack` --- dbms/src/Interpreters/Settings.h | 6 ++++-- dbms/src/Server/tests/gtest_server_config.cpp | 4 ++-- dbms/src/Storages/DeltaMerge/DMContext.h | 4 ++-- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/dbms/src/Interpreters/Settings.h b/dbms/src/Interpreters/Settings.h index 53bffbe1465..e35a57d219e 100644 --- a/dbms/src/Interpreters/Settings.h +++ b/dbms/src/Interpreters/Settings.h @@ -253,8 +253,10 @@ struct Settings M(SettingUInt64, dt_segment_stop_write_delta_size, 2147483648, "Delta size before stop new writes. 2 GB by default.") \ M(SettingUInt64, dt_segment_delta_cache_limit_rows, 4096, "Max rows of cache in segment delta in DeltaTree Engine.") \ M(SettingUInt64, dt_segment_delta_cache_limit_size, 4194304, "Max size of cache in segment delta in DeltaTree Engine. 4 MB by default.") \ - M(SettingUInt64, dt_segment_delta_small_pack_rows, 2048, "Deprecated. Reserved for backward compatibility. Use dt_segment_delta_small_column_file_rows instead") \ + M(SettingUInt64, dt_segment_delta_small_pack_size, 8388608, "Deprecated. Reserved for backward compatibility. Use dt_segment_delta_small_column_file_size instead") \ + M(SettingUInt64, dt_segment_delta_small_column_file_rows, 2048, "Determine whether a column file in delta is small or not. 
2048 rows by default.") \ + M(SettingUInt64, dt_segment_delta_small_column_file_size, 8388608, "Determine whether a column file in delta is small or not. 8MB by default.") \ M(SettingUInt64, dt_segment_stable_pack_rows, DEFAULT_MERGE_BLOCK_SIZE, "Expected stable pack rows in DeltaTree Engine.") \ M(SettingFloat, dt_segment_wait_duration_factor, 1, "The factor of wait duration in a write stall.") \ M(SettingUInt64, dt_bg_gc_check_interval, 600, "Background gc thread check interval, the unit is second.") \ diff --git a/dbms/src/Server/tests/gtest_server_config.cpp b/dbms/src/Server/tests/gtest_server_config.cpp index 686e4ed1acd..72e292ffa4a 100644 --- a/dbms/src/Server/tests/gtest_server_config.cpp +++ b/dbms/src/Server/tests/gtest_server_config.cpp @@ -175,8 +175,8 @@ dt_storage_pool_data_gc_max_valid_rate = 0.5 ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_file_num, 8); ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_legacy_num, 2); ASSERT_EQ(global_ctx.getSettingsRef().dt_storage_pool_data_gc_min_bytes, 256); - ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_pack_size, 8388608); - ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_pack_rows, 2048); + ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_column_file_size, 8388608); + ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_small_column_file_rows, 2048); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_limit_size, 536870912); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_delta_limit_size, 42991616); ASSERT_EQ(global_ctx.getSettingsRef().dt_segment_force_merge_delta_size, 1073741824); diff --git a/dbms/src/Storages/DeltaMerge/DMContext.h b/dbms/src/Storages/DeltaMerge/DMContext.h index 9e98cee174c..24eb0ed18be 100644 --- a/dbms/src/Storages/DeltaMerge/DMContext.h +++ b/dbms/src/Storages/DeltaMerge/DMContext.h @@ -95,8 +95,8 @@ struct DMContext : private boost::noncopyable , delta_limit_bytes(settings.dt_segment_delta_limit_size) , 
delta_cache_limit_rows(settings.dt_segment_delta_cache_limit_rows) , delta_cache_limit_bytes(settings.dt_segment_delta_cache_limit_size) - , delta_small_column_file_rows(settings.dt_segment_delta_small_pack_rows) // still use `pack` instead of `column_file` in Settings.h for the compatibility of configure file - , delta_small_column_file_bytes(settings.dt_segment_delta_small_pack_size) + , delta_small_column_file_rows(settings.dt_segment_delta_small_column_file_rows) + , delta_small_column_file_bytes(settings.dt_segment_delta_small_column_file_size) , stable_pack_rows(settings.dt_segment_stable_pack_rows) , enable_logical_split(settings.dt_enable_logical_split) , read_delta_only(settings.dt_read_delta_only)