From 721923219ba06b03d04a29ae522b7816977cfcf8 Mon Sep 17 00:00:00 2001 From: kungasc Date: Thu, 28 Mar 2024 22:03:15 +0300 Subject: [PATCH] BTreeIndex Split Flush method, use bigger resolution --- .../flat_page_btree_index_writer.h | 93 ++++++++++--------- ydb/core/tablet_flat/flat_part_writer.h | 6 +- ydb/core/tablet_flat/flat_stat_part.h | 2 +- ydb/core/tablet_flat/flat_stat_table.cpp | 10 +- .../tablet_flat/ut/ut_btree_index_nodes.cpp | 49 +++++----- ydb/core/tx/datashard/datashard.cpp | 2 +- 6 files changed, 81 insertions(+), 81 deletions(-) diff --git a/ydb/core/tablet_flat/flat_page_btree_index_writer.h b/ydb/core/tablet_flat/flat_page_btree_index_writer.h index 2a4627bf7e2d..345c0b9eb4a6 100644 --- a/ydb/core/tablet_flat/flat_page_btree_index_writer.h +++ b/ydb/core/tablet_flat/flat_page_btree_index_writer.h @@ -386,23 +386,35 @@ namespace NKikimr::NTable::NPage { Levels[0].PushChild(child); } + + void Flush(IPageWriter &pager) { + for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) { + bool hasChanges = false; + + // Note: in theory we may want to flush one level multiple times when different triggers are applicable + while (CanFlush(levelIndex)) { + DoFlush(levelIndex, pager, false); + hasChanges = true; + } - std::optional Flush(IPageWriter &pager, bool last) { - Y_ABORT_UNLESS(Levels.size() < Max(), "Levels size is out of bounds"); + if (!hasChanges) { + break; // no more changes + } + } + } + + TBtreeIndexMeta Finish(IPageWriter &pager) { for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) { - if (last && !Levels[levelIndex].GetKeysCount()) { + if (!Levels[levelIndex].GetKeysCount()) { Y_ABORT_UNLESS(Levels[levelIndex].GetChildrenCount() == 1, "Should be root"); - return TBtreeIndexMeta{ Levels[levelIndex].PopChild(), levelIndex, IndexSize }; + Y_ABORT_UNLESS(levelIndex + 1 == Levels.size(), "Should be root"); + return {Levels[levelIndex].PopChild(), levelIndex, IndexSize}; } - if (!TryFlush(levelIndex, pager, last)) { - 
Y_ABORT_UNLESS(!last); - break; - } + DoFlush(levelIndex, pager, true); } - Y_ABORT_UNLESS(!last, "Should have returned root"); - return { }; + Y_ABORT_UNLESS(false, "Should have returned root"); } void Reset() { @@ -415,43 +427,41 @@ namespace NKikimr::NTable::NPage { } private: - bool TryFlush(ui32 levelIndex, IPageWriter &pager, bool last) { - if (!last && Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMax) { - // Note: node should meet both NodeKeysMin and NodeSize restrictions for split + bool CanFlush(ui32 levelIndex) { + const ui64 waitFullNodes = 2; - if (Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMin) { - // not enough keys for split - return false; - } - - // Note: this size check is approximate and we might not produce 2 full-sized pages - if (CalcPageSize(Levels[levelIndex]) <= 2 * NodeTargetSize) { - // not enough bytes for split - return false; - } + if (Levels[levelIndex].GetKeysCount() <= waitFullNodes * NodeKeysMin) { + // node keys min restriction should be always satisfied + return false; } - Writer.EnsureEmpty(); + // Note: size checks are approximate and flush might not produce 2 full-sized pages - // Note: for now we build last nodes from all remaining level's keys - // we may to try splitting them more evenly later + return + Levels[levelIndex].GetKeysCount() > waitFullNodes * NodeKeysMax || + CalcPageSize(Levels[levelIndex]) > waitFullNodes * NodeTargetSize; + } - while (last || Writer.GetKeysCount() < NodeKeysMin || Writer.CalcPageSize() < NodeTargetSize) { - if (!last && Levels[levelIndex].GetKeysCount() < 3) { - // we shouldn't produce empty nodes (but can violate NodeKeysMin restriction) - break; - } - if (!last && Writer.GetKeysCount() >= NodeKeysMax) { - // have enough keys - break; + void DoFlush(ui32 levelIndex, IPageWriter &pager, bool last) { + Writer.EnsureEmpty(); + + if (last) { + // Note: for now we build last nodes from all remaining level's keys + // we may to try splitting them more evenly later + + while 
(Levels[levelIndex].GetKeysCount()) { + Writer.AddChild(Levels[levelIndex].PopChild()); + Writer.AddKey(Levels[levelIndex].PopKey()); } - if (last && !Levels[levelIndex].GetKeysCount()) { - // nothing left - break; + } else { + while (Writer.GetKeysCount() < NodeKeysMin || ( + // can add more to writer if: + Levels[levelIndex].GetKeysCount() > 2 && + Writer.GetKeysCount() < NodeKeysMax && + Writer.CalcPageSize() < NodeTargetSize)) { + Writer.AddChild(Levels[levelIndex].PopChild()); + Writer.AddKey(Levels[levelIndex].PopKey()); } - - Writer.AddChild(Levels[levelIndex].PopChild()); - Writer.AddKey(Levels[levelIndex].PopKey()); } auto lastChild = Levels[levelIndex].PopChild(); Writer.AddChild(lastChild); @@ -462,6 +472,7 @@ namespace NKikimr::NTable::NPage { if (levelIndex + 1 == Levels.size()) { Levels.emplace_back(); + Y_ABORT_UNLESS(Levels.size() < Max(), "Levels size is out of bounds"); } Levels[levelIndex + 1].PushChild(TChild{pageId, lastChild.RowCount, lastChild.DataSize, lastChild.ErasedRowCount}); if (!last) { @@ -475,8 +486,6 @@ namespace NKikimr::NTable::NPage { } else { Y_ABORT_UNLESS(Levels[levelIndex].GetKeysCount(), "Shouldn't leave empty levels"); } - - return true; } size_t CalcPageSize(const TLevel& level) const { diff --git a/ydb/core/tablet_flat/flat_part_writer.h b/ydb/core/tablet_flat/flat_part_writer.h index 4c7b9e226831..b5d97fb3303c 100644 --- a/ydb/core/tablet_flat/flat_part_writer.h +++ b/ydb/core/tablet_flat/flat_part_writer.h @@ -532,12 +532,12 @@ namespace NTable { if (WriteBTreeIndex) { Current.BTreeGroupIndexes.reserve(Groups.size()); for (auto& g : Groups) { - Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value()); + Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Finish(Pager)); } if (Current.HistoryWritten > 0) { Current.BTreeHistoricIndexes.reserve(Histories.size()); for (auto& g : Histories) { - Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value()); + 
Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Finish(Pager)); } } } @@ -807,7 +807,7 @@ namespace NTable { } else { g.BTreeIndex.AddShortChild({page, dataPage->Count, raw.size()}); } - g.BTreeIndex.Flush(Pager, false); + g.BTreeIndex.Flush(Pager); } // N.B. hack to save the last row/key for the main group diff --git a/ydb/core/tablet_flat/flat_stat_part.h b/ydb/core/tablet_flat/flat_stat_part.h index cf898f07ae4e..a0ed7d0b0e59 100644 --- a/ydb/core/tablet_flat/flat_stat_part.h +++ b/ydb/core/tablet_flat/flat_stat_part.h @@ -48,7 +48,7 @@ class TStatsScreenedPartIterator { for (bool historic : {false, true}) { for (ui32 groupIndex : xrange(historic ? Part->HistoricGroupsCount : Part->GroupsCount)) { ui64 groupRowCountResolution, groupDataSizeResolution; - if (groupIndex == 0 && Part->GroupsCount > 1) { + if (groupIndex == 0 && (Part->GroupsCount > 1 || Small || Large)) { // make steps as small as possible because they will affect groups resolution groupRowCountResolution = groupDataSizeResolution = 0; } else { diff --git a/ydb/core/tablet_flat/flat_stat_table.cpp b/ydb/core/tablet_flat/flat_stat_table.cpp index 43ceb8a75412..da5d92a317f5 100644 --- a/ydb/core/tablet_flat/flat_stat_table.cpp +++ b/ydb/core/tablet_flat/flat_stat_table.cpp @@ -12,13 +12,9 @@ bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, u TDataStats iteratorStats = { }; TStatsIterator statsIterator(subset.Scheme->Keys); - TSet epochs; - for (const auto& part : subset.Flatten) { - epochs.insert(part->Epoch); - } - // if rowCountResolution = 300, 3-leveled SST, let's move each iterator up to 25 rows - ui64 iterRowCountResolution = rowCountResolution / Max(1, epochs.size()) / 4; - ui64 iterDataSizeResolution = dataSizeResolution / Max(1, epochs.size()) / 4; + // TODO: make it better + ui64 iterRowCountResolution = rowCountResolution / 2; + ui64 iterDataSizeResolution = dataSizeResolution / 2; // Make index iterators for all parts bool started = true; diff --git 
a/ydb/core/tablet_flat/ut/ut_btree_index_nodes.cpp b/ydb/core/tablet_flat/ut/ut_btree_index_nodes.cpp index 4e0076b684f1..d66d240d0987 100644 --- a/ydb/core/tablet_flat/ut/ut_btree_index_nodes.cpp +++ b/ydb/core/tablet_flat/ut/ut_btree_index_nodes.cpp @@ -1,6 +1,7 @@ #include "flat_page_btree_index.h" #include "flat_page_btree_index_writer.h" #include "test/libs/table/test_writer.h" +#include "ydb/core/tx/datashard/datashard.h" #include #include @@ -505,11 +506,10 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) { builder.AddChild(child); TWriterBundle pager(1, TLogoBlobID()); - auto result = builder.Flush(pager, true); - UNIT_ASSERT(result); + auto result = builder.Finish(pager); TBtreeIndexMeta expected{child, 0, 0}; - UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString()); + UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString()); } Y_UNIT_TEST(OneNode) { @@ -536,15 +536,14 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) { } TWriterBundle pager(1, TLogoBlobID()); - auto result = builder.Flush(pager, true); - UNIT_ASSERT(result); + auto result = builder.Finish(pager); - Dump(*result, builder.GroupInfo, pager.Back()); + Dump(result, builder.GroupInfo, pager.Back()); TBtreeIndexMeta expected{{0, 1155, 11055, 385}, 1, 595}; - UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString()); + UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString()); - CheckKeys(result->PageId, keys, builder.GroupInfo, pager.Back()); + CheckKeys(result.PageId, keys, builder.GroupInfo, pager.Back()); } Y_UNIT_TEST(FewNodes) { @@ -569,16 +568,21 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) { TSerializedCellVec deserialized(keys[i]); builder.AddKey(deserialized.GetCells()); builder.AddChild(children[i + 1]); - UNIT_ASSERT(!builder.Flush(pager, false)); + builder.Flush(pager); } - auto result = builder.Flush(pager, true); - UNIT_ASSERT(result); + auto result = builder.Finish(pager); - Dump(*result, builder.GroupInfo, pager.Back()); - - 
UNIT_ASSERT_VALUES_EQUAL(result->LevelCount, 3); + Dump(result, builder.GroupInfo, pager.Back()); + TBtreeIndexMeta expected{{9, 0, 0, 0}, 3, 1550}; + for (auto c : children) { + expected.RowCount += c.RowCount; + expected.DataSize += c.DataSize; + expected.ErasedRowCount += c.ErasedRowCount; + } + UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString()); + auto checkKeys = [&](TPageId pageId, const TVector& keys) { CheckKeys(pageId, keys, builder.GroupInfo, pager.Back()); }; @@ -624,14 +628,6 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) { checkKeys(9, { keys[8] }); - - TBtreeIndexMeta expected{{9, 0, 0, 0}, 3, 1550}; - for (auto c : children) { - expected.RowCount += c.RowCount; - expected.DataSize += c.DataSize; - expected.ErasedRowCount += c.ErasedRowCount; - } - UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString()); } Y_UNIT_TEST(SplitBySize) { @@ -656,16 +652,15 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) { TSerializedCellVec deserialized(keys[i]); builder.AddKey(deserialized.GetCells()); builder.AddChild(children[i + 1]); - UNIT_ASSERT(!builder.Flush(pager, false)); + builder.Flush(pager); } - auto result = builder.Flush(pager, true); - UNIT_ASSERT(result); + auto result = builder.Finish(pager); - Dump(*result, builder.GroupInfo, pager.Back()); + Dump(result, builder.GroupInfo, pager.Back()); TBtreeIndexMeta expected{{15, 15150, 106050, 8080}, 3, 10270}; - UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString()); + UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString()); } } diff --git a/ydb/core/tx/datashard/datashard.cpp b/ydb/core/tx/datashard/datashard.cpp index 5fdec1361b14..e8386f06df31 100644 --- a/ydb/core/tx/datashard/datashard.cpp +++ b/ydb/core/tx/datashard/datashard.cpp @@ -30,7 +30,7 @@ using namespace NSchemeShard; using namespace NTabletFlatExecutor; // NOTE: We really want to batch log records by default in datashards!
-// But in unittests we want to test both scenarios +// But in unit tests we want to test both scenarios bool gAllowLogBatchingDefaultValue = true; TDuration gDbStatsReportInterval = TDuration::Seconds(10);