Skip to content

Commit

Permalink
BTreeIndex Split Flush method, use bigger resolution
Browse files Browse the repository at this point in the history
  • Loading branch information
kunga committed Mar 28, 2024
1 parent 17708e5 commit 7219232
Show file tree
Hide file tree
Showing 6 changed files with 81 additions and 81 deletions.
93 changes: 51 additions & 42 deletions ydb/core/tablet_flat/flat_page_btree_index_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,23 +386,35 @@ namespace NKikimr::NTable::NPage {

Levels[0].PushChild(child);
}

void Flush(IPageWriter &pager) {
for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
bool hasChanges = false;

// Note: in theory we may want to flush one level multiple times when different triggers are applicable
while (CanFlush(levelIndex)) {
DoFlush(levelIndex, pager, false);
hasChanges = true;
}

std::optional<TBtreeIndexMeta> Flush(IPageWriter &pager, bool last) {
Y_ABORT_UNLESS(Levels.size() < Max<ui32>(), "Levels size is out of bounds");
if (!hasChanges) {
break; // no more changes
}
}
}

TBtreeIndexMeta Finish(IPageWriter &pager) {
for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
if (last && !Levels[levelIndex].GetKeysCount()) {
if (!Levels[levelIndex].GetKeysCount()) {
Y_ABORT_UNLESS(Levels[levelIndex].GetChildrenCount() == 1, "Should be root");
return TBtreeIndexMeta{ Levels[levelIndex].PopChild(), levelIndex, IndexSize };
Y_ABORT_UNLESS(levelIndex + 1 == Levels.size(), "Should be root");
return {Levels[levelIndex].PopChild(), levelIndex, IndexSize};
}

if (!TryFlush(levelIndex, pager, last)) {
Y_ABORT_UNLESS(!last);
break;
}
DoFlush(levelIndex, pager, true);
}

Y_ABORT_UNLESS(!last, "Should have returned root");
return { };
Y_ABORT_UNLESS(false, "Should have returned root");
}

void Reset() {
Expand All @@ -415,43 +427,41 @@ namespace NKikimr::NTable::NPage {
}

private:
bool TryFlush(ui32 levelIndex, IPageWriter &pager, bool last) {
if (!last && Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMax) {
// Note: node should meet both NodeKeysMin and NodeSize restrictions for split
bool CanFlush(ui32 levelIndex) {
const ui64 waitFullNodes = 2;

if (Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMin) {
// not enough keys for split
return false;
}

// Note: this size check is approximate and we might not produce 2 full-sized pages
if (CalcPageSize(Levels[levelIndex]) <= 2 * NodeTargetSize) {
// not enough bytes for split
return false;
}
if (Levels[levelIndex].GetKeysCount() <= waitFullNodes * NodeKeysMin) {
// node keys min restriction should be always satisfied
return false;
}

Writer.EnsureEmpty();
// Note: size checks are approximate and flush might not produce 2 full-sized pages

// Note: for now we build last nodes from all remaining level's keys
// we may try splitting them more evenly later
return
Levels[levelIndex].GetKeysCount() > waitFullNodes * NodeKeysMax ||
CalcPageSize(Levels[levelIndex]) > waitFullNodes * NodeTargetSize;
}

while (last || Writer.GetKeysCount() < NodeKeysMin || Writer.CalcPageSize() < NodeTargetSize) {
if (!last && Levels[levelIndex].GetKeysCount() < 3) {
// we shouldn't produce empty nodes (but can violate NodeKeysMin restriction)
break;
}
if (!last && Writer.GetKeysCount() >= NodeKeysMax) {
// have enough keys
break;
void DoFlush(ui32 levelIndex, IPageWriter &pager, bool last) {
Writer.EnsureEmpty();

if (last) {
// Note: for now we build last nodes from all remaining level's keys
// we may try splitting them more evenly later

while (Levels[levelIndex].GetKeysCount()) {
Writer.AddChild(Levels[levelIndex].PopChild());
Writer.AddKey(Levels[levelIndex].PopKey());
}
if (last && !Levels[levelIndex].GetKeysCount()) {
// nothing left
break;
} else {
while (Writer.GetKeysCount() < NodeKeysMin || (
// can add more to writer if:
Levels[levelIndex].GetKeysCount() > 2 &&
Writer.GetKeysCount() < NodeKeysMax &&
Writer.CalcPageSize() < NodeTargetSize)) {
Writer.AddChild(Levels[levelIndex].PopChild());
Writer.AddKey(Levels[levelIndex].PopKey());
}

Writer.AddChild(Levels[levelIndex].PopChild());
Writer.AddKey(Levels[levelIndex].PopKey());
}
auto lastChild = Levels[levelIndex].PopChild();
Writer.AddChild(lastChild);
Expand All @@ -462,6 +472,7 @@ namespace NKikimr::NTable::NPage {

if (levelIndex + 1 == Levels.size()) {
Levels.emplace_back();
Y_ABORT_UNLESS(Levels.size() < Max<ui32>(), "Levels size is out of bounds");
}
Levels[levelIndex + 1].PushChild(TChild{pageId, lastChild.RowCount, lastChild.DataSize, lastChild.ErasedRowCount});
if (!last) {
Expand All @@ -475,8 +486,6 @@ namespace NKikimr::NTable::NPage {
} else {
Y_ABORT_UNLESS(Levels[levelIndex].GetKeysCount(), "Shouldn't leave empty levels");
}

return true;
}

size_t CalcPageSize(const TLevel& level) const {
Expand Down
6 changes: 3 additions & 3 deletions ydb/core/tablet_flat/flat_part_writer.h
Original file line number Diff line number Diff line change
Expand Up @@ -532,12 +532,12 @@ namespace NTable {
if (WriteBTreeIndex) {
Current.BTreeGroupIndexes.reserve(Groups.size());
for (auto& g : Groups) {
Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value());
Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Finish(Pager));
}
if (Current.HistoryWritten > 0) {
Current.BTreeHistoricIndexes.reserve(Histories.size());
for (auto& g : Histories) {
Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value());
Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Finish(Pager));
}
}
}
Expand Down Expand Up @@ -807,7 +807,7 @@ namespace NTable {
} else {
g.BTreeIndex.AddShortChild({page, dataPage->Count, raw.size()});
}
g.BTreeIndex.Flush(Pager, false);
g.BTreeIndex.Flush(Pager);
}

// N.B. hack to save the last row/key for the main group
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/tablet_flat/flat_stat_part.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ class TStatsScreenedPartIterator {
for (bool historic : {false, true}) {
for (ui32 groupIndex : xrange(historic ? Part->HistoricGroupsCount : Part->GroupsCount)) {
ui64 groupRowCountResolution, groupDataSizeResolution;
if (groupIndex == 0 && Part->GroupsCount > 1) {
if (groupIndex == 0 && (Part->GroupsCount > 1 || Small || Large)) {
// make steps as small as possible because they will affect groups resolution
groupRowCountResolution = groupDataSizeResolution = 0;
} else {
Expand Down
10 changes: 3 additions & 7 deletions ydb/core/tablet_flat/flat_stat_table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,9 @@ bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, u
TDataStats iteratorStats = { };
TStatsIterator statsIterator(subset.Scheme->Keys);

TSet<TEpoch> epochs;
for (const auto& part : subset.Flatten) {
epochs.insert(part->Epoch);
}
// if rowCountResolution = 300, 3-leveled SST, let's move each iterator up to 25 rows
ui64 iterRowCountResolution = rowCountResolution / Max<ui64>(1, epochs.size()) / 4;
ui64 iterDataSizeResolution = dataSizeResolution / Max<ui64>(1, epochs.size()) / 4;
// TODO: make it better
ui64 iterRowCountResolution = rowCountResolution / 2;
ui64 iterDataSizeResolution = dataSizeResolution / 2;

// Make index iterators for all parts
bool started = true;
Expand Down
49 changes: 22 additions & 27 deletions ydb/core/tablet_flat/ut/ut_btree_index_nodes.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "flat_page_btree_index.h"
#include "flat_page_btree_index_writer.h"
#include "test/libs/table/test_writer.h"
#include "ydb/core/tx/datashard/datashard.h"
#include <ydb/core/tablet_flat/test/libs/rows/layout.h>
#include <library/cpp/testing/unittest/registar.h>

Expand Down Expand Up @@ -505,11 +506,10 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
builder.AddChild(child);

TWriterBundle pager(1, TLogoBlobID());
auto result = builder.Flush(pager, true);
UNIT_ASSERT(result);
auto result = builder.Finish(pager);

TBtreeIndexMeta expected{child, 0, 0};
UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString());
UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString());
}

Y_UNIT_TEST(OneNode) {
Expand All @@ -536,15 +536,14 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
}

TWriterBundle pager(1, TLogoBlobID());
auto result = builder.Flush(pager, true);
UNIT_ASSERT(result);
auto result = builder.Finish(pager);

Dump(*result, builder.GroupInfo, pager.Back());
Dump(result, builder.GroupInfo, pager.Back());

TBtreeIndexMeta expected{{0, 1155, 11055, 385}, 1, 595};
UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString());
UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString());

CheckKeys(result->PageId, keys, builder.GroupInfo, pager.Back());
CheckKeys(result.PageId, keys, builder.GroupInfo, pager.Back());
}

Y_UNIT_TEST(FewNodes) {
Expand All @@ -569,16 +568,21 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
TSerializedCellVec deserialized(keys[i]);
builder.AddKey(deserialized.GetCells());
builder.AddChild(children[i + 1]);
UNIT_ASSERT(!builder.Flush(pager, false));
builder.Flush(pager);
}

auto result = builder.Flush(pager, true);
UNIT_ASSERT(result);
auto result = builder.Finish(pager);

Dump(*result, builder.GroupInfo, pager.Back());

UNIT_ASSERT_VALUES_EQUAL(result->LevelCount, 3);
Dump(result, builder.GroupInfo, pager.Back());

TBtreeIndexMeta expected{{9, 0, 0, 0}, 3, 1550};
for (auto c : children) {
expected.RowCount += c.RowCount;
expected.DataSize += c.DataSize;
expected.ErasedRowCount += c.ErasedRowCount;
}
UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result.ToString());

auto checkKeys = [&](TPageId pageId, const TVector<TString>& keys) {
CheckKeys(pageId, keys, builder.GroupInfo, pager.Back());
};
Expand Down Expand Up @@ -624,14 +628,6 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
checkKeys(9, {
keys[8]
});

TBtreeIndexMeta expected{{9, 0, 0, 0}, 3, 1550};
for (auto c : children) {
expected.RowCount += c.RowCount;
expected.DataSize += c.DataSize;
expected.ErasedRowCount += c.ErasedRowCount;
}
UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString());
}

Y_UNIT_TEST(SplitBySize) {
Expand All @@ -656,16 +652,15 @@ Y_UNIT_TEST_SUITE(TBtreeIndexBuilder) {
TSerializedCellVec deserialized(keys[i]);
builder.AddKey(deserialized.GetCells());
builder.AddChild(children[i + 1]);
UNIT_ASSERT(!builder.Flush(pager, false));
builder.Flush(pager);
}

auto result = builder.Flush(pager, true);
UNIT_ASSERT(result);
auto result = builder.Finish(pager);

Dump(*result, builder.GroupInfo, pager.Back());
Dump(result, builder.GroupInfo, pager.Back());

TBtreeIndexMeta expected{{15, 15150, 106050, 8080}, 3, 10270};
UNIT_ASSERT_EQUAL_C(*result, expected, "Got " + result->ToString());
UNIT_ASSERT_EQUAL_C(result, expected, "Got " + result->ToString());
}

}
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/tx/datashard/datashard.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ using namespace NSchemeShard;
using namespace NTabletFlatExecutor;

// NOTE: We really want to batch log records by default in datashards!
// But in unittests we want to test both scenarios
// But in unit tests we want to test both scenarios
bool gAllowLogBatchingDefaultValue = true;

TDuration gDbStatsReportInterval = TDuration::Seconds(10);
Expand Down

0 comments on commit 7219232

Please sign in to comment.