
Commit

Merge 8d775e7 into 8418282
kunga authored Mar 26, 2024
2 parents 8418282 + 8d775e7 commit e2eabd4
Showing 11 changed files with 180 additions and 95 deletions.
134 changes: 91 additions & 43 deletions ydb/core/tablet_flat/flat_page_btree_index_writer.h
@@ -319,6 +319,8 @@ namespace NKikimr::NTable::NPage {
                Y_ABORT_UNLESS(Children);
                TChild result = Children.front();
                Children.pop_front();
+               PrevDataSize = result.DataSize;
+               PrevRowCount = result.RowCount;
                return result;
            }

@@ -334,15 +336,41 @@ namespace NKikimr::NTable::NPage {
                return Children.size();
            }

+           ui64 GetPrevRowCount() const {
+               return PrevRowCount;
+           }
+
+           ui64 GetPrevDataSize() const {
+               return PrevDataSize;
+           }
+
+           ui64 GetNextRowCount(ui64 prevRowCount) const {
+               return Children[1].RowCount - prevRowCount;
+           }
+
+           ui64 GetNextDataSize(ui64 prevDataSize) const {
+               return Children[1].DataSize - prevDataSize;
+           }
+
+           ui64 GetRowCount() const {
+               return Children.back().RowCount - PrevRowCount;
+           }
+
+           ui64 GetDataSize() const {
+               return Children.back().DataSize - PrevDataSize;
+           }
+
        private:
            size_t KeysSize = 0;
+           ui64 PrevRowCount = 0;
+           ui64 PrevDataSize = 0;
            TDeque<TString> Keys;
            TDeque<TChild> Children;
        };

    public:
        TBtreeIndexBuilder(TIntrusiveConstPtr<TPartScheme> scheme, TGroupId groupId,
-               ui32 nodeTargetSize, ui32 nodeKeysMin, ui32 nodeKeysMax)
+               ui32 nodeTargetSize, ui32 nodeKeysMin, ui32 nodeKeysMax, ui32 leafDataSizeMax, ui32 leafRowsCountMax)
            : Scheme(std::move(scheme))
            , GroupId(groupId)
            , GroupInfo(Scheme->GetLayout(groupId))
@@ -351,6 +379,8 @@
            , NodeTargetSize(nodeTargetSize)
            , NodeKeysMin(nodeKeysMin)
            , NodeKeysMax(nodeKeysMax)
+           , LeafDataSizeMax(leafDataSizeMax)
+           , LeafRowsCountMax(leafRowsCountMax)
        {
            Y_ABORT_UNLESS(NodeTargetSize > 0);
            Y_ABORT_UNLESS(NodeKeysMin > 0);
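In these TLevel changes, each TChild carries running totals (RowCount, DataSize), while the new PrevRowCount/PrevDataSize fields remember how much of those totals PopChild has already consumed; the added getters therefore return deltas against that baseline (GetNextRowCount/GetNextDataSize peek one child ahead via Children[1]). A standalone sketch of the same bookkeeping, with simplified stand-in types rather than the real NPage classes:

#include <cstdint>
#include <deque>

struct Child { uint64_t RowCount, DataSize; };    // running totals up to and including this child

struct Level {
    std::deque<Child> Children;
    uint64_t PrevRowCount = 0, PrevDataSize = 0;  // totals already consumed by PopChild

    Child PopChild() {
        Child c = Children.front();
        Children.pop_front();
        PrevRowCount = c.RowCount;                // remember what has been handed out
        PrevDataSize = c.DataSize;
        return c;
    }

    // rows/bytes still represented by this level = last running total minus the consumed part
    uint64_t GetRowCount() const { return Children.back().RowCount - PrevRowCount; }
    uint64_t GetDataSize() const { return Children.back().DataSize - PrevDataSize; }
};

int main() {
    Level level;
    level.Children = {{100, 7000}, {250, 15000}, {400, 26000}};  // cumulative counters
    level.PopChild();                             // consumes the first 100 rows / 7000 bytes
    // remaining: 400 - 100 = 300 rows and 26000 - 7000 = 19000 bytes
    return level.GetRowCount() == 300 && level.GetDataSize() == 19000 ? 0 : 1;
}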
@@ -386,23 +416,35 @@

            Levels[0].PushChild(child);
        }

-       std::optional<TBtreeIndexMeta> Flush(IPageWriter &pager, bool last) {
-           Y_ABORT_UNLESS(Levels.size() < Max<ui32>(), "Levels size is out of bounds");
-           for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
-               if (last && !Levels[levelIndex].GetKeysCount()) {
-                   Y_ABORT_UNLESS(Levels[levelIndex].GetChildrenCount() == 1, "Should be root");
-                   return TBtreeIndexMeta{ Levels[levelIndex].PopChild(), levelIndex, IndexSize };
-               }
-
-               if (!TryFlush(levelIndex, pager, last)) {
-                   Y_ABORT_UNLESS(!last);
-                   break;
-               }
-           }
-
-           Y_ABORT_UNLESS(!last, "Should have returned root");
-           return { };
-       }
+       void Flush(IPageWriter &pager) {
+           for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
+               bool hasChanges = false;
+
+               // Note: in theory we may want to flush one level multiple times when different triggers are applicable
+               while (CanFlush(levelIndex)) {
+                   DoFlush(levelIndex, pager, false);
+                   hasChanges = true;
+               }
+
+               if (!hasChanges) {
+                   break; // no more changes
+               }
+           }
+       }
+
+       TBtreeIndexMeta Finish(IPageWriter &pager) {
+           for (ui32 levelIndex = 0; levelIndex < Levels.size(); levelIndex++) {
+               if (!Levels[levelIndex].GetKeysCount()) {
+                   Y_ABORT_UNLESS(Levels[levelIndex].GetChildrenCount() == 1, "Should be root");
+                   Y_ABORT_UNLESS(levelIndex + 1 == Levels.size(), "Should be root");
+                   return {Levels[levelIndex].PopChild(), levelIndex, IndexSize};
+               }
+
+               DoFlush(levelIndex, pager, true);
+           }
+
+           Y_ABORT_UNLESS(false, "Should have returned root");
+       }

        void Reset() {
@@ -415,43 +457,48 @@
        }

    private:
-       bool TryFlush(ui32 levelIndex, IPageWriter &pager, bool last) {
-           if (!last && Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMax) {
-               // Note: node should meet both NodeKeysMin and NodeSize restrictions for split
-
-               if (Levels[levelIndex].GetKeysCount() <= 2 * NodeKeysMin) {
-                   // not enough keys for split
-                   return false;
-               }
-
-               // Note: this size check is approximate and we might not produce 2 full-sized pages
-               if (CalcPageSize(Levels[levelIndex]) <= 2 * NodeTargetSize) {
-                   // not enough bytes for split
-                   return false;
-               }
-           }
-
-           Writer.EnsureEmpty();
-
-           // Note: for now we build last nodes from all remaining level's keys
-           // we may to try splitting them more evenly later
-
-           while (last || Writer.GetKeysCount() < NodeKeysMin || Writer.CalcPageSize() < NodeTargetSize) {
-               if (!last && Levels[levelIndex].GetKeysCount() < 3) {
-                   // we shouldn't produce empty nodes (but can violate NodeKeysMin restriction)
-                   break;
-               }
-               if (!last && Writer.GetKeysCount() >= NodeKeysMax) {
-                   // have enough keys
-                   break;
-               }
-               if (last && !Levels[levelIndex].GetKeysCount()) {
-                   // nothing left
-                   break;
-               }
-               Writer.AddChild(Levels[levelIndex].PopChild());
-               Writer.AddKey(Levels[levelIndex].PopKey());
-           }
-
+       bool CanFlush(ui32 levelIndex) {
+           const ui64 waitFullNodes = 2;
+
+           if (Levels[levelIndex].GetKeysCount() <= waitFullNodes * NodeKeysMin) {
+               // node keys min restriction should be always satisfied
+               return false;
+           }
+
+           // Note: size checks are approximate and flush might not produce 2 full-sized pages
+
+           return
+               Levels[levelIndex].GetKeysCount() > waitFullNodes * NodeKeysMax ||
+               CalcPageSize(Levels[levelIndex]) > waitFullNodes * NodeTargetSize ||
+               levelIndex == 0 && Levels[levelIndex].GetDataSize() > waitFullNodes * LeafDataSizeMax ||
+               levelIndex == 0 && Levels[levelIndex].GetRowCount() > waitFullNodes * LeafRowsCountMax;
+       }
+
+       // Note: for now we build last nodes from all remaining level's keys
+       // we may to try splitting them more evenly later
+       void DoFlush(ui32 levelIndex, IPageWriter &pager, bool last) {
+           Writer.EnsureEmpty();
+
+           if (last) {
+               while (Levels[levelIndex].GetKeysCount()) {
+                   Writer.AddChild(Levels[levelIndex].PopChild());
+                   Writer.AddKey(Levels[levelIndex].PopKey());
+               }
+           } else {
+               auto prevDataSize = Levels[levelIndex].GetPrevDataSize();
+               auto prevRowCount = Levels[levelIndex].GetPrevRowCount();
+
+               while (Writer.GetKeysCount() < NodeKeysMin || (
+                       // can add more to writer if:
+                       Levels[levelIndex].GetKeysCount() > 2 &&
+                       Writer.GetKeysCount() < NodeKeysMax &&
+                       Writer.CalcPageSize() < NodeTargetSize &&
+                       (levelIndex != 0 || Levels[levelIndex].GetNextDataSize(prevDataSize) < LeafDataSizeMax) &&
+                       (levelIndex != 0 || Levels[levelIndex].GetNextRowCount(prevRowCount) < LeafRowsCountMax))) {
+                   Writer.AddChild(Levels[levelIndex].PopChild());
+                   Writer.AddKey(Levels[levelIndex].PopKey());
+               }
+           }
+
            auto lastChild = Levels[levelIndex].PopChild();
            Writer.AddChild(lastChild);
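CanFlush above waits until a level has buffered roughly two full nodes' worth of keys, bytes, or (on the leaf level only) covered rows and data, so DoFlush can cut one full node and still leave the level non-empty. A self-contained sketch of that predicate; the names and limits below are illustrative stand-ins, not the production defaults:

#include <cstdint>

struct LevelStats {
    uint64_t keys;      // separator keys currently buffered in the level
    uint64_t pageSize;  // projected node size in bytes
    uint64_t rows;      // rows covered and not yet flushed (relevant for the leaf level)
    uint64_t bytes;     // data bytes covered and not yet flushed (relevant for the leaf level)
};

// Mirrors the shape of TBtreeIndexBuilder::CanFlush: never flush below the minimum,
// then flush once any of the "two full nodes" triggers is exceeded.
bool CanFlush(const LevelStats& s, bool leaf,
              uint64_t keysMin, uint64_t keysMax, uint64_t targetSize,
              uint64_t leafBytesMax, uint64_t leafRowsMax) {
    const uint64_t waitFullNodes = 2;

    if (s.keys <= waitFullNodes * keysMin) {
        return false;  // the NodeKeysMin restriction must stay satisfiable after a split
    }

    return s.keys > waitFullNodes * keysMax
        || s.pageSize > waitFullNodes * targetSize
        || (leaf && s.bytes > waitFullNodes * leafBytesMax)
        || (leaf && s.rows > waitFullNodes * leafRowsMax);
}

int main() {
    // Illustrative limits: min 6 / max 64 keys, 7 KiB target node, 1 MiB or 10000 rows per leaf.
    LevelStats s{/*keys=*/20, /*pageSize=*/4096, /*rows=*/25000, /*bytes=*/512 * 1024};
    // Here the row-count trigger (25000 > 2 * 10000) fires even though the size limits are not hit.
    return CanFlush(s, /*leaf=*/true, 6, 64, 7 * 1024, 1024 * 1024, 10000) ? 0 : 1;
}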
@@ -462,6 +509,7 @@

            if (levelIndex + 1 == Levels.size()) {
                Levels.emplace_back();
+               Y_ABORT_UNLESS(Levels.size() < Max<ui32>(), "Levels size is out of bounds");
            }
            Levels[levelIndex + 1].PushChild(TChild{pageId, lastChild.RowCount, lastChild.DataSize, lastChild.ErasedRowCount});
            if (!last) {
@@ -475,8 +523,6 @@
            } else {
                Y_ABORT_UNLESS(Levels[levelIndex].GetKeysCount(), "Shouldn't leave empty levels");
            }
-
-           return true;
        }

        size_t CalcPageSize(const TLevel& level) const {
@@ -497,6 +543,8 @@
        const ui32 NodeTargetSize;
        const ui32 NodeKeysMin;
        const ui32 NodeKeysMax;
+       const ui32 LeafDataSizeMax;
+       const ui32 LeafRowsCountMax;

        TRowId ChildRowCount = 0;
        TRowId ChildErasedRowCount = 0;
8 changes: 5 additions & 3 deletions ydb/core/tablet_flat/flat_page_conf.h
@@ -63,9 +63,11 @@ namespace NPage {
        ui32 PageRows = Max<ui32>(); /* Max rows per page, for UTs */
        ui32 IndexMin = 32 * 1024; /* Index initial buffer size */

-       ui32 BTreeIndexNodeTargetSize = 7 * 1024; /* 1 GB of (up to) 140B keys leads to 3-level B-Tree index */
-       ui32 BTreeIndexNodeKeysMin = 6; /* 1 GB of 7KB keys leads to 6-level B-Tree index (node size - ~42KB) */
-       ui32 BTreeIndexNodeKeysMax = Max<ui32>(); /* for UTs */
+       ui32 BTreeIndexNodeTargetSize = 7 * 1024;     /* 1 GB of (up to) 140B keys leads to 3-level B-Tree index */
+       ui32 BTreeIndexNodeKeysMin = 6;               /* 1 GB of 7KB keys leads to 6-level B-Tree index (node size - ~42KB) */
+       ui32 BTreeIndexNodeKeysMax = Max<ui32>();     /* for UTs */
+       ui32 BTreeIndexLeafDataSizeMax = 1024 * 1024; /* gDbStatsDataSizeResolution / gDbStatsResolutionMultiplier */
+       ui32 BTreeIndexLeafRowsCountMax = 10000;      /* gDbStatsRowCountResolution / gDbStatsResolutionMultiplier */
    };

    struct TConf {
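The comments on the two new defaults tie them to the statistics resolution constants used elsewhere in the stats code. A small self-contained illustration of that relationship; the gDbStats-style inputs below are placeholder values for the example, not constants taken from this commit:

#include <cassert>
#include <cstdint>

int main() {
    // Placeholder inputs; the real gDbStats* constants live elsewhere in the tree.
    const uint64_t statsDataSizeResolution = 10ull << 20;  // assumed 10 MiB for the example
    const uint64_t statsRowCountResolution = 100000;       // assumed 100k rows for the example
    const uint64_t resolutionMultiplier = 10;              // assumed multiplier

    // The TConf defaults follow the "resolution / multiplier" shape described in the comments.
    const uint64_t leafDataSizeMax = statsDataSizeResolution / resolutionMultiplier;
    const uint64_t leafRowsCountMax = statsRowCountResolution / resolutionMultiplier;

    // With these placeholders the results land on the defaults added above: 1 MiB and 10000 rows.
    assert(leafDataSizeMax == 1024 * 1024);
    assert(leafRowsCountMax == 10000);
    return 0;
}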
10 changes: 6 additions & 4 deletions ydb/core/tablet_flat/flat_part_writer.h
@@ -532,12 +532,12 @@ namespace NTable {
            if (WriteBTreeIndex) {
                Current.BTreeGroupIndexes.reserve(Groups.size());
                for (auto& g : Groups) {
-                   Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value());
+                   Current.BTreeGroupIndexes.push_back(g.BTreeIndex.Finish(Pager));
                }
                if (Current.HistoryWritten > 0) {
                    Current.BTreeHistoricIndexes.reserve(Histories.size());
                    for (auto& g : Histories) {
-                       Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Flush(Pager, true).value());
+                       Current.BTreeHistoricIndexes.push_back(g.BTreeIndex.Finish(Pager));
                    }
                }
            }
@@ -807,7 +807,7 @@ namespace NTable {
                } else {
                    g.BTreeIndex.AddShortChild({page, dataPage->Count, raw.size()});
                }
-               g.BTreeIndex.Flush(Pager, false);
+               g.BTreeIndex.Flush(Pager);
            }

            // N.B. hack to save the last row/key for the main group
@@ -1086,7 +1086,9 @@
            , Codec(conf.Groups[groupId.Index].Codec)
            , Data(scheme, conf, tags, groupId)
            , Index(scheme, conf, groupId)
-           , BTreeIndex(scheme, groupId, conf.Groups[groupId.Index].BTreeIndexNodeTargetSize, conf.Groups[groupId.Index].BTreeIndexNodeKeysMin, conf.Groups[groupId.Index].BTreeIndexNodeKeysMax)
+           , BTreeIndex(scheme, groupId, conf.Groups[groupId.Index].BTreeIndexNodeTargetSize,
+               conf.Groups[groupId.Index].BTreeIndexNodeKeysMin, conf.Groups[groupId.Index].BTreeIndexNodeKeysMax,
+               conf.Groups[groupId.Index].BTreeIndexLeafDataSizeMax, conf.Groups[groupId.Index].BTreeIndexLeafRowsCountMax)
            { }
        };

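Together with the builder changes in flat_page_btree_index_writer.h, the call sites above show the new two-phase contract: Flush(Pager) runs after every written data page and may emit nothing, while Finish(Pager) is called exactly once per group and always yields the root TBtreeIndexMeta. A toy model of that contract, using plain integers instead of keys and pages (illustrative only, not the real builder):

#include <cstdio>
#include <numeric>
#include <vector>

struct ToyBuilder {
    std::vector<int> Pending;            // children waiting to be packed into a node
    static constexpr size_t Fanout = 3;  // pack a node once this many children are buffered

    void Add(int child) { Pending.push_back(child); }

    // Opportunistic phase: cheap to call after every "data page", may write nothing.
    void Flush(std::vector<int>& writtenNodes) {
        // Keep at least one child behind, as the real builder never leaves a level empty.
        while (Pending.size() > Fanout) {
            writtenNodes.push_back(std::accumulate(Pending.begin(), Pending.begin() + Fanout, 0));
            Pending.erase(Pending.begin(), Pending.begin() + Fanout);
        }
    }

    // Terminal phase: called exactly once, drains everything and returns the single "root".
    int Finish(std::vector<int>& writtenNodes) {
        Flush(writtenNodes);
        int root = std::accumulate(Pending.begin(), Pending.end(), 0);
        Pending.clear();
        writtenNodes.push_back(root);
        return root;                     // stands in for the returned TBtreeIndexMeta
    }
};

int main() {
    ToyBuilder builder;
    std::vector<int> nodes;
    for (int child = 1; child <= 10; ++child) {
        builder.Add(child);
        builder.Flush(nodes);            // incremental, often a no-op
    }
    int root = builder.Finish(nodes);    // exactly one terminal call
    std::printf("nodes written: %zu, root payload: %d\n", nodes.size(), root);
    return 0;
}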
2 changes: 1 addition & 1 deletion ydb/core/tablet_flat/flat_stat_part.h
@@ -48,7 +48,7 @@ class TStatsScreenedPartIterator {
        for (bool historic : {false, true}) {
            for (ui32 groupIndex : xrange(historic ? Part->HistoricGroupsCount : Part->GroupsCount)) {
                ui64 groupRowCountResolution, groupDataSizeResolution;
-               if (groupIndex == 0 && Part->GroupsCount > 1) {
+               if (groupIndex == 0 && (Part->GroupsCount > 1 || Small || Large)) {
                    // make steps as small as possible because they will affect groups resolution
                    groupRowCountResolution = groupDataSizeResolution = 0;
                } else {
11 changes: 3 additions & 8 deletions ydb/core/tablet_flat/flat_stat_table.cpp
@@ -6,19 +6,14 @@
namespace NKikimr {
namespace NTable {

-bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, IPages* env) {
+bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, ui32 resolutionMultiplier, IPages* env) {
    stats.Clear();

    TDataStats iteratorStats = { };
    TStatsIterator statsIterator(subset.Scheme->Keys);

-   TSet<TEpoch> epochs;
-   for (const auto& part : subset.Flatten) {
-       epochs.insert(part->Epoch);
-   }
-   // if rowCountResolution = 300, 3-leveled SST, let's move each iterator up to 25 rows
-   ui64 iterRowCountResolution = rowCountResolution / Max<ui64>(1, epochs.size()) / 4;
-   ui64 iterDataSizeResolution = dataSizeResolution / Max<ui64>(1, epochs.size()) / 4;
+   ui64 iterRowCountResolution = rowCountResolution / resolutionMultiplier;
+   ui64 iterDataSizeResolution = dataSizeResolution / resolutionMultiplier;

    // Make index iterators for all parts
    bool started = true;
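BuildStats no longer derives the iterator step from the number of distinct part epochs; the caller passes resolutionMultiplier explicitly. A standalone comparison of the two computations, reusing the example from the removed comment (rowCountResolution = 300, a 3-leveled SST) and a placeholder multiplier:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <set>

int main() {
    const uint64_t rowCountResolution = 300;   // example value from the removed comment

    // Old heuristic: split the budget across distinct part epochs, then take quarter steps.
    std::set<int> epochs = {1, 2, 3};          // three overlapping parts, as in the 3-leveled SST example
    uint64_t oldStep = rowCountResolution / std::max<uint64_t>(1, epochs.size()) / 4;  // 300 / 3 / 4 = 25

    // New scheme: the divisor is supplied by the caller.
    const uint64_t resolutionMultiplier = 10;  // placeholder value, not the production constant
    uint64_t newStep = rowCountResolution / resolutionMultiplier;                      // 300 / 10 = 30

    std::printf("old step: %llu rows, new step: %llu rows\n",
                (unsigned long long)oldStep, (unsigned long long)newStep);
    return 0;
}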
2 changes: 1 addition & 1 deletion ydb/core/tablet_flat/flat_stat_table.h
@@ -188,7 +188,7 @@ class TKeyAccessSample {
THashMap<TString, ui64> KeyRefCount;
};

-bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, IPages* env);
+bool BuildStats(const TSubset& subset, TStats& stats, ui64 rowCountResolution, ui64 dataSizeResolution, ui32 resolutionMultiplier, IPages* env);
void GetPartOwners(const TSubset& subset, THashSet<ui64>& partOwners);

}}