diff --git a/ydb/core/tablet_flat/flat_part_charge_btree_index.h b/ydb/core/tablet_flat/flat_part_charge_btree_index.h index ee091ff8e095..4207f85f69fb 100644 --- a/ydb/core/tablet_flat/flat_part_charge_btree_index.h +++ b/ydb/core/tablet_flat/flat_part_charge_btree_index.h @@ -16,20 +16,25 @@ class TChargeBTreeIndex : public ICharge { struct TChildState { TPageId PageId; - TRowId BeginRowId; - TRowId EndRowId; + TRowId BeginRowId, EndRowId; + TRowId PrevItems, Items; + ui64 PrevBytes, Bytes; - TChildState(TPageId pageId, TRowId beginRowId, TRowId endRowId) + TChildState(TPageId pageId, TRowId beginRowId, TRowId endRowId, TRowId prevItems, TRowId items, ui64 prevDataSize, ui64 dataSize) : PageId(pageId) , BeginRowId(beginRowId) , EndRowId(endRowId) + , PrevItems(prevItems) + , Items(items) + , PrevBytes(prevDataSize) + , Bytes(dataSize) { } }; struct TNodeState : TChildState, TBtreeIndexNode { - TNodeState(TSharedData data, TPageId pageId, TRowId beginRowId, TRowId endRowId) - : TChildState(pageId, beginRowId, endRowId) + TNodeState(TSharedData data, TPageId pageId, TRowId beginRowId, TRowId endRowId, TRowId prevItems, TRowId items, ui64 prevDataSize, ui64 dataSize) + : TChildState(pageId, beginRowId, endRowId, prevItems, items, prevDataSize, dataSize) , TBtreeIndexNode(data) { } @@ -63,17 +68,13 @@ class TChargeBTreeIndex : public ICharge { TResult Do(TCells key1, TCells key2, TRowId beginRowId, TRowId endRowId, const TKeyCellDefaults &keyDefaults, ui64 itemsLimit, ui64 bytesLimit) const noexcept override { endRowId++; // current interface accepts inclusive row2 bound - Y_ABORT_UNLESS(beginRowId < endRowId); - - bool ready = true, overshot = true; - bool hasValidRowsRange = Groups || IncludeHistory; // false value means that beginRowId, endRowId are invalid and shouldn't be used - ui64 chargeGroupsItemsLimit = itemsLimit; // pessimistic items limit for groups - TRowId beginBytesLimitRowId = Max(); // first unloaded probably needed row + bool ready = true, overshot = true, hasValidRowsRange = Groups || IncludeHistory; + const TRowId sliceBeginRowId = beginRowId, sliceEndRowId = endRowId; const auto& meta = Part->IndexPages.BTreeGroups[0]; + Y_ABORT_UNLESS(beginRowId < endRowId); Y_ABORT_UNLESS(endRowId <= meta.RowCount); - const TRowId sliceEndRowId = endRowId; if (Y_UNLIKELY(key1 && key2 && Compare(key1, key2, keyDefaults) > 0)) { key2 = key1; // will not go further than key1 hasValidRowsRange = false; @@ -82,71 +83,47 @@ class TChargeBTreeIndex : public ICharge { TVector level, nextLevel(::Reserve(3)); TPageId key1PageId = key1 ? meta.PageId : Max(); TPageId key2PageId = key2 ? meta.PageId : Max(); - ui64 key1Items = 0, prevKey1Items = 0; + TChildState firstChild = BuildRootChildState(meta); const auto iterateLevel = [&](const auto& tryHandleChild) { // tryHandleChild may update them, copy for simplicity - // always load beginRowId regardless of keys - const TRowId levelBeginRowId = beginRowId, levelEndRowId = Max(endRowId, beginRowId + 1); - const TChild* levelFirstChild = nullptr; - + const TRowId levelBeginRowId = beginRowId, levelEndRowId = endRowId; + for (const auto &node : level) { if (node.EndRowId <= levelBeginRowId || node.BeginRowId >= levelEndRowId) { continue; } - TRecIdx from = 0, to = node.GetChildrenCount(); - if (node.BeginRowId < levelBeginRowId) { + if (node.BeginRowId <= levelBeginRowId) { from = node.Seek(levelBeginRowId); + if (firstChild.PageId != Max()) { // still valid and should be updated + auto& child = node.GetChild(from); + auto prevChild = from ? node.GetChildRef(from - 1) : nullptr; + firstChild = BuildChildState(node, child, prevChild); + } } if (node.EndRowId > levelEndRowId) { to = node.Seek(levelEndRowId - 1) + 1; } + for (TRecIdx pos : xrange(from, to)) { - auto child = node.GetChildRef(pos); + auto child = node.GetChild(pos); auto prevChild = pos ? node.GetChildRef(pos - 1) : nullptr; - TRowId childBeginRowId = prevChild ? prevChild->RowCount : node.BeginRowId; - TRowId childEndRowId = child->RowCount; - ready &= tryHandleChild(TChildState(child->PageId, childBeginRowId, childEndRowId)); - if (itemsLimit || bytesLimit) { - if (!levelFirstChild) { - // do not apply limits on the first child because beginRowId/key1 position is uncertain - levelFirstChild = child; - } else { - if (itemsLimit) { - ui64 items = child->GetNonErasedRowCount() - levelFirstChild->GetNonErasedRowCount(); - if (LimitExceeded(items, itemsLimit)) { - overshot = false; - return; - } - } - if (bytesLimit) { - ui64 bytes = child->DataSize - levelFirstChild->DataSize; - if (LimitExceeded(bytes, bytesLimit)) { - endRowId = Min(endRowId, childEndRowId); - overshot = false; - return; - } - } - } + auto childState = BuildChildState(node, child, prevChild); + if (LimitExceeded(firstChild.Items, childState.PrevItems, itemsLimit) || LimitExceeded(firstChild.Bytes, childState.PrevBytes, bytesLimit)) { + endRowId = Min(endRowId, childState.BeginRowId); + return; } + ready &= tryHandleChild(childState); } } }; const auto skipUnloadedRows = [&](const TChildState& child) { + if (child.PageId == firstChild.PageId) { + firstChild.PageId = Max(); // mark first child unloaded + } if (child.PageId == key1PageId) { - if (hasValidRowsRange && chargeGroupsItemsLimit) { - ui64 unloadedItems = key1Items - prevKey1Items; - if (unloadedItems < chargeGroupsItemsLimit) { - chargeGroupsItemsLimit -= unloadedItems; - } else { - hasValidRowsRange = false; - } - } - if (hasValidRowsRange && bytesLimit) { - beginBytesLimitRowId = Max(beginRowId, child.BeginRowId); - } beginRowId = Max(beginRowId, child.EndRowId); } if (child.PageId == key2PageId) { @@ -154,19 +131,16 @@ class TChargeBTreeIndex : public ICharge { } }; - const auto tryHandleNode = [&](TChildState child) -> bool { - if (child.PageId == key1PageId || child.PageId == key2PageId) { + const auto tryHandleNode = [&](const TChildState& child) -> bool { + if (child.PageId == firstChild.PageId || child.PageId == key1PageId || child.PageId == key2PageId) { if (TryLoadNode(child, nextLevel)) { const auto& node = nextLevel.back(); if (child.PageId == key1PageId) { TRecIdx pos = node.Seek(ESeek::Lower, key1, Scheme.Groups[0].ColsKeyIdx, &keyDefaults); auto& key1Child = node.GetChild(pos); key1PageId = key1Child.PageId; - key1Items = key1Child.GetNonErasedRowCount(); if (pos) { - auto& prevKey1Child = node.GetChild(pos - 1); - prevKey1Items = prevKey1Child.GetNonErasedRowCount(); - beginRowId = Max(beginRowId, prevKey1Child.RowCount); // move beginRowId to the first key >= key1 + beginRowId = Max(beginRowId, node.GetChild(pos - 1).RowCount); // move beginRowId to the first key >= key1 } } if (child.PageId == key2PageId) { @@ -174,8 +148,9 @@ class TChargeBTreeIndex : public ICharge { auto& key2Child = node.GetChild(pos); key2PageId = key2Child.PageId; endRowId = Min(endRowId, key2Child.RowCount + 1); // move endRowId - 1 to the first key > key2 - if (key2Child.RowCount <= beginRowId) { + if (key2Child.RowCount <= sliceBeginRowId) { hasValidRowsRange = false; // key2 is before current slice + endRowId = Max(endRowId, sliceBeginRowId + 1); // always load sliceBeginRowId regardless of key2 } } return true; @@ -188,7 +163,7 @@ class TChargeBTreeIndex : public ICharge { } }; - const auto tryHandleDataPage = [&](TChildState child) -> bool { + const auto tryHandleDataPage = [&](const TChildState& child) -> bool { if (hasValidRowsRange && (child.PageId == key1PageId || child.PageId == key2PageId)) { const auto page = TryGetDataPage(child.PageId, { }); if (page) { @@ -211,19 +186,18 @@ class TChargeBTreeIndex : public ICharge { } }; - for (ui32 height = 0; height < meta.LevelCount && ready; height++) { + for (ui32 height = 0; height < meta.LevelCount; height++) { if (height == 0) { - ready &= tryHandleNode(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleNode(BuildRootChildState(meta)); } else { iterateLevel(tryHandleNode); } level.swap(nextLevel); nextLevel.clear(); - } - - if (!ready) { // some index pages are missing, do not continue - ready &= DoGroupsAndHistory(hasValidRowsRange, beginRowId, endRowId, beginBytesLimitRowId, chargeGroupsItemsLimit, bytesLimit); // precharge groups using the latest row bounds - return {ready, false}; + if (firstChild.PageId == Max()) { // first child is unloaded, consider all first's child rows are needed for next levels + firstChild.Items = firstChild.PrevItems; + firstChild.Bytes = firstChild.PrevBytes; + } } // flat index doesn't treat key placement within data page, so let's do the same @@ -231,12 +205,12 @@ class TChargeBTreeIndex : public ICharge { overshot &= endRowId == sliceEndRowId; if (meta.LevelCount == 0) { - ready &= tryHandleDataPage(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleDataPage(BuildRootChildState(meta)); } else { iterateLevel(tryHandleDataPage); } - ready &= DoGroupsAndHistory(hasValidRowsRange, beginRowId, endRowId, beginBytesLimitRowId, chargeGroupsItemsLimit, bytesLimit); // precharge groups using the latest row bounds + ready &= DoGroupsAndHistory(hasValidRowsRange, beginRowId, endRowId, firstChild, itemsLimit, bytesLimit); // precharge groups using the latest row bounds return {ready, overshot}; } @@ -244,17 +218,13 @@ class TChargeBTreeIndex : public ICharge { TResult DoReverse(TCells key1, TCells key2, TRowId endRowId, TRowId beginRowId, const TKeyCellDefaults &keyDefaults, ui64 itemsLimit, ui64 bytesLimit) const noexcept override { endRowId++; // current interface accepts inclusive row1 bound - Y_ABORT_UNLESS(beginRowId < endRowId); - bool ready = true, overshot = true; - bool hasValidRowsRange = Groups || IncludeHistory; // false value means that beginRowId, endRowId are invalid and shouldn't be used - ui64 chargeGroupsItemsLimit = itemsLimit; // pessimistic items limit for groups - TRowId endBytesLimitRowId = Max(); // last unloaded probably needed row - + bool ready = true, overshot = true, hasValidRowsRange = Groups || IncludeHistory; + const TRowId sliceBeginRowId = beginRowId, sliceEndRowId = endRowId; const auto& meta = Part->IndexPages.BTreeGroups[0]; + Y_ABORT_UNLESS(beginRowId < endRowId); Y_ABORT_UNLESS(endRowId <= meta.RowCount); - const TRowId sliceBeginRowId = beginRowId; if (Y_UNLIKELY(key1 && key2 && Compare(key2, key1, keyDefaults) > 0)) { key2 = key1; // will not go further than key1 hasValidRowsRange = false; @@ -264,74 +234,47 @@ class TChargeBTreeIndex : public ICharge { TVector level, nextLevel(::Reserve(3)); TPageId key1PageId = key1 ? meta.PageId : Max(); TPageId key2PageId = key2 ? meta.PageId : Max(); - ui64 prevKey1Items = 0, key1Items = 0; + TChildState lastChild = BuildRootChildState(meta); const auto iterateLevel = [&](const auto& tryHandleChild) { // tryHandleChild may update them, copy for simplicity - // always load endRowId - 1 regardless of keys - const TRowId levelBeginRowId = Min(beginRowId, endRowId - 1), levelEndRowId = endRowId; - const TChild *levelLastChild = nullptr, *levelPrevLastChild = nullptr; + const TRowId levelBeginRowId = beginRowId, levelEndRowId = endRowId; for (const auto &node : level) { if (node.EndRowId <= levelBeginRowId || node.BeginRowId >= levelEndRowId) { continue; } - - TRecIdx from = 0, to = node.GetChildrenCount(); + TRecIdx from = 0, to = node.GetKeysCount(); if (node.BeginRowId < levelBeginRowId) { from = node.Seek(levelBeginRowId); } - if (node.EndRowId > levelEndRowId) { - to = node.Seek(levelEndRowId - 1) + 1; + if (node.EndRowId >= levelEndRowId) { + to = node.Seek(levelEndRowId - 1); + if (lastChild.PageId != Max()) { // still valid and should be updated + auto& child = node.GetChild(to); + auto prevChild = to ? node.GetChildRef(to - 1) : nullptr; + lastChild = BuildChildState(node, child, prevChild); + } } - for (TRecIdx posExt = to; posExt > from; posExt--) { - auto child = node.GetChildRef(posExt - 1); + + for (TRecIdx posExt = to + 1; posExt > from; posExt--) { + auto& child = node.GetChild(posExt - 1); auto prevChild = posExt - 1 ? node.GetChildRef(posExt - 2) : nullptr; - TRowId childBeginRowId = prevChild ? prevChild->RowCount : node.BeginRowId; - TRowId childEndRowId = child->RowCount; - if (itemsLimit || bytesLimit) { - if (!levelLastChild) { - // do not apply limits on the last child because endRowId/key1 position is uncertain - levelLastChild = child; - } else { - if (!levelPrevLastChild) { - levelPrevLastChild = child; - } - if (itemsLimit) { - ui64 items = levelPrevLastChild->GetNonErasedRowCount() - child->GetNonErasedRowCount(); - if (LimitExceeded(items, itemsLimit)) { - overshot = false; - return; - } - } - if (bytesLimit) { - ui64 bytes = levelPrevLastChild->DataSize - child->DataSize; - if (LimitExceeded(bytes, bytesLimit)) { - beginRowId = Max(beginRowId, childEndRowId); - overshot = false; - return; - } - } - } + auto childState = BuildChildState(node, child, prevChild); + if (LimitExceeded(childState.Items, lastChild.PrevItems, itemsLimit) || LimitExceeded(childState.Bytes, lastChild.PrevBytes, bytesLimit)) { + beginRowId = Max(beginRowId, childState.EndRowId); + return; } - ready &= tryHandleChild(TChildState(child->PageId, childBeginRowId, childEndRowId)); + ready &= tryHandleChild(childState); } } }; const auto skipUnloadedRows = [&](const TChildState& child) { + if (child.PageId == lastChild.PageId) { + lastChild.PageId = Max(); // mark last child unloaded + } if (child.PageId == key1PageId) { - if (hasValidRowsRange && chargeGroupsItemsLimit) { - ui64 unloadedItems = key1Items - prevKey1Items; - if (unloadedItems < chargeGroupsItemsLimit) { - chargeGroupsItemsLimit -= unloadedItems; - } else { - hasValidRowsRange = false; - } - } - if (hasValidRowsRange && bytesLimit) { - endBytesLimitRowId = Min(endRowId, child.EndRowId); - } endRowId = Min(endRowId, child.BeginRowId); } if (child.PageId == key2PageId) { @@ -339,19 +282,14 @@ class TChargeBTreeIndex : public ICharge { } }; - const auto tryHandleNode = [&](TChildState child) -> bool { - if (child.PageId == key1PageId || child.PageId == key2PageId) { + const auto tryHandleNode = [&](const TChildState& child) -> bool { + if (child.PageId == lastChild.PageId || child.PageId == key1PageId || child.PageId == key2PageId) { if (TryLoadNode(child, nextLevel)) { const auto& node = nextLevel.back(); if (child.PageId == key1PageId) { TRecIdx pos = node.SeekReverse(ESeek::Lower, key1, Scheme.Groups[0].ColsKeyIdx, &keyDefaults); auto& key1Child = node.GetChild(pos); key1PageId = key1Child.PageId; - key1Items = key1Child.GetNonErasedRowCount(); - if (pos) { - auto& prevKey1Child = node.GetChild(pos - 1); - prevKey1Items = prevKey1Child.GetNonErasedRowCount(); - } endRowId = Min(endRowId, key1Child.RowCount); // move endRowId - 1 to the last key <= key1 } if (child.PageId == key2PageId) { @@ -360,8 +298,9 @@ class TChargeBTreeIndex : public ICharge { if (pos) { auto& prevKey2Child = node.GetChild(pos - 1); beginRowId = Max(beginRowId, prevKey2Child.RowCount - 1); // move beginRowId to the last key < key2 - if (prevKey2Child.RowCount >= endRowId) { + if (prevKey2Child.RowCount >= sliceEndRowId) { hasValidRowsRange = false; // key2 is after current slice + beginRowId = Min(beginRowId, sliceEndRowId - 1); // always load endRowId - 1 regardless of keys } } } @@ -375,7 +314,7 @@ class TChargeBTreeIndex : public ICharge { } }; - const auto tryHandleDataPage = [&](TChildState child) -> bool { + const auto tryHandleDataPage = [&](const TChildState& child) -> bool { if (hasValidRowsRange && (child.PageId == key1PageId || child.PageId == key2PageId)) { const auto page = TryGetDataPage(child.PageId, { }); if (page) { @@ -408,19 +347,18 @@ class TChargeBTreeIndex : public ICharge { } }; - for (ui32 height = 0; height < meta.LevelCount && ready; height++) { + for (ui32 height = 0; height < meta.LevelCount; height++) { if (height == 0) { - ready &= tryHandleNode(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleNode(BuildRootChildState(meta)); } else { iterateLevel(tryHandleNode); } level.swap(nextLevel); nextLevel.clear(); - } - - if (!ready) { // some index pages are missing, do not continue - ready &= DoGroupsAndHistoryReverse(hasValidRowsRange, beginRowId, endRowId, endBytesLimitRowId, chargeGroupsItemsLimit, bytesLimit); // precharge groups using the latest row bounds - return {ready, false}; + if (lastChild.PageId == Max()) { // last child is unloaded, consider all last's child rows are needed for next levels + lastChild.PrevItems = lastChild.Items; + lastChild.PrevBytes = lastChild.Bytes; + } } // flat index doesn't treat key placement within data page, so let's do the same @@ -428,115 +366,122 @@ class TChargeBTreeIndex : public ICharge { overshot &= beginRowId == sliceBeginRowId; if (meta.LevelCount == 0) { - ready &= tryHandleDataPage(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleDataPage(BuildRootChildState(meta)); } else { iterateLevel(tryHandleDataPage); } - ready &= DoGroupsAndHistoryReverse(hasValidRowsRange, beginRowId, endRowId, endBytesLimitRowId, chargeGroupsItemsLimit, bytesLimit); // precharge groups using the latest row bounds + ready &= DoGroupsAndHistoryReverse(hasValidRowsRange, beginRowId, endRowId, lastChild, itemsLimit, bytesLimit); // precharge groups using the latest row bounds return {ready, overshot}; } private: - bool DoGroupsAndHistory(bool hasValidRowsRange, TRowId beginRowId, TRowId endRowId, TRowId beginBytesLimitRowId, ui64 itemsLimit, ui64 bytesLimit) const noexcept { + bool DoGroupsAndHistory(bool hasValidRowsRange, TRowId beginRowId, TRowId endRowId, const TChildState& firstChild, ui64 itemsLimit, ui64 bytesLimit) const noexcept { bool ready = true; - if (hasValidRowsRange && beginRowId < endRowId) { - if (itemsLimit && endRowId - beginRowId - 1 >= itemsLimit) { - endRowId = beginRowId + itemsLimit + 1; - } - - if (beginBytesLimitRowId == Max()) { - beginBytesLimitRowId = beginRowId; + if (!hasValidRowsRange) { + return ready; + } + if (beginRowId >= endRowId) { + return ready; + } - if (IncludeHistory) { - ready &= DoHistory(beginRowId, endRowId); - } // otherwise beginBytesLimitRowId is specified, so we do not know where to start charging history + if (itemsLimit) { + // TODO: items limit should be applied on items not rows, but it requires iteration via first and last data pages + TRowId limitFromRowId = firstChild.PageId == Max() ? firstChild.BeginRowId : beginRowId; + if (endRowId - limitFromRowId - 1 > itemsLimit) { + endRowId = limitFromRowId + itemsLimit + 1; } - - for (auto groupIndex : Groups) { - ready &= DoGroup(TGroupId(groupIndex), beginRowId, endRowId, beginBytesLimitRowId, bytesLimit); + if (beginRowId >= endRowId) { + return ready; } } + if (IncludeHistory && (!bytesLimit || firstChild.PageId != Max())) { + ready &= DoHistory(beginRowId, endRowId); + } + + for (auto groupIndex : Groups) { + ready &= DoGroup(TGroupId(groupIndex), beginRowId, endRowId, firstChild.BeginRowId, bytesLimit); + } + return ready; } - bool DoGroupsAndHistoryReverse(bool hasValidRowsRange, TRowId beginRowId, TRowId endRowId, TRowId endBytesLimitRowId, ui64 itemsLimit, ui64 bytesLimit) const noexcept { + bool DoGroupsAndHistoryReverse(bool hasValidRowsRange, TRowId beginRowId, TRowId endRowId, const TChildState& lastChild, ui64 itemsLimit, ui64 bytesLimit) const noexcept { bool ready = true; - if (hasValidRowsRange && beginRowId < endRowId) { - if (itemsLimit && endRowId - beginRowId - 1 >= itemsLimit) { - beginRowId = endRowId - itemsLimit - 1; + if (!hasValidRowsRange) { + return ready; + } + if (beginRowId >= endRowId) { + return ready; + } + + if (itemsLimit) { + // TODO: items limit should be applied on items not rows, but it requires iteration via first and last data pages + TRowId limitToRowId = lastChild.PageId == Max() ? lastChild.EndRowId : endRowId; + if (limitToRowId - beginRowId - 1 >= itemsLimit) { + beginRowId = limitToRowId - itemsLimit - 1; + } + if (beginRowId >= endRowId) { + return ready; } - if (endBytesLimitRowId == Max()) { - endBytesLimitRowId = endRowId; + } - - if (IncludeHistory) { - ready &= DoHistory(beginRowId, endRowId); - } - } // otherwise endBytesLimitRowId is specified, so we do not know where to start charging history + if (IncludeHistory && (!bytesLimit || lastChild.PageId != Max())) { + ready &= DoHistory(beginRowId, endRowId); + } - for (auto groupIndex : Groups) { - ready &= DoGroupReverse(TGroupId(groupIndex), beginRowId, endRowId, endBytesLimitRowId, bytesLimit); - } + for (auto groupIndex : Groups) { + ready &= DoGroupReverse(TGroupId(groupIndex), beginRowId, endRowId, lastChild.EndRowId, bytesLimit); } return ready; } private: - bool DoGroup(TGroupId groupId, TRowId beginRowId, TRowId endRowId, TRowId beginBytesLimitRowId, ui64 bytesLimit) const noexcept { + bool DoGroup(TGroupId groupId, TRowId beginRowId, TRowId endRowId, TRowId firstChildBeginRowId, ui64 bytesLimit) const noexcept { bool ready = true; - const auto& meta = groupId.IsHistoric() ? Part->IndexPages.BTreeHistoric[groupId.Index] : Part->IndexPages.BTreeGroups[groupId.Index]; TVector level, nextLevel(::Reserve(3)); - ui64 prevBeginDataSize = 0; - ui64 prevBeginBytesLimitDataSize = bytesLimit ? GetPrevDataSize(meta, beginBytesLimitRowId) : 0; + ui64 firstChildPrevBytes = bytesLimit ? GetPrevBytes(meta, firstChildBeginRowId) : 0; const auto iterateLevel = [&](const auto& tryHandleChild) { - ui64 prevChildDataSize = prevBeginDataSize; for (const auto &node : level) { TRecIdx from = 0, to = node.GetChildrenCount(); if (node.BeginRowId < beginRowId) { from = node.Seek(beginRowId); - if (from) { - prevChildDataSize = prevBeginDataSize = node.GetShortChild(from - 1).DataSize; - } } if (node.EndRowId > endRowId) { to = node.Seek(endRowId - 1) + 1; } + for (TRecIdx pos : xrange(from, to)) { - auto child = node.GetShortChildRef(pos); + auto& child = node.GetShortChild(pos); auto prevChild = pos ? node.GetShortChildRef(pos - 1) : nullptr; - TRowId childBeginRowId = prevChild ? prevChild->RowCount : node.BeginRowId; - TRowId childEndRowId = child->RowCount; - if (bytesLimit) { - if (prevChildDataSize > prevBeginBytesLimitDataSize && LimitExceeded(prevChildDataSize - prevBeginBytesLimitDataSize, bytesLimit)) { - return; - } + auto childState = BuildChildState(node, child, prevChild); + if (LimitExceeded(firstChildPrevBytes, childState.PrevBytes, bytesLimit)) { + return; } - ready &= tryHandleChild(TChildState(child->PageId, childBeginRowId, childEndRowId)); - prevChildDataSize = child->DataSize; + ready &= tryHandleChild(childState); } } }; - const auto tryHandleNode = [&](TChildState child) -> bool { + const auto tryHandleNode = [&](const TChildState& child) -> bool { return TryLoadNode(child, nextLevel); }; - const auto tryHandleDataPage = [&](TChildState child) -> bool { + const auto tryHandleDataPage = [&](const TChildState& child) -> bool { return HasDataPage(child.PageId, groupId); }; - for (ui32 height = 0; height < meta.LevelCount && ready; height++) { + for (ui32 height = 0; height < meta.LevelCount; height++) { if (height == 0) { - ready &= tryHandleNode(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleNode(BuildRootChildState(meta)); } else { iterateLevel(tryHandleNode); } @@ -544,12 +489,8 @@ class TChargeBTreeIndex : public ICharge { nextLevel.clear(); } - if (!ready) { // some index pages are missing, do not continue - return ready; - } - if (meta.LevelCount == 0) { - ready &= tryHandleDataPage(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleDataPage(BuildRootChildState(meta)); } else { iterateLevel(tryHandleDataPage); } @@ -557,50 +498,46 @@ class TChargeBTreeIndex : public ICharge { return ready; } - bool DoGroupReverse(TGroupId groupId, TRowId beginRowId, TRowId endRowId, TRowId endBytesLimitRowId, ui64 bytesLimit) const noexcept { + bool DoGroupReverse(TGroupId groupId, TRowId beginRowId, TRowId endRowId, TRowId lastChildEndRowId, ui64 bytesLimit) const noexcept { bool ready = true; - const auto& meta = groupId.IsHistoric() ? Part->IndexPages.BTreeHistoric[groupId.Index] : Part->IndexPages.BTreeGroups[groupId.Index]; // level's nodes is in reverse order TVector level, nextLevel(::Reserve(3)); - ui64 endBytesLimitDataSize = bytesLimit ? GetDataSize(meta, endBytesLimitRowId - 1) : 0; + ui64 lastChildBytes = bytesLimit ? GetBytes(meta, lastChildEndRowId - 1) : 0; const auto iterateLevel = [&](const auto& tryHandleChild) { for (const auto &node : level) { - TRecIdx from = 0, to = node.GetChildrenCount(); + TRecIdx from = 0, to = node.GetKeysCount(); if (node.BeginRowId < beginRowId) { from = node.Seek(beginRowId); } if (node.EndRowId > endRowId) { - to = node.Seek(endRowId - 1) + 1; + to = node.Seek(endRowId - 1); } - for (TRecIdx posExt = to; posExt > from; posExt--) { - auto child = node.GetShortChildRef(posExt - 1); + for (TRecIdx posExt = to + 1; posExt > from; posExt--) { + auto& child = node.GetShortChild(posExt - 1); auto prevChild = posExt - 1 ? node.GetShortChildRef(posExt - 2) : nullptr; - TRowId childBeginRowId = prevChild ? prevChild->RowCount : node.BeginRowId; - TRowId childEndRowId = child->RowCount; - if (bytesLimit) { - if (endBytesLimitDataSize > child->DataSize && LimitExceeded(endBytesLimitDataSize - child->DataSize, bytesLimit)) { - return; - } + auto childState = BuildChildState(node, child, prevChild); + if (LimitExceeded(childState.Bytes, lastChildBytes, bytesLimit)) { + return; } - ready &= tryHandleChild(TChildState(child->PageId, childBeginRowId, childEndRowId)); + ready &= tryHandleChild(childState); } } }; - const auto tryHandleNode = [&](TChildState child) -> bool { + const auto tryHandleNode = [&](const TChildState& child) -> bool { return TryLoadNode(child, nextLevel); }; - const auto tryHandleDataPage = [&](TChildState child) -> bool { + const auto tryHandleDataPage = [&](const TChildState& child) -> bool { return HasDataPage(child.PageId, groupId); }; - for (ui32 height = 0; height < meta.LevelCount && ready; height++) { + for (ui32 height = 0; height < meta.LevelCount; height++) { if (height == 0) { - ready &= tryHandleNode(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleNode(BuildRootChildState(meta)); } else { iterateLevel(tryHandleNode); } @@ -608,12 +545,8 @@ class TChargeBTreeIndex : public ICharge { nextLevel.clear(); } - if (!ready) { // some index pages are missing, do not continue - return ready; - } - if (meta.LevelCount == 0) { - ready &= tryHandleDataPage(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleDataPage(BuildRootChildState(meta)); } else { iterateLevel(tryHandleDataPage); } @@ -645,11 +578,9 @@ class TChargeBTreeIndex : public ICharge { }; TCells key2{ key2Cells, 3 }; - // Directly use the history group scheme + // Directly use the history group scheme and key defaults with correct sort order const auto& scheme = Part->Scheme->HistoryGroup; Y_DEBUG_ABORT_UNLESS(scheme.ColsKeyIdx.size() == 3); - - // Directly use the history key defaults with correct sort order const TKeyCellDefaults* keyDefaults = Part->Scheme->HistoryKeys.Get(); const TGroupId groupId(0, true); @@ -670,9 +601,9 @@ class TChargeBTreeIndex : public ICharge { } for (TRecIdx pos : xrange(from, to)) { auto& child = node.GetShortChild(pos); - TRowId childBeginRowId = pos ? node.GetShortChild(pos - 1).RowCount : node.BeginRowId; - TRowId childEndRowId = child.RowCount; - ready &= tryHandleChild(TChildState(child.PageId, childBeginRowId, childEndRowId)); + auto prevChild = pos ? node.GetShortChildRef(pos - 1) : nullptr; + auto childState = BuildChildState(node, child, prevChild); + ready &= tryHandleChild(childState); } } }; @@ -686,7 +617,7 @@ class TChargeBTreeIndex : public ICharge { } }; - const auto tryHandleNode = [&](TChildState child) -> bool { + const auto tryHandleNode = [&](const TChildState& child) -> bool { if (child.PageId == key1PageId || child.PageId == key2PageId) { if (TryLoadNode(child, nextLevel)) { const auto& node = nextLevel.back(); @@ -714,7 +645,7 @@ class TChargeBTreeIndex : public ICharge { } }; - const auto tryHandleDataPage = [&](TChildState child) -> bool { + const auto tryHandleDataPage = [&](const TChildState& child) -> bool { if (Groups && (child.PageId == key1PageId || child.PageId == key2PageId)) { const auto page = TryGetDataPage(child.PageId, groupId); if (page) { @@ -739,7 +670,7 @@ class TChargeBTreeIndex : public ICharge { for (ui32 height = 0; height < meta.LevelCount; height++) { if (height == 0) { - ready &= tryHandleNode(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleNode(BuildRootChildState(meta)); } else { iterateLevel(tryHandleNode); } @@ -748,7 +679,7 @@ class TChargeBTreeIndex : public ICharge { } if (meta.LevelCount == 0) { - ready &= tryHandleDataPage(TChildState(meta.PageId, 0, meta.RowCount)); + ready &= tryHandleDataPage(BuildRootChildState(meta)); } else { iterateLevel(tryHandleDataPage); } @@ -771,7 +702,7 @@ class TChargeBTreeIndex : public ICharge { } private: - ui64 GetPrevDataSize(const TBtreeIndexMeta& meta, TRowId rowId) const { + ui64 GetPrevBytes(const TBtreeIndexMeta& meta, TRowId rowId) const { TPageId pageId = meta.PageId; ui64 result = 0; @@ -791,7 +722,7 @@ class TChargeBTreeIndex : public ICharge { return result; } - ui64 GetDataSize(TBtreeIndexMeta meta, TRowId rowId) const { + ui64 GetBytes(TBtreeIndexMeta meta, TRowId rowId) const { TPageId pageId = meta.PageId; ui64 result = meta.DataSize; @@ -845,8 +776,29 @@ class TChargeBTreeIndex : public ICharge { : (left.size() > right.size() ? -1 : 1); } - bool LimitExceeded(ui64 value, ui64 limit) const noexcept { - return limit && value > limit; + TChildState BuildRootChildState(const TBtreeIndexMeta& meta) const noexcept { + return TChildState(meta.PageId, + 0, meta.RowCount, + 0, meta.GetNonErasedRowCount(), + 0, meta.DataSize); + } + + TChildState BuildChildState(const TNodeState& parent, TChild child, const TChild* prevChild) const noexcept { + return TChildState(child.PageId, + prevChild ? prevChild->RowCount : parent.BeginRowId, child.RowCount, + prevChild ? prevChild->GetNonErasedRowCount() : parent.PrevItems, child.GetNonErasedRowCount(), + prevChild ? prevChild->DataSize : parent.PrevBytes, child.DataSize); + } + + TChildState BuildChildState(const TNodeState& parent, TShortChild child, const TShortChild* prevChild) const noexcept { + return TChildState(child.PageId, + prevChild ? prevChild->RowCount : parent.BeginRowId, child.RowCount, + prevChild ? prevChild->RowCount : parent.BeginRowId, child.RowCount, + prevChild ? prevChild->DataSize : parent.PrevBytes, child.DataSize); + } + + bool LimitExceeded(ui64 prev, ui64 current, ui64 limit) const noexcept { + return limit && current > prev && current - prev > limit; } private: diff --git a/ydb/core/tablet_flat/ut/ut_btree_index_iter_charge.cpp b/ydb/core/tablet_flat/ut/ut_btree_index_iter_charge.cpp index 68b2433f4682..4bb71c2a9383 100644 --- a/ydb/core/tablet_flat/ut/ut_btree_index_iter_charge.cpp +++ b/ydb/core/tablet_flat/ut/ut_btree_index_iter_charge.cpp @@ -18,6 +18,10 @@ namespace { struct TTouchEnv : public NTest::TTestEnv { const TSharedData* TryGetPage(const TPart *part, TPageId pageId, TGroupId groupId) override { + if (Sticky[groupId].contains(pageId)) { + Loaded[groupId].insert(pageId); + } + Touched[groupId].insert(pageId); if (Loaded[groupId].contains(pageId)) { return NTest::TTestEnv::TryGetPage(part, pageId, groupId); @@ -32,12 +36,22 @@ namespace { Touched.clear(); } + void StickLoaded() { + for (const auto &g : Loaded) { + Sticky[g.first].insert(g.second.begin(), g.second.end()); + } + Touched.clear(); + Loaded.clear(); + } + TMap> Loaded; TMap> Touched; + TMap> Sticky; }; void AssertLoadedTheSame(const TPartStore& part, const TTouchEnv& bTree, const TTouchEnv& flat, const TString& message, bool allowAdditionalFirstLastPartPages = false, bool allowAdditionalFirstLoadedPage = false, bool allowLastLoadedPageDifference = false) { + TSet groupIds; for (const auto &c : {bTree.Loaded, flat.Loaded}) { for (const auto &g : c) { @@ -79,15 +93,16 @@ namespace { } } - struct TMakePartParams { + struct TTestParams { const ui32 Levels = Max(); const bool Groups = false; const bool History = false; const bool Slices = false; const ui32 Rows = 40; + const bool StickSomePages = false; }; - TPartEggs MakePart(TMakePartParams params) { + TPartEggs MakePart(TTestParams params) { NPage::TConf conf; switch (params.Levels) { case 0: @@ -315,7 +330,7 @@ Y_UNIT_TEST_SUITE(TPartBtreeIndexIt) { } void CheckSeekKey(const TPartStore& part, const TKeyCellDefaults *keyDefaults) { - for (bool reverse : {false}) { + for (bool reverse : {false, true}) { for (ESeek seek : {ESeek::Exact, ESeek::Lower, ESeek::Upper}) { for (ui32 firstCell : xrange(0, part.Stat.Rows / 7 + 1)) { for (ui32 secondCell : xrange(0, 14)) { @@ -370,7 +385,7 @@ Y_UNIT_TEST_SUITE(TPartBtreeIndexIt) { } } - void CheckPart(TMakePartParams params) { + void CheckPart(TTestParams params) { TPartEggs eggs = MakePart(params); const auto part = *eggs.Lone(); @@ -394,8 +409,23 @@ Y_UNIT_TEST_SUITE(TPartBtreeIndexIt) { } Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { + void StickSomePages(TTestParams params, const TPartStore& part, TTagsRef tags, const TKeyCellDefaults &keyDefaults, TTouchEnv& bTreeEnv, TTouchEnv& flatEnv) { + if (params.StickSomePages) { + TChargeBTreeIndex bTree(&bTreeEnv, part, tags, true); + + for (int times = 0; times < 5; times++) { + bTree.ICharge::Do(10, 10, keyDefaults, 0, 0); + bTreeEnv.LoadTouched(); + } + + flatEnv.Loaded = bTreeEnv.Loaded; + flatEnv.StickLoaded(); + bTreeEnv.StickLoaded(); + } + } + void DoChargeRowId(ICharge& charge, TTouchEnv& env, const TRowId row1, const TRowId row2, ui64 itemsLimit, ui64 bytesLimit, - bool reverse, const TKeyCellDefaults &keyDefaults, const TString& message, ui32 failsAllowed = 10) { + bool reverse, const TKeyCellDefaults &keyDefaults, const TString& message, ui32 failsAllowed = 15) { while (true) { bool ready = reverse ? charge.DoReverse(row2, row1, keyDefaults, itemsLimit, bytesLimit) @@ -424,7 +454,7 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { Y_UNREACHABLE(); } - void CheckChargeRowId(const TPartStore& part, TTagsRef tags, const TKeyCellDefaults *keyDefaults) { + void CheckChargeRowId(TTestParams params, const TPartStore& part, TTagsRef tags, const TKeyCellDefaults *keyDefaults) { for (bool reverse : {false, true}) { for (ui64 itemsLimit : TVector{0, 1, 2, 5, 13, 19, part.Stat.Rows - 2, part.Stat.Rows - 1}) { for (TRowId rowId1 : xrange(0, part.Stat.Rows - 1)) { @@ -432,6 +462,7 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { TTouchEnv bTreeEnv, flatEnv; TChargeBTreeIndex bTree(&bTreeEnv, part, tags, true); TCharge flat(&flatEnv, part, tags, true); + StickSomePages(params, part, tags, *keyDefaults, bTreeEnv, flatEnv); TString message = TStringBuilder() << (reverse ? "ChargeRowIdReverse " : "ChargeRowId ") << rowId1 << " " << rowId2 << " items " << itemsLimit; DoChargeRowId(bTree, bTreeEnv, rowId1, rowId2, itemsLimit, 0, reverse, *keyDefaults, message); @@ -444,7 +475,7 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { } } - void CheckChargeKeys(const TPartStore& part, TTagsRef tags, const TKeyCellDefaults *keyDefaults) { + void CheckChargeKeys(TTestParams params, const TPartStore& part, TTagsRef tags, const TKeyCellDefaults *keyDefaults) { for (bool reverse : {false, true}) { for (ui64 itemsLimit : TVector{0, 1, 2, 5, 13, 19, part.Stat.Rows - 2, part.Stat.Rows - 1}) { for (ui32 firstCellKey1 : xrange(0, part.Stat.Rows / 7 + 1)) { @@ -457,6 +488,7 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { TTouchEnv bTreeEnv, flatEnv; TChargeBTreeIndex bTree(&bTreeEnv, part, tags, true); TCharge flat(&flatEnv, part, tags, true); + StickSomePages(params, part, tags, *keyDefaults, bTreeEnv, flatEnv); TStringBuilder message = TStringBuilder() << (reverse ? "ChargeKeysReverse " : "ChargeKeys ") << "("; for (auto c : key1) { @@ -487,7 +519,7 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { } } - void CheckChargeBytesLimit(const TPartStore& part, TTagsRef tags, const TKeyCellDefaults *keyDefaults) { + void CheckChargeBytesLimit(TTestParams params, const TPartStore& part, TTagsRef tags, const TKeyCellDefaults *keyDefaults) { for (bool reverse : {false, true}) { for (ui64 bytesLimit : xrange(1, part.Stat.Bytes + 100, part.Stat.Bytes / 100)) { for (ui32 firstCellKey1 : xrange(0, part.Stat.Rows / 7 + 1)) { @@ -497,6 +529,7 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { TTouchEnv limitedEnv, unlimitedEnv; TChargeBTreeIndex limitedCharge(&limitedEnv, part, tags, true); TChargeBTreeIndex unlimitedCharge(&unlimitedEnv, part, tags, true); + StickSomePages(params, part, tags, *keyDefaults, limitedEnv, unlimitedEnv); TStringBuilder message = TStringBuilder() << (reverse ? "ChargeBytesLimitReverse " : "ChargeBytesLimit ") << "("; for (auto c : key1) { @@ -533,7 +566,12 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { } if (!groupId.IsMain() && !loaded.contains(pageId)) { // only check that we loaded consecutive pages - break; + if (params.StickSomePages) { + // extra pages may appear after the bytes limit is applied on main pages + continue; + } else { + break; + } } expected.insert(pageId); if (size > bytesLimit) { @@ -551,7 +589,7 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { } } - void CheckPart(TMakePartParams params) { + void CheckPart(TTestParams params) { TPartEggs eggs = MakePart(params); const auto part = *eggs.Lone(); @@ -560,9 +598,9 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { tags.push_back(c.Tag); } - CheckChargeRowId(part, tags, eggs.Scheme->Keys.Get()); - CheckChargeKeys(part, tags, eggs.Scheme->Keys.Get()); - CheckChargeBytesLimit(part, tags, eggs.Scheme->Keys.Get()); + CheckChargeRowId(params, part, tags, eggs.Scheme->Keys.Get()); + CheckChargeKeys(params, part, tags, eggs.Scheme->Keys.Get()); + CheckChargeBytesLimit(params, part, tags, eggs.Scheme->Keys.Get()); } Y_UNIT_TEST(NoNodes) { @@ -609,9 +647,17 @@ Y_UNIT_TEST_SUITE(TChargeBTreeIndex) { CheckPart({.Levels = 3, .History = true}); } + Y_UNIT_TEST(FewNodes_Sticky) { + CheckPart({.Levels = 3, .StickSomePages = true}); + } + Y_UNIT_TEST(FewNodes_Groups_History) { CheckPart({.Levels = 3, .Groups = true, .History = true}); } + + Y_UNIT_TEST(FewNodes_Groups_History_Sticky) { + CheckPart({.Levels = 3, .Groups = true, .History = true, .StickSomePages = true}); + } } Y_UNIT_TEST_SUITE(TPartBtreeIndexIteration) { @@ -756,7 +802,7 @@ Y_UNIT_TEST_SUITE(TPartBtreeIndexIteration) { } } - void CheckPart(TMakePartParams params) { + void CheckPart(TTestParams params) { TPartEggs eggs = MakePart(params); const auto part = *eggs.Lone(); @@ -816,6 +862,10 @@ Y_UNIT_TEST_SUITE(TPartBtreeIndexIteration) { CheckPart({.Levels = 3, .History = true}); } + Y_UNIT_TEST(FewNodes_Sticky) { + CheckPart({.Levels = 3, .StickSomePages = true}); + } + Y_UNIT_TEST(FewNodes_Slices) { CheckPart({.Levels = 3, .Slices = true}); } @@ -831,6 +881,10 @@ Y_UNIT_TEST_SUITE(TPartBtreeIndexIteration) { Y_UNIT_TEST(FewNodes_Groups_History_Slices) { CheckPart({.Levels = 3, .Groups = true, .History = true, .Slices = true}); } + + Y_UNIT_TEST(FewNodes_Groups_History_Slices_Sticky) { + CheckPart({.Levels = 3, .Groups = true, .History = true, .Slices = true, .StickSomePages = true}); + } } }