Skip to content

Commit

Permalink
speed up bloom construction (#13073)
Browse files Browse the repository at this point in the history
  • Loading branch information
ivanmorozov333 authored Dec 28, 2024
1 parent 307b994 commit 7155bd1
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 21 deletions.
2 changes: 1 addition & 1 deletion ydb/core/tx/columnshard/columnshard__write.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ void TColumnShard::Handle(NPrivateEvents::NWrite::TEvWritePortionResult::TPtr& e
std::vector<TInsertedPortions> writtenPacks = ev->Get()->DetachInsertedPacks();
const TMonotonic now = TMonotonic::Now();
for (auto&& i : writtenPacks) {
AFL_WARN(NKikimrServices::TX_COLUMNSHARD_WRITE)("writing_size", i.GetDataSize())("event", "data_write_finished")(
AFL_DEBUG(NKikimrServices::TX_COLUMNSHARD_WRITE)("writing_size", i.GetDataSize())("event", "data_write_finished")(
"writing_id", i.GetWriteMeta().GetId());
Counters.OnWritePutBlobsSuccess(now - i.GetWriteMeta().GetWriteStartInstant(), i.GetRecordsCount());
Counters.GetWritesMonitor()->OnFinishWrite(i.GetDataSize(), 1);
Expand Down
42 changes: 33 additions & 9 deletions ydb/core/tx/columnshard/engines/storage/indexes/bloom/checker.h
Original file line number Diff line number Diff line change
@@ -1,27 +1,50 @@
#pragma once
#include <ydb/core/tx/columnshard/engines/scheme/indexes/abstract/simple.h>

#include <util/generic/bitmap.h>

namespace NKikimr::NOlap::NIndexes {

class TFixStringBitsStorage {
private:
YDB_READONLY_DEF(TString, Data);

// Adapter that reports how many bits a supported bit container holds.
// The primary template has no members, so GetSize() is only available
// for the container types specialized below.
template <class T>
struct TSizeDetector {};

// std::vector<bool>: the bit count is the number of elements.
template <>
struct TSizeDetector<std::vector<bool>> {
    static ui32 GetSize(const std::vector<bool>& bits) {
        return bits.size();
    }
};

// TDynBitMap: the bit count is the value reported by Size().
template <>
struct TSizeDetector<TDynBitMap> {
    static ui32 GetSize(const TDynBitMap& bits) {
        return bits.Size();
    }
};

public:
// Wraps an already serialized bit string: `data` is stored in Data as is.
TFixStringBitsStorage(const TString& data)
    : Data(data) {
}

// Rounds `bitsCount` up to the nearest multiple of 8, i.e. to a whole number of bytes
// expressed in bits. Values that are already a multiple of 8 (including 0) are returned unchanged.
static ui32 GrowBitsCountToByte(const ui32 bitsCount) {
    const ui32 tailBits = bitsCount % 8;
    if (!tailBits) {
        return bitsCount;
    }
    // Drop the partial byte and account for one full byte instead.
    return bitsCount - tailBits + 8;
}

TFixStringBitsStorage(const std::vector<bool>& bitsVector)
: TFixStringBitsStorage(bitsVector.size()) {
template <class TBitsVector>
TFixStringBitsStorage(const TBitsVector& bitsVector)
: TFixStringBitsStorage(TSizeDetector<TBitsVector>::GetSize(bitsVector)) {
ui32 byteIdx = 0;
ui8 byteCurrent = 0;
ui8 shiftCurrent = 0;
for (ui32 i = 0; i < bitsVector.size(); ++i) {
for (ui32 i = 0; i < TSizeDetector<TBitsVector>::GetSize(bitsVector); ++i) {
if (i && i % 8 == 0) {
Data[byteIdx] = (char)byteCurrent;
byteCurrent = 0;
Expand Down Expand Up @@ -70,26 +93,27 @@ class TBloomFilterChecker: public TSimpleIndexChecker {
// Returns the class name string of this checker, "BLOOM_FILTER".
// The same value is passed to the TFactory registrator declared in this class.
static TString GetClassNameStatic() {
return "BLOOM_FILTER";
}

private:
using TBase = TSimpleIndexChecker;
std::set<ui64> HashValues;
static inline auto Registrator = TFactory::TRegistrator<TBloomFilterChecker>(GetClassNameStatic());

protected:
virtual bool DoDeserializeFromProtoImpl(const NKikimrSSA::TProgram::TOlapIndexChecker& proto) override;
virtual void DoSerializeToProtoImpl(NKikimrSSA::TProgram::TOlapIndexChecker& proto) const override;

virtual bool DoCheckImpl(const std::vector<TString>& blobs) const override;

public:
TBloomFilterChecker() = default;
// Forwards `indexId` to the TSimpleIndexChecker base and takes ownership of `hashes`
// by moving the set into HashValues.
TBloomFilterChecker(const ui32 indexId, std::set<ui64>&& hashes)
    : TBase(indexId)
    , HashValues(std::move(hashes)) {
}
// Virtual accessor for the class name; returns the same value as GetClassNameStatic().
virtual TString GetClassName() const override {
return GetClassNameStatic();
}
};

} // namespace NKikimr::NOlap::NIndexes
} // namespace NKikimr::NOlap::NIndexes
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

#include <contrib/libs/apache/arrow/cpp/src/arrow/array/builder_primitive.h>
#include <library/cpp/deprecated/atomic/atomic.h>
#include <util/generic/bitmap.h>

namespace NKikimr::NOlap::NIndexes::NBloomNGramm {

Expand Down Expand Up @@ -179,31 +180,61 @@ class TNGrammBuilder {

class TVectorInserter {
private:
bool* Values;
TDynBitMap& Values;
const ui32 Size;

public:
TVectorInserter(std::vector<bool>& values)
: Values(&values[0])
, Size(values.size()) {
TVectorInserter(TDynBitMap& values)
: Values(values)
, Size(values.Size()) {
AFL_VERIFY(values.Size());
}

void operator()(const ui64 hash) {
Values[hash % Size] = true;
Values.Set(hash % Size);
}
};

// Variant of the bit inserter for bitmaps whose size is a power of two:
// the bit index is computed as `hash & (size - 1)` instead of `hash % size`.
// Only a reference to the bitmap is kept, so the bitmap must outlive the inserter.
class TVectorInserterPower2 {
private:
    TDynBitMap& Values;
    // Values.Size() - 1, captured at construction.
    const ui32 SizeMask;

public:
    TVectorInserterPower2(TDynBitMap& values)
        : Values(values)
        , SizeMask(values.Size() - 1) {
        AFL_VERIFY(values.Size());
        // `hash & SizeMask` equals `hash % size` only when the size is a power of two;
        // with any other size the mask would silently select the wrong bits.
        AFL_VERIFY((values.Size() & (values.Size() - 1)) == 0)("size", values.Size());
    }

    void operator()(const ui64 hash) {
        Values.Set(hash & SizeMask);
    }
};

// Builds the serialized bloom filter for the single column exposed by `reader`.
// Every chunk of the column is passed to TNGrammBuilder::FillNGrammHashes, which reports
// hashes to an inserter that sets the corresponding bits in a TDynBitMap; the bitmap is then
// packed into a string via TFixStringBitsStorage. The records count argument is unused.
TString TIndexMeta::DoBuildIndexImpl(TChunkedBatchReader& reader, const ui32 /*recordsCount*/) const {
    AFL_VERIFY(reader.GetColumnsCount() == 1)("count", reader.GetColumnsCount());
    TNGrammBuilder builder(HashesCount);

    TDynBitMap bitMap;
    bitMap.Reserve(FilterSizeBytes * 8);
    // Both inserters address bits relative to bitMap.Size(), so the fast path is selected from
    // that value rather than from the requested size.
    // NOTE(review): Reserve() may round the capacity up - confirm against util/generic/bitmap.h.
    const ui64 bitsCount = bitMap.Size();

    // Walks all chunks of the column and feeds their n-gram hashes into the given inserter.
    const auto doFillFilter = [&](auto& inserter) {
        for (reader.Start(); reader.IsCorrect();) {
            builder.FillNGrammHashes(NGrammSize, reader.begin()->GetCurrentChunk(), inserter);
            reader.ReadNext(reader.begin()->GetCurrentChunk()->length());
        }
    };

    if ((bitsCount & (bitsCount - 1)) == 0) {
        // Power-of-two size: the modulo can be replaced by a mask.
        TVectorInserterPower2 inserter(bitMap);
        doFillFilter(inserter);
    } else {
        TVectorInserter inserter(bitMap);
        doFillFilter(inserter);
    }
    return TFixStringBitsStorage(bitMap).GetData();
}

void TIndexMeta::DoFillIndexCheckers(
Expand Down

0 comments on commit 7155bd1

Please sign in to comment.