Skip to content

Commit

Permalink
Merge 6eda117 into 546c8cb
Browse files Browse the repository at this point in the history
  • Loading branch information
yumkam authored May 20, 2024
2 parents 546c8cb + 6eda117 commit 8e84798
Show file tree
Hide file tree
Showing 11 changed files with 285 additions and 257 deletions.
104 changes: 7 additions & 97 deletions ydb/library/yql/minikql/comp_nodes/packed_tuple/hashes_calc.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,106 +8,10 @@ namespace NPackedTuple {



// Calculates CRC32 of data using hardware acceleration instruction (Size <= 16)
// Calculates CRC32 of a compile-time-sized buffer (Size <= 16) using the
// CRC32 primitives exposed by TTraits::TSimdI8.
//
// data     - pointer to the first of Size bytes to hash
// initHash - CRC value to continue from (0 by default)
// Returns the CRC obtained after folding data[0 .. Size) into initHash.
//
// The buffer is consumed front to back in chunks of descending width
// (8, 4, 2, 1 bytes; two 8-byte chunks for Size == 16), so the chunk order
// and offsets for every Size are exactly those of a hand-unrolled version.
// Size == 0 reads nothing and returns initHash unchanged.
//
// NOTE(review): chunks are loaded by dereferencing a casted pointer, which
// presumably relies on the target tolerating unaligned loads - confirm, or
// switch to ReadUnaligned as the runtime-sized CalculateCRC32 overload does.
template<typename TTraits, ui32 Size> ui32 CalculateCRC32(const ui8 * data, ui32 initHash = 0) {
    static_assert(Size <= 16, "Size for template CRC32 calculation should be <= 16 !");

    // TSimdI8 is a dependent type: `typename` is mandatory before C++20.
    using TSimdI8 = typename TTraits::TSimdI8;

    ui32 hash = initHash;

    // Leading 8-byte chunk(s).
    if constexpr (Size >= 8) {
        hash = TSimdI8::CRC32u64(hash, *reinterpret_cast<const ui64*>(data));
    }
    if constexpr (Size == 16) {
        hash = TSimdI8::CRC32u64(hash, *reinterpret_cast<const ui64*>(data + 8));
    }

    // Remaining tail: each set bit of Size below 8 contributes one chunk,
    // placed right after all wider chunks that precede it.
    if constexpr ((Size & 4u) != 0) {
        hash = TSimdI8::CRC32u32(hash, *reinterpret_cast<const ui32*>(data + (Size & ~7u)));
    }
    if constexpr ((Size & 2u) != 0) {
        hash = TSimdI8::CRC32u16(hash, *reinterpret_cast<const ui16*>(data + (Size & ~3u)));
    }
    if constexpr ((Size & 1u) != 0) {
        hash = TSimdI8::CRC32u8(hash, data[Size - 1]);
    }

    return hash;
}


template <typename TTraits>
inline ui32 CalculateCRC32(const ui8 * data, ui32 size, ui32 hash = 0 ) {

using TSimdI8 = TTraits::TSimdI8;
using TSimdI8 = typename TTraits::TSimdI8;

while (size >= 8) {
hash = TSimdI8::CRC32u64(hash, ReadUnaligned<ui64>(data));
Expand Down Expand Up @@ -148,6 +52,12 @@ inline ui32 CalculateCRC32(const ui8 * data, ui32 size, ui32 hash = 0 ) {
return hash;

}
// Explicit instantiation of the runtime-sized CalculateCRC32 for the AVX2
// traits; the declaration carries the "avx2" target attribute.
template
__attribute__((target("avx2")))
ui32 CalculateCRC32<NSimd::TSimdAVX2Traits>(const ui8 * data, ui32 size, ui32 hash = 0 );
// Explicit instantiation of the same function for the SSE4.2 traits; the
// declaration carries the "sse4.2" target attribute.
template
__attribute__((target("sse4.2")))
ui32 CalculateCRC32<NSimd::TSimdSSE42Traits>(const ui8 * data, ui32 size, ui32 hash = 0 );
}

}
Expand Down
76 changes: 69 additions & 7 deletions ydb/library/yql/minikql/comp_nodes/packed_tuple/tuple.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,46 @@

#include <algorithm>

namespace {
    // Transposes an 8x8 bit-matrix packed into a ui64.
    //
    // Byte r of x is row r and bit c of that byte is column c, i.e. element
    // (r, c) lives at bit index 8*r + c. The result holds element (r, c) at
    // bit index 8*c + r.
    //
    // The transpose is done in three rounds of block swaps:
    //   1. inside every 2x2 block the two off-diagonal bits are exchanged;
    //   2. inside every 4x4 block the two off-diagonal 2x2 blocks are exchanged;
    //   3. the two off-diagonal 4x4 blocks of the whole matrix are exchanged.
    ui64 transposeBitmatrix(ui64 x) {
        // Exchanges every bit selected by lowMask with the bit `shift`
        // positions above it; all remaining bits are left untouched.
        const auto swapBitGroups = [](ui64 value, ui64 lowMask, int shift) -> ui64 {
            const ui64 diff = (value ^ (value >> shift)) & lowMask;
            return value ^ diff ^ (diff << shift);
        };

        // a b        a c
        // c d   ->   b d      (single bits, distance 7)
        x = swapBitGroups(x, 0x00AA00AA00AA00AAull, 7);

        // Same picture with a, b, c, d being 2x2 blocks (distance 14).
        x = swapBitGroups(x, 0x0000CCCC0000CCCCull, 14);

        // Same picture with a, b, c, d being 4x4 blocks (distance 28).
        x = swapBitGroups(x, 0x00000000F0F0F0F0ull, 28);

        return x;
    }
}

namespace NKikimr {
namespace NMiniKQL {
namespace NPackedTuple {
Expand Down Expand Up @@ -149,10 +189,21 @@ namespace NPackedTuple {

std::vector<ui64> bitmaskMatrix(BitmaskSize);

if (auto off = (start % 8)) {
auto bitmaskIdx = start / 8;

for (ui32 j = Columns.size(); j--; )
bitmaskMatrix[j / 8] |= ui64(isValidBitmask[Columns[j].OriginalIndex][bitmaskIdx]) << ((j % 8)*8);

for (auto &m: bitmaskMatrix) {
m = transposeBitmatrix(m);
m >>= off * 8;
}
}

for (; count--; ++start, res += TotalRowSize) {
ui32 hash = 0;
auto bitmaskIdx = start / 8;
auto bitmaskShift = start % 8;

bool anyOverflow = false;

Expand All @@ -168,12 +219,17 @@ namespace NPackedTuple {
}
}

std::memset(res + BitmaskOffset, 0, BitmaskSize);

for (ui32 i = 0; i < Columns.size(); ++i) {
auto& col = Columns[i];
if ((start % 8) == 0) {
std::fill(bitmaskMatrix.begin(), bitmaskMatrix.end(), 0);
for (ui32 j = Columns.size(); j--; )
bitmaskMatrix[j / 8] |= ui64(isValidBitmask[Columns[j].OriginalIndex][bitmaskIdx]) << ((j % 8)*8);
for (auto &m: bitmaskMatrix)
m = transposeBitmatrix(m);
}

res[BitmaskOffset + (i / 8)] |= ((isValidBitmask[col.OriginalIndex][bitmaskIdx] >> bitmaskShift) & 1u) << (i % 8);
for (ui32 j = 0; j < BitmaskSize; ++j) {
res[BitmaskOffset + j] = ui8(bitmaskMatrix[j]);
bitmaskMatrix[j] >>= 8;
}

for (auto &col: FixedNPOTColumns_) {
Expand Down Expand Up @@ -219,7 +275,7 @@ namespace NPackedTuple {
}

if (anyOverflow && col.Role == EColumnRole::Key) {
hash = CalculateCRC32<TTraits, sizeof(ui32)>((ui8 *)&size, hash);
hash = TSimdI8::CRC32u32(hash, size);
hash = CalculateCRC32<TTraits>(data, size, hash);
}
}
Expand All @@ -233,6 +289,12 @@ namespace NPackedTuple {
WriteUnaligned<ui32>(res, hash);
}
}
// Explicit instantiation of TTupleLayoutFallback::Pack for the AVX2 traits;
// the declaration carries the "avx2" target attribute.
template
__attribute__((target("avx2")))
void TTupleLayoutFallback<NSimd::TSimdAVX2Traits>::Pack( const ui8** columns, const ui8** isValidBitmask, ui8 * res, std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, ui32 count) const;
// Explicit instantiation of TTupleLayoutFallback::Pack for the SSE4.2 traits;
// the declaration carries the "sse4.2" target attribute.
template
__attribute__((target("sse4.2")))
void TTupleLayoutFallback<NSimd::TSimdSSE42Traits>::Pack( const ui8** columns, const ui8** isValidBitmask, ui8 * res, std::vector<ui8, TMKQLAllocator<ui8>> &overflow, ui32 start, ui32 count) const;
}
}
}
2 changes: 1 addition & 1 deletion ydb/library/yql/minikql/comp_nodes/packed_tuple/tuple.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ struct TTupleLayoutFallback: public TTupleLayout {
std::array<std::vector<TColumnDesc>, 5> FixedPOTColumns_; // Fixed-size columns for power-of-two sizes from 1 to 16 bytes
std::vector<TColumnDesc> FixedNPOTColumns_; // Remaining fixed-size columns
std::vector<TColumnDesc> VariableColumns_; // Variable-size columns only
using TSimdI8 = TTrait::TSimdI8;
using TSimdI8 = typename TTrait::TSimdI8;
};

}
Expand Down
1 change: 1 addition & 0 deletions ydb/library/yql/minikql/comp_nodes/ya.make
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ END()
RECURSE(
llvm14
no_llvm
packed_tuple
)

RECURSE_FOR_TESTS(
Expand Down
4 changes: 2 additions & 2 deletions ydb/library/yql/utils/simd/exec/pack_tuple/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ struct TPerfomancer {
using TSimd = typename TTraits::template TSimd8<T>;
TWorker() = default;

ui8* ShuffleMask(ui32 v[8]) {
ui8* det = new ui8[32];
TSimd<ui8> ShuffleMask(ui32 v[8]) {
ui8 det[32];
for (size_t i = 0; i < 32; i += 1) {
det[i] = v[i / 4] == ui32(-1) ? ui8(-1) : 4 * v[i / 4] + i % 4;
}
Expand Down
10 changes: 9 additions & 1 deletion ydb/library/yql/utils/simd/exec/stream_store/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,11 +114,19 @@ struct TPerfomancer {
};
};

// Explicit instantiation of TWorker::StoreStream for the AVX2 traits; the
// declaration carries the "avx2" target attribute.
template
__attribute__((target("avx2")))
int TPerfomancer::TWorker<NSimd::TSimdAVX2Traits>::StoreStream(bool);

// Explicit instantiation of TWorker::StoreStream for the SSE4.2 traits; the
// declaration carries the "sse4.2" target attribute.
template
__attribute__((target("sse4.2")))
int TPerfomancer::TWorker<NSimd::TSimdSSE42Traits>::StoreStream(bool);

// Entry point of the stream-store check: obtains a worker through
// NSimd::SelectSimdTraits, runs StoreStream(false) on it and turns the
// outcome into the process exit code (0 when the run reports success,
// 1 otherwise).
int main() {
    TPerfomancer tp;
    auto worker = NSimd::SelectSimdTraits(tp);

    bool fine = true;
    // StoreStream returns int. Compare against zero explicitly instead of
    // using `fine &= <int>`, which would keep only the lowest bit of the
    // result and treat an even non-zero value as a failure.
    // The call is placed first so it always runs even if more checks are
    // chained onto `fine` later.
    fine = (worker->StoreStream(false) != 0) && fine;
    return fine ? 0 : 1;
}
}
10 changes: 9 additions & 1 deletion ydb/library/yql/utils/simd/exec/tuples_to_bucket/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,16 @@ struct TPerfomancer {
};
};

// Explicit instantiation of TWorker::TuplesToBucket for the AVX2 traits; the
// declaration carries the "avx2" target attribute.
template
__attribute__((target("avx2")))
int TPerfomancer::TWorker<NSimd::TSimdAVX2Traits>::TuplesToBucket(bool);

// Explicit instantiation of TWorker::TuplesToBucket for the SSE4.2 traits;
// the declaration carries the "sse4.2" target attribute.
template
__attribute__((target("sse4.2")))
int TPerfomancer::TWorker<NSimd::TSimdSSE42Traits>::TuplesToBucket(bool);

int main() {
TPerfomancer tp;
auto worker = NSimd::SelectSimdTraits(tp);
return !worker->TuplesToBucket(false);
}
}
30 changes: 29 additions & 1 deletion ydb/library/yql/utils/simd/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,14 @@ auto CreateUnpackMask(ui32 dataSize, ui32 stripeSize, bool needOffset) {
return TSimdI8(indexes);
}

// Explicit instantiation of CreateUnpackMask for the AVX2 traits; the
// declaration carries the "avx2" target attribute.
template
__attribute__((target("avx2")))
auto CreateUnpackMask<NSimd::TSimdAVX2Traits>(ui32, ui32, bool);

// Explicit instantiation of CreateUnpackMask for the SSE4.2 traits; the
// declaration carries the "sse4.2" target attribute.
template
__attribute__((target("sse4.2")))
auto CreateUnpackMask<NSimd::TSimdSSE42Traits>(ui32, ui32, bool);


// Creates mask to advance register content for N bytes. When N is negative, move data to lower bytes.
template<typename TTraits> auto AdvanceBytesMask(const int N) {
Expand All @@ -89,6 +97,16 @@ template<typename TTraits> auto AdvanceBytesMask(const int N) {
}


// Explicit instantiation of AdvanceBytesMask for the AVX2 traits; the
// declaration carries the "avx2" target attribute.
template
__attribute__((target("avx2")))
auto AdvanceBytesMask<NSimd::TSimdAVX2Traits>(const int);


// Explicit instantiation of AdvanceBytesMask for the SSE4.2 traits; the
// declaration carries the "sse4.2" target attribute.
template
__attribute__((target("sse4.2")))
auto AdvanceBytesMask<NSimd::TSimdSSE42Traits>(const int);


// Prepare unpack mask to merge two columns in one register. col1Bytes, col2Bytes - size of data in columns.
template<typename TTraits>
void PrepareMergeMasks( ui32 col1Bytes, ui32 col2Bytes, typename TTraits::TSimdI8& unpackMask1, typename TTraits::TSimdI8& unpackMask2) {
Expand All @@ -97,4 +115,14 @@ void PrepareMergeMasks( ui32 col1Bytes, ui32 col2Bytes, typename TTraits::TSimdI
}


}
// Explicit instantiation of PrepareMergeMasks for the AVX2 traits; the
// declaration carries the "avx2" target attribute.
template
__attribute__((target("avx2")))
void PrepareMergeMasks<NSimd::TSimdAVX2Traits>(ui32, ui32, NSimd::TSimdAVX2Traits::TSimdI8 &, NSimd::TSimdAVX2Traits::TSimdI8 &);


// Explicit instantiation of PrepareMergeMasks for the SSE4.2 traits; the
// declaration carries the "sse4.2" target attribute.
template
__attribute__((target("sse4.2")))
void PrepareMergeMasks<NSimd::TSimdSSE42Traits>(ui32, ui32, NSimd::TSimdSSE42Traits::TSimdI8 &, NSimd::TSimdSSE42Traits::TSimdI8 &);


}
Loading

0 comments on commit 8e84798

Please sign in to comment.