Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[refine](bits) refine bytes_mask_to_bits_mask code (#38360) #43511

Merged
merged 1 commit into from
Nov 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 14 additions & 16 deletions be/src/olap/rowset/segment_v2/segment_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1804,23 +1804,21 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_

uint32_t sel_pos = 0;
const uint32_t sel_end = sel_pos + selected_size;
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;

while (sel_pos < sel_end_simd) {
auto mask = simd::bytes32_mask_to_bits32_mask(ret_flags + sel_pos);
auto mask = simd::bytes_mask_to_bits_mask((const uint8_t*)ret_flags + sel_pos);
if (0 == mask) {
//pass
} else if (0xffffffff == mask) {
} else if (simd::bits_mask_all() == mask) {
for (uint32_t i = 0; i < SIMD_BYTES; i++) {
sel_rowid_idx[new_size++] = sel_pos + i;
}
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
sel_rowid_idx[new_size++] = sel_pos + bit_pos;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { sel_rowid_idx[new_size++] = sel_pos + bit_pos; },
mask);
}
sel_pos += SIMD_BYTES;
}
Expand Down Expand Up @@ -2277,23 +2275,23 @@ uint16_t SegmentIterator::_evaluate_common_expr_filter(uint16_t* sel_rowid_idx,
uint16_t new_size = 0;
uint32_t sel_pos = 0;
const uint32_t sel_end = selected_size;
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;

while (sel_pos < sel_end_simd) {
auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos + sel_pos);
auto mask = simd::bytes_mask_to_bits_mask(filt_pos + sel_pos);
if (0 == mask) {
//pass
} else if (0xffffffff == mask) {
} else if (simd::bits_mask_all() == mask) {
for (uint32_t i = 0; i < SIMD_BYTES; i++) {
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + i];
}
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos];
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) {
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos];
},
mask);
}
sel_pos += SIMD_BYTES;
}
Expand Down
80 changes: 75 additions & 5 deletions be/src/util/simd/bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,58 @@
#include <cstring>
#include <vector>

#if defined(__ARM_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif

#include "util/sse_util.hpp"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

warning: 'util/sse_util.hpp' file not found [clang-diagnostic-error]

#include "util/sse_util.hpp"
         ^


namespace doris {
namespace simd {

/// todo(zeno) Compile add avx512 parameter, modify it to bytes64_mask_to_bits64_mask
/// Transform 32-byte mask to 32-bit mask
consteval auto bits_mask_length() {
#if defined(__ARM_NEON) && defined(__aarch64__)
return 16;
#else
return 32;
#endif
}

#if defined(__ARM_NEON) && defined(__aarch64__)
inline uint64_t get_nibble_mask(uint8x16_t values) {
// It produces 4-bit out of each byte, alternating between the high 4-bits and low 4-bits of the 16-byte vector.
// Given that the comparison operators give a 16-byte result of 0x00 or 0xff, the result is close to being a PMOVMSKB,
// the only difference is that every matching bit is repeated 4 times and is a 64-bit integer.
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(values), 4)), 0);
}
/*
Input 16 bytes of data and convert it into a 64-bit integer, where one bit appears 4 times.
Compare with bytes32_mask_to_bits32_mask, a u8 array with a length of 32
std::vector<uint8_t> vec = {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};

bytes32_mask_to_bits32_mask 0100 0000 0000 0000,1101 0000 0000 0011


(1101 0000 0000 0011)
bytes16_mask_to_bits64_mask 1111 1111 0000 1111,0000 0000 0000 0000,0000 0000 0000 0000,0000 0000 1111 1111
(0100 0000 0000 0000)
0000 1111 0000 0000,0000 0000 0000 0000,0000 0000 0000 0000,0000 0000 0000 0000
*/

inline uint64_t bytes16_mask_to_bits64_mask(const uint8_t* data) {
const uint8x16_t vfilter = vld1q_u8(data);
return get_nibble_mask(vmvnq_u8(vceqzq_u8(vfilter)));
}
#endif

inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
#ifdef __AVX2__
auto zero32 = _mm256_setzero_si256();
uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)), zero32)));
#elif defined(__SSE2__) || defined(__aarch64__)
#elif defined(__SSE2__)
auto zero16 = _mm_setzero_si128();
uint32_t mask =
(static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(
Expand All @@ -51,8 +90,39 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
return mask;
}

inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) {
return bytes32_mask_to_bits32_mask(reinterpret_cast<const uint8_t*>(data));
inline auto bytes_mask_to_bits_mask(const uint8_t* data) {
#if defined(__ARM_NEON) && defined(__aarch64__)
return bytes16_mask_to_bits64_mask(data);
#else
return bytes32_mask_to_bits32_mask(data);
#endif
}

inline constexpr auto bits_mask_all() {
#if defined(__ARM_NEON) && defined(__aarch64__)
return 0xffff'ffff'ffff'ffffULL;
#else
return 0xffffffff;
#endif
}

template <typename Func>
void iterate_through_bits_mask(Func func, decltype(bytes_mask_to_bits_mask(nullptr)) mask) {
#if defined(__ARM_NEON) && defined(__aarch64__)
mask &= 0x8888'8888'8888'8888ULL;
while (mask) {
const auto index = __builtin_ctzll(mask) >> 2;
func(index);
mask &= mask - 1;
}

#else
while (mask) {
const auto bit_pos = __builtin_ctzll(mask);
func(bit_pos);
mask = mask & (mask - 1);
}
#endif
}

inline size_t count_zero_num(const int8_t* __restrict data, size_t size) {
Expand Down
37 changes: 18 additions & 19 deletions be/src/vec/columns/column_decimal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,20 +329,18 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter& filt, ssize_t result_s
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
res_data.insert(data_pos, data_pos + SIMD_BYTES);
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
res_data.push_back(data_pos[idx]);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { res_data.push_back(data_pos[bit_pos]); }, mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -374,22 +372,23 @@ size_t ColumnDecimal<T>::filter(const IColumn::Filter& filter) {
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES);
result_data += SIMD_BYTES;
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
*result_data = data_pos[idx];
++result_data;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) {
*result_data = data_pos[idx];
++result_data;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down
38 changes: 19 additions & 19 deletions be/src/vec/columns/column_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -400,20 +400,19 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter& filt, ssize_t result_si
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
res_data.insert(data_pos, data_pos + SIMD_BYTES);
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
res_data.push_back_without_reserve(data_pos[idx]);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) { res_data.push_back_without_reserve(data_pos[idx]); },
mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -447,22 +446,23 @@ size_t ColumnVector<T>::filter(const IColumn::Filter& filter) {
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES);
result_data += SIMD_BYTES;
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
*result_data = data_pos[idx];
++result_data;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) {
*result_data = data_pos[idx];
++result_data;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down
37 changes: 18 additions & 19 deletions be/src/vec/columns/columns_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -194,13 +194,14 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>& src_elems,
memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
};

static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_aligned) {
auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (mask == 0xffffffff) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (mask == simd::bits_mask_all()) {
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;

Expand All @@ -215,11 +216,8 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>& src_elems,
res_elems.resize(elems_size_old + chunk_size);
memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
copy_array(offsets_pos + bit_pos);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { copy_array(offsets_pos + bit_pos); }, mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -271,13 +269,14 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray<T>& elems,
result_data += arr_size;
};

static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const auto filter_end_aligned = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_aligned) {
auto mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (mask == 0xffffffff) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (mask == simd::bits_mask_all()) {
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;

Expand All @@ -293,12 +292,12 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray<T>& elems,
result_data += chunk_size;
result_size += SIMD_BYTES;
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
copy_array(offsets_pos + bit_pos);
++result_size;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) {
copy_array(offsets_pos + bit_pos);
++result_size;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down
Loading