Skip to content

Commit

Permalink
[refine](bits) refine bytes_mask_to_bits_mask code (apache#38360)
Browse files Browse the repository at this point in the history
## Proposed changes

The previous code only considered the x86 architecture, and
_mm_movemask_epi8 does not have a corresponding instruction in ARM.
According to the article below, we need to abstract the overall logic.
For ARM, optimize using the content mentioned in the following article:
filter function origin 0.711375 seconds 0.7154 seconds 0.71782 seconds
0.715296 seconds
filter function arm opt 0.559854 seconds 0.559854 seconds 0.559854
seconds 0.559854 seconds


[link](https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e)
  • Loading branch information
Mryange authored and dataroaring committed Aug 26, 2024
1 parent 74e831d commit 1d8073b
Show file tree
Hide file tree
Showing 5 changed files with 144 additions and 78 deletions.
30 changes: 14 additions & 16 deletions be/src/olap/rowset/segment_v2/segment_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2223,23 +2223,21 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_

uint32_t sel_pos = 0;
const uint32_t sel_end = sel_pos + selected_size;
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;

while (sel_pos < sel_end_simd) {
auto mask = simd::bytes32_mask_to_bits32_mask(_ret_flags.data() + sel_pos);
auto mask = simd::bytes_mask_to_bits_mask(_ret_flags.data() + sel_pos);
if (0 == mask) {
//pass
} else if (0xffffffff == mask) {
} else if (simd::bits_mask_all() == mask) {
for (uint32_t i = 0; i < SIMD_BYTES; i++) {
sel_rowid_idx[new_size++] = sel_pos + i;
}
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
sel_rowid_idx[new_size++] = sel_pos + bit_pos;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { sel_rowid_idx[new_size++] = sel_pos + bit_pos; },
mask);
}
sel_pos += SIMD_BYTES;
}
Expand Down Expand Up @@ -2709,23 +2707,23 @@ uint16_t SegmentIterator::_evaluate_common_expr_filter(uint16_t* sel_rowid_idx,
uint16_t new_size = 0;
uint32_t sel_pos = 0;
const uint32_t sel_end = selected_size;
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;

while (sel_pos < sel_end_simd) {
auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos + sel_pos);
auto mask = simd::bytes_mask_to_bits_mask(filt_pos + sel_pos);
if (0 == mask) {
//pass
} else if (0xffffffff == mask) {
} else if (simd::bits_mask_all() == mask) {
for (uint32_t i = 0; i < SIMD_BYTES; i++) {
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + i];
}
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos];
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) {
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos];
},
mask);
}
sel_pos += SIMD_BYTES;
}
Expand Down
80 changes: 75 additions & 5 deletions be/src/util/simd/bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,58 @@
#include <cstring>
#include <vector>

#if defined(__ARM_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif

#include "util/sse_util.hpp"

namespace doris {
namespace simd {

/// todo(zeno) Compile add avx512 parameter, modify it to bytes64_mask_to_bits64_mask
/// Transform 32-byte mask to 32-bit mask
consteval auto bits_mask_length() {
#if defined(__ARM_NEON) && defined(__aarch64__)
return 16;
#else
return 32;
#endif
}

#if defined(__ARM_NEON) && defined(__aarch64__)
inline uint64_t get_nibble_mask(uint8x16_t values) {
// It produces 4-bit out of each byte, alternating between the high 4-bits and low 4-bits of the 16-byte vector.
// Given that the comparison operators give a 16-byte result of 0x00 or 0xff, the result is close to being a PMOVMSKB,
// the only difference is that every matching bit is repeated 4 times and is a 64-bit integer.
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(values), 4)), 0);
}
/*
Input 16 bytes of data and convert it into a 64-bit integer, where one bit appears 4 times.
Compare with bytes32_mask_to_bits32_mask, a u8 array with a length of 32
std::vector<uint8_t> vec = {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0};
bytes32_mask_to_bits32_mask 0100 0000 0000 0000,1101 0000 0000 0011
(1101 0000 0000 0011)
bytes16_mask_to_bits64_mask 1111 1111 0000 1111,0000 0000 0000 0000,0000 0000 0000 0000,0000 0000 1111 1111
(0100 0000 0000 0000)
0000 1111 0000 0000,0000 0000 0000 0000,0000 0000 0000 0000,0000 0000 0000 0000
*/

inline uint64_t bytes16_mask_to_bits64_mask(const uint8_t* data) {
const uint8x16_t vfilter = vld1q_u8(data);
return get_nibble_mask(vmvnq_u8(vceqzq_u8(vfilter)));
}
#endif

inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
#ifdef __AVX2__
auto zero32 = _mm256_setzero_si256();
uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)), zero32)));
#elif defined(__SSE2__) || defined(__aarch64__)
#elif defined(__SSE2__)
auto zero16 = _mm_setzero_si128();
uint32_t mask =
(static_cast<uint32_t>(_mm_movemask_epi8(_mm_cmpgt_epi8(
Expand All @@ -51,8 +90,39 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
return mask;
}

inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) {
return bytes32_mask_to_bits32_mask(reinterpret_cast<const uint8_t*>(data));
inline auto bytes_mask_to_bits_mask(const uint8_t* data) {
#if defined(__ARM_NEON) && defined(__aarch64__)
return bytes16_mask_to_bits64_mask(data);
#else
return bytes32_mask_to_bits32_mask(data);
#endif
}

inline constexpr auto bits_mask_all() {
#if defined(__ARM_NEON) && defined(__aarch64__)
return 0xffff'ffff'ffff'ffffULL;
#else
return 0xffffffff;
#endif
}

template <typename Func>
void iterate_through_bits_mask(Func func, decltype(bytes_mask_to_bits_mask(nullptr)) mask) {
#if defined(__ARM_NEON) && defined(__aarch64__)
mask &= 0x8888'8888'8888'8888ULL;
while (mask) {
const auto index = __builtin_ctzll(mask) >> 2;
func(index);
mask &= mask - 1;
}

#else
while (mask) {
const auto bit_pos = __builtin_ctzll(mask);
func(bit_pos);
mask = mask & (mask - 1);
}
#endif
}

inline size_t count_zero_num(const int8_t* __restrict data, size_t size) {
Expand Down
37 changes: 18 additions & 19 deletions be/src/vec/columns/column_decimal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -337,20 +337,18 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter& filt, ssize_t result_s
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
res_data.insert(data_pos, data_pos + SIMD_BYTES);
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
res_data.push_back(data_pos[idx]);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { res_data.push_back(data_pos[bit_pos]); }, mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -382,22 +380,23 @@ size_t ColumnDecimal<T>::filter(const IColumn::Filter& filter) {
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES);
result_data += SIMD_BYTES;
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
*result_data = data_pos[idx];
++result_data;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) {
*result_data = data_pos[idx];
++result_data;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down
38 changes: 19 additions & 19 deletions be/src/vec/columns/column_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -406,20 +406,19 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter& filt, ssize_t result_si
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
res_data.insert(data_pos, data_pos + SIMD_BYTES);
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
res_data.push_back_without_reserve(data_pos[idx]);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) { res_data.push_back_without_reserve(data_pos[idx]); },
mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -453,22 +452,23 @@ size_t ColumnVector<T>::filter(const IColumn::Filter& filter) {
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES);
result_data += SIMD_BYTES;
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
*result_data = data_pos[idx];
++result_data;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) {
*result_data = data_pos[idx];
++result_data;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down
37 changes: 18 additions & 19 deletions be/src/vec/columns/columns_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,14 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>& src_elems,
memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
};

static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_aligned) {
auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (mask == 0xffffffff) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (mask == simd::bits_mask_all()) {
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;

Expand All @@ -203,11 +204,8 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>& src_elems,
res_elems.resize(elems_size_old + chunk_size);
memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
copy_array(offsets_pos + bit_pos);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { copy_array(offsets_pos + bit_pos); }, mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -259,13 +257,14 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray<T>& elems,
result_data += arr_size;
};

static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const auto filter_end_aligned = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_aligned) {
auto mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (mask == 0xffffffff) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (mask == simd::bits_mask_all()) {
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;

Expand All @@ -281,12 +280,12 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray<T>& elems,
result_data += chunk_size;
result_size += SIMD_BYTES;
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
copy_array(offsets_pos + bit_pos);
++result_size;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) {
copy_array(offsets_pos + bit_pos);
++result_size;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down

0 comments on commit 1d8073b

Please sign in to comment.