diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 79ab794a9a0853..1b60440d6855da 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -2223,23 +2223,21 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_ uint32_t sel_pos = 0; const uint32_t sel_end = sel_pos + selected_size; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES; while (sel_pos < sel_end_simd) { - auto mask = simd::bytes32_mask_to_bits32_mask(_ret_flags.data() + sel_pos); + auto mask = simd::bytes_mask_to_bits_mask(_ret_flags.data() + sel_pos); if (0 == mask) { //pass - } else if (0xffffffff == mask) { + } else if (simd::bits_mask_all() == mask) { for (uint32_t i = 0; i < SIMD_BYTES; i++) { sel_rowid_idx[new_size++] = sel_pos + i; } } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - sel_rowid_idx[new_size++] = sel_pos + bit_pos; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { sel_rowid_idx[new_size++] = sel_pos + bit_pos; }, + mask); } sel_pos += SIMD_BYTES; } @@ -2709,23 +2707,23 @@ uint16_t SegmentIterator::_evaluate_common_expr_filter(uint16_t* sel_rowid_idx, uint16_t new_size = 0; uint32_t sel_pos = 0; const uint32_t sel_end = selected_size; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES; while (sel_pos < sel_end_simd) { - auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos + sel_pos); + auto mask = simd::bytes_mask_to_bits_mask(filt_pos + sel_pos); if (0 == mask) { //pass - } else if (0xffffffff == mask) { + } else if (simd::bits_mask_all() == mask) { for (uint32_t i = 0; i < SIMD_BYTES; i++) { sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + i]; } } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos]; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { + sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos]; + }, + mask); } sel_pos += SIMD_BYTES; } diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h index a36a95b6eef5db..7e2e7c8202569d 100644 --- a/be/src/util/simd/bits.h +++ b/be/src/util/simd/bits.h @@ -21,19 +21,58 @@ #include #include +#if defined(__ARM_NEON) && defined(__aarch64__) +#include +#endif + #include "util/sse_util.hpp" namespace doris { namespace simd { -/// todo(zeno) Compile add avx512 parameter, modify it to bytes64_mask_to_bits64_mask -/// Transform 32-byte mask to 32-bit mask +consteval auto bits_mask_length() { +#if defined(__ARM_NEON) && defined(__aarch64__) + return 16; +#else + return 32; +#endif +} + +#if defined(__ARM_NEON) && defined(__aarch64__) +inline uint64_t get_nibble_mask(uint8x16_t values) { + // It produces 4-bit out of each byte, alternating between the high 4-bits and low 4-bits of the 16-byte vector. + // Given that the comparison operators give a 16-byte result of 0x00 or 0xff, the result is close to being a PMOVMSKB, + // the only difference is that every matching bit is repeated 4 times and is a 64-bit integer. + // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(values), 4)), 0); +} +/* +Input 16 bytes of data and convert it into a 64-bit integer, where one bit appears 4 times. +Compare with bytes32_mask_to_bits32_mask, a u8 array with a length of 32 + std::vector vec = {1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0}; + +bytes32_mask_to_bits32_mask 0100 0000 0000 0000,1101 0000 0000 0011 + + + (1101 0000 0000 0011) +bytes16_mask_to_bits64_mask 1111 1111 0000 1111,0000 0000 0000 0000,0000 0000 0000 0000,0000 0000 1111 1111 + (0100 0000 0000 0000) + 0000 1111 0000 0000,0000 0000 0000 0000,0000 0000 0000 0000,0000 0000 0000 0000 +*/ + +inline uint64_t bytes16_mask_to_bits64_mask(const uint8_t* data) { + const uint8x16_t vfilter = vld1q_u8(data); + return get_nibble_mask(vmvnq_u8(vceqzq_u8(vfilter))); +} +#endif + inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) { #ifdef __AVX2__ auto zero32 = _mm256_setzero_si256(); uint32_t mask = static_cast(_mm256_movemask_epi8( _mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast(data)), zero32))); -#elif defined(__SSE2__) || defined(__aarch64__) +#elif defined(__SSE2__) auto zero16 = _mm_setzero_si128(); uint32_t mask = (static_cast(_mm_movemask_epi8(_mm_cmpgt_epi8( @@ -51,8 +90,39 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) { return mask; } -inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) { - return bytes32_mask_to_bits32_mask(reinterpret_cast(data)); +inline auto bytes_mask_to_bits_mask(const uint8_t* data) { +#if defined(__ARM_NEON) && defined(__aarch64__) + return bytes16_mask_to_bits64_mask(data); +#else + return bytes32_mask_to_bits32_mask(data); +#endif +} + +inline constexpr auto bits_mask_all() { +#if defined(__ARM_NEON) && defined(__aarch64__) + return 0xffff'ffff'ffff'ffffULL; +#else + return 0xffffffff; +#endif +} + +template +void iterate_through_bits_mask(Func func, decltype(bytes_mask_to_bits_mask(nullptr)) mask) { +#if defined(__ARM_NEON) && defined(__aarch64__) + mask &= 0x8888'8888'8888'8888ULL; + while (mask) { + const auto index = __builtin_ctzll(mask) >> 2; + func(index); + mask &= mask - 1; + } + +#else + while (mask) { + const auto bit_pos = __builtin_ctzll(mask); + func(bit_pos); + mask = mask & (mask - 1); + } +#endif } inline size_t count_zero_num(const int8_t* __restrict data, size_t size) { diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp index 65e8c9d79ac57f..beeb6224c2253c 100644 --- a/be/src/vec/columns/column_decimal.cpp +++ b/be/src/vec/columns/column_decimal.cpp @@ -337,20 +337,18 @@ ColumnPtr ColumnDecimal::filter(const IColumn::Filter& filt, ssize_t result_s * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (0xFFFFFFFF == mask) { + auto mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - res_data.push_back(data_pos[idx]); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { res_data.push_back(data_pos[bit_pos]); }, mask); } filt_pos += SIMD_BYTES; @@ -382,22 +380,23 @@ size_t ColumnDecimal::filter(const IColumn::Filter& filter) { * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (0xFFFFFFFF == mask) { + auto mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES); result_data += SIMD_BYTES; } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - *result_data = data_pos[idx]; - ++result_data; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { + *result_data = data_pos[idx]; + ++result_data; + }, + mask); } filter_pos += SIMD_BYTES; diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp index 590e2047cab7c6..3d34bd5d55ba61 100644 --- a/be/src/vec/columns/column_vector.cpp +++ b/be/src/vec/columns/column_vector.cpp @@ -406,20 +406,19 @@ ColumnPtr ColumnVector::filter(const IColumn::Filter& filt, ssize_t result_si * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (0xFFFFFFFF == mask) { + auto mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - res_data.push_back_without_reserve(data_pos[idx]); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { res_data.push_back_without_reserve(data_pos[idx]); }, + mask); } filt_pos += SIMD_BYTES; @@ -453,22 +452,23 @@ size_t ColumnVector::filter(const IColumn::Filter& filter) { * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (0xFFFFFFFF == mask) { + auto mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES); result_data += SIMD_BYTES; } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - *result_data = data_pos[idx]; - ++result_data; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { + *result_data = data_pos[idx]; + ++result_data; + }, + mask); } filter_pos += SIMD_BYTES; diff --git a/be/src/vec/columns/columns_common.cpp b/be/src/vec/columns/columns_common.cpp index d1f7df85433ab2..0671e9abd85393 100644 --- a/be/src/vec/columns/columns_common.cpp +++ b/be/src/vec/columns/columns_common.cpp @@ -182,13 +182,14 @@ void filter_arrays_impl_generic(const PaddedPODArray& src_elems, memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T)); }; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_aligned) { - auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (mask == 0xffffffff) { + auto mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (mask == simd::bits_mask_all()) { /// SIMD_BYTES consecutive rows pass the filter const auto first = offsets_pos == offsets_begin; @@ -203,11 +204,8 @@ void filter_arrays_impl_generic(const PaddedPODArray& src_elems, res_elems.resize(elems_size_old + chunk_size); memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T)); } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - copy_array(offsets_pos + bit_pos); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { copy_array(offsets_pos + bit_pos); }, mask); } filt_pos += SIMD_BYTES; @@ -259,13 +257,14 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray& elems, result_data += arr_size; }; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const auto filter_end_aligned = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_aligned) { - auto mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (mask == 0xffffffff) { + auto mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (mask == simd::bits_mask_all()) { /// SIMD_BYTES consecutive rows pass the filter const auto first = offsets_pos == offsets_begin; @@ -281,12 +280,12 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray& elems, result_data += chunk_size; result_size += SIMD_BYTES; } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - copy_array(offsets_pos + bit_pos); - ++result_size; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { + copy_array(offsets_pos + bit_pos); + ++result_size; + }, + mask); } filter_pos += SIMD_BYTES;