diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 7c9df76eb7868b4..b6b409ef42083ca 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -2206,23 +2206,21 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_ uint32_t sel_pos = 0; const uint32_t sel_end = sel_pos + selected_size; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES; while (sel_pos < sel_end_simd) { - auto mask = simd::bytes32_mask_to_bits32_mask(_ret_flags.data() + sel_pos); + auto mask = simd::bytes_mask_to_bits_mask(_ret_flags.data() + sel_pos); if (0 == mask) { //pass - } else if (0xffffffff == mask) { + } else if (simd::bits_mask_all() == mask) { for (uint32_t i = 0; i < SIMD_BYTES; i++) { sel_rowid_idx[new_size++] = sel_pos + i; } } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - sel_rowid_idx[new_size++] = sel_pos + bit_pos; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { sel_rowid_idx[new_size++] = sel_pos + bit_pos; }, + mask); } sel_pos += SIMD_BYTES; } @@ -2692,23 +2690,23 @@ uint16_t SegmentIterator::_evaluate_common_expr_filter(uint16_t* sel_rowid_idx, uint16_t new_size = 0; uint32_t sel_pos = 0; const uint32_t sel_end = selected_size; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES; while (sel_pos < sel_end_simd) { - auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos + sel_pos); + auto mask = simd::bytes_mask_to_bits_mask(filt_pos + sel_pos); if (0 == mask) { //pass - } else if (0xffffffff == mask) { + } else if (simd::bits_mask_all() == mask) { for (uint32_t i = 0; i < SIMD_BYTES; i++) { sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + i]; } } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos]; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { + sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos]; + }, + mask); } sel_pos += SIMD_BYTES; } diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h index 45f82b23ac99000..ef3fc9a00099aa2 100644 --- a/be/src/util/simd/bits.h +++ b/be/src/util/simd/bits.h @@ -21,18 +21,42 @@ #include #include +#if defined(__ARM_NEON) && defined(__aarch64__) +#include +#endif + #include "util/sse_util.hpp" namespace doris { namespace simd { -/// todo(zeno) Compile add avx512 parameter, modify it to bytes64_mask_to_bits64_mask -/// Transform 32-byte mask to 32-bit mask -inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) { +consteval inline auto bits_mask_length() { +#if defined(__ARM_NEON) && defined(__aarch64__) + return 16; +#else + return 32; +#endif +} + +#if defined(__ARM_NEON) && defined(__aarch64__) +inline uint64_t get_nibble_mask(uint8x16_t values) { + // It produces 4-bit out of each byte, alternating between the high 4-bits and low 4-bits of the 16-byte vector. + // Given that the comparison operators give a 16-byte result of 0x00 or 0xff, the result is close to being a PMOVMSKB, + // the only difference is that every matching bit is repeated 4 times and is a 64-bit integer. + // https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(values), 4)), 0); +} +#endif + +/// Currently, transforming a 32-byte mask to a 32-bit mask has a faster processing method in the ARM version. +inline auto bytes_mask_to_bits_mask(const uint8_t* data) { #ifdef __AVX2__ auto zero32 = _mm256_setzero_si256(); - uint32_t mask = static_cast(_mm256_movemask_epi8( + auto mask = static_cast(_mm256_movemask_epi8( _mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast(data)), zero32))); + return mask; +#elif defined(__ARM_NEON) && defined(__aarch64__) + return get_nibble_mask(vmvnq_u8(vceqzq_u8(vld1q_u8(data)))); #elif defined(__SSE2__) || defined(__aarch64__) auto zero16 = _mm_setzero_si128(); uint32_t mask = @@ -42,17 +66,41 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) { _mm_loadu_si128(reinterpret_cast(data + 16)), zero16))) << 16) & 0xffff0000); + return mask; #else uint32_t mask = 0; for (std::size_t i = 0; i < 32; ++i) { mask |= static_cast(1 == *(data + i)) << i; } -#endif return mask; +#endif +} + +inline constexpr auto bits_mask_all() { +#if defined(__ARM_NEON) && defined(__aarch64__) + return 0xffff'ffff'ffff'ffffULL; +#else + return 0xffffffff; +#endif } -inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) { - return bytes32_mask_to_bits32_mask(reinterpret_cast(data)); +template +void iterate_through_bits_mask(Func func, decltype(bytes_mask_to_bits_mask(nullptr)) mask) { +#if defined(__ARM_NEON) && defined(__aarch64__) + mask &= 0x8888'8888'8888'8888ULL; + while (mask) { + const auto index = __builtin_ctzll(mask) >> 2; + func(index); + mask &= mask - 1; + } + +#else + while (mask) { + const auto bit_pos = __builtin_ctzll(mask); + func(bit_pos); + mask = mask & (mask - 1); + } +#endif } inline size_t count_zero_num(const int8_t* __restrict data, size_t size) { diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp index b4eb523d165d7d9..b9518e319d3699c 100644 --- a/be/src/vec/columns/column_decimal.cpp +++ b/be/src/vec/columns/column_decimal.cpp @@ -335,20 +335,18 @@ ColumnPtr ColumnDecimal::filter(const IColumn::Filter& filt, ssize_t result_s * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (0xFFFFFFFF == mask) { + auto mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - res_data.push_back(data_pos[idx]); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { res_data.push_back(data_pos[bit_pos]); }, mask); } filt_pos += SIMD_BYTES; @@ -380,22 +378,23 @@ size_t ColumnDecimal::filter(const IColumn::Filter& filter) { * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (0xFFFFFFFF == mask) { + auto mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES); result_data += SIMD_BYTES; } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - *result_data = data_pos[idx]; - ++result_data; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { + *result_data = data_pos[idx]; + ++result_data; + }, + mask); } filter_pos += SIMD_BYTES; diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp index c3b2f43bb66b5fd..2d4b52371131f0e 100644 --- a/be/src/vec/columns/column_vector.cpp +++ b/be/src/vec/columns/column_vector.cpp @@ -404,20 +404,19 @@ ColumnPtr ColumnVector::filter(const IColumn::Filter& filt, ssize_t result_si * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (0xFFFFFFFF == mask) { + auto mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - res_data.push_back_without_reserve(data_pos[idx]); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { res_data.push_back_without_reserve(data_pos[idx]); }, + mask); } filt_pos += SIMD_BYTES; @@ -451,22 +450,23 @@ size_t ColumnVector::filter(const IColumn::Filter& filter) { * completely pass or do not pass the filter. * Therefore, we will optimistically check the parts of `SIMD_BYTES` values. */ - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (0xFFFFFFFF == mask) { + auto mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES); result_data += SIMD_BYTES; } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - *result_data = data_pos[idx]; - ++result_data; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { + *result_data = data_pos[idx]; + ++result_data; + }, + mask); } filter_pos += SIMD_BYTES; diff --git a/be/src/vec/columns/columns_common.cpp b/be/src/vec/columns/columns_common.cpp index d1f7df85433ab28..0671e9abd85393f 100644 --- a/be/src/vec/columns/columns_common.cpp +++ b/be/src/vec/columns/columns_common.cpp @@ -182,13 +182,14 @@ void filter_arrays_impl_generic(const PaddedPODArray& src_elems, memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T)); }; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_aligned) { - auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (mask == 0xffffffff) { + auto mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (mask == simd::bits_mask_all()) { /// SIMD_BYTES consecutive rows pass the filter const auto first = offsets_pos == offsets_begin; @@ -203,11 +204,8 @@ void filter_arrays_impl_generic(const PaddedPODArray& src_elems, res_elems.resize(elems_size_old + chunk_size); memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T)); } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - copy_array(offsets_pos + bit_pos); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { copy_array(offsets_pos + bit_pos); }, mask); } filt_pos += SIMD_BYTES; @@ -259,13 +257,14 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray& elems, result_data += arr_size; }; - static constexpr size_t SIMD_BYTES = 32; + static constexpr size_t SIMD_BYTES = simd::bits_mask_length(); const auto filter_end_aligned = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_aligned) { - auto mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (mask == 0xffffffff) { + auto mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (mask == simd::bits_mask_all()) { /// SIMD_BYTES consecutive rows pass the filter const auto first = offsets_pos == offsets_begin; @@ -281,12 +280,12 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray& elems, result_data += chunk_size; result_size += SIMD_BYTES; } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - copy_array(offsets_pos + bit_pos); - ++result_size; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { + copy_array(offsets_pos + bit_pos); + ++result_size; + }, + mask); } filter_pos += SIMD_BYTES;