diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 51245b57022b039..b29bfe602f77982 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -2200,19 +2200,17 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_ const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES; while (sel_pos < sel_end_simd) { - auto mask = simd::bytes32_mask_to_bits32_mask(_ret_flags.data() + sel_pos); + auto mask = simd::bytes_mask_to_bits_mask(_ret_flags.data() + sel_pos); if (0 == mask) { //pass - } else if (0xffffffff == mask) { + } else if (simd::bits_mask_all() == mask) { for (uint32_t i = 0; i < SIMD_BYTES; i++) { sel_rowid_idx[new_size++] = sel_pos + i; } } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - sel_rowid_idx[new_size++] = sel_pos + bit_pos; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { sel_rowid_idx[new_size++] = sel_pos + bit_pos; }, + mask); } sel_pos += SIMD_BYTES; } @@ -2686,19 +2684,19 @@ uint16_t SegmentIterator::_evaluate_common_expr_filter(uint16_t* sel_rowid_idx, const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES; while (sel_pos < sel_end_simd) { - auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos + sel_pos); + auto mask = simd::bytes_mask_to_bits_mask(filt_pos + sel_pos); if (0 == mask) { //pass - } else if (0xffffffff == mask) { + } else if (simd::bits_mask_all() == mask) { for (uint32_t i = 0; i < SIMD_BYTES; i++) { sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + i]; } } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos]; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { + sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos]; + }, + mask); } sel_pos += SIMD_BYTES; } diff --git a/be/src/util/simd/bits.h b/be/src/util/simd/bits.h index 45f82b23ac99000..78148b13d0f3efa 100644 --- a/be/src/util/simd/bits.h +++ b/be/src/util/simd/bits.h @@ -26,9 +26,8 @@ namespace doris { namespace simd { -/// todo(zeno) Compile add avx512 parameter, modify it to bytes64_mask_to_bits64_mask -/// Transform 32-byte mask to 32-bit mask -inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) { +/// Currently, transforming a 32-byte mask to a 32-bit mask has a faster processing method in the ARM version. +inline auto bytes_mask_to_bits_mask(const uint8_t* data) { #ifdef __AVX2__ auto zero32 = _mm256_setzero_si256(); uint32_t mask = static_cast(_mm256_movemask_epi8( @@ -51,8 +50,17 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) { return mask; } -inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) { - return bytes32_mask_to_bits32_mask(reinterpret_cast(data)); +inline constexpr auto bits_mask_all() { + return 0xffffffff; +} + +template +void iterate_through_bits_mask(Func func, decltype(bytes_mask_to_bits_mask(nullptr)) mask) { + while (mask) { + const size_t bit_pos = __builtin_ctzll(mask); + func(bit_pos); + mask = mask & (mask - 1); + } } inline size_t count_zero_num(const int8_t* __restrict data, size_t size) { diff --git a/be/src/vec/columns/column_decimal.cpp b/be/src/vec/columns/column_decimal.cpp index 420984bf83c6e1d..2f738750ad05eb7 100644 --- a/be/src/vec/columns/column_decimal.cpp +++ b/be/src/vec/columns/column_decimal.cpp @@ -322,16 +322,14 @@ ColumnPtr ColumnDecimal::filter(const IColumn::Filter& filt, ssize_t result_s const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (0xFFFFFFFF == mask) { + uint32_t mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - res_data.push_back(data_pos[idx]); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { res_data.push_back(data_pos[bit_pos]); }, mask); } filt_pos += SIMD_BYTES; @@ -367,18 +365,19 @@ size_t ColumnDecimal::filter(const IColumn::Filter& filter) { const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (0xFFFFFFFF == mask) { + uint32_t mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES); result_data += SIMD_BYTES; } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - *result_data = data_pos[idx]; - ++result_data; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { + *result_data = data_pos[idx]; + ++result_data; + }, + mask); } filter_pos += SIMD_BYTES; diff --git a/be/src/vec/columns/column_vector.cpp b/be/src/vec/columns/column_vector.cpp index 14d52045943ce4c..79cdd4a375f063a 100644 --- a/be/src/vec/columns/column_vector.cpp +++ b/be/src/vec/columns/column_vector.cpp @@ -389,16 +389,15 @@ ColumnPtr ColumnVector::filter(const IColumn::Filter& filt, ssize_t result_si const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (0xFFFFFFFF == mask) { + uint32_t mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { res_data.insert(data_pos, data_pos + SIMD_BYTES); } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - res_data.push_back_without_reserve(data_pos[idx]); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { res_data.push_back_without_reserve(data_pos[idx]); }, + mask); } filt_pos += SIMD_BYTES; @@ -436,18 +435,19 @@ size_t ColumnVector::filter(const IColumn::Filter& filter) { const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_sse) { - uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (0xFFFFFFFF == mask) { + uint32_t mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (simd::bits_mask_all() == mask) { memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES); result_data += SIMD_BYTES; } else { - while (mask) { - const size_t idx = __builtin_ctzll(mask); - *result_data = data_pos[idx]; - ++result_data; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t idx) { + *result_data = data_pos[idx]; + ++result_data; + }, + mask); } filter_pos += SIMD_BYTES; diff --git a/be/src/vec/columns/columns_common.cpp b/be/src/vec/columns/columns_common.cpp index d1f7df85433ab28..2699200c72e39d1 100644 --- a/be/src/vec/columns/columns_common.cpp +++ b/be/src/vec/columns/columns_common.cpp @@ -186,9 +186,10 @@ void filter_arrays_impl_generic(const PaddedPODArray& src_elems, const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES; while (filt_pos < filt_end_aligned) { - auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos); - - if (mask == 0xffffffff) { + auto mask = simd::bytes_mask_to_bits_mask(filt_pos); + if (0 == mask) { + //pass + } else if (mask == simd::bits_mask_all()) { /// SIMD_BYTES consecutive rows pass the filter const auto first = offsets_pos == offsets_begin; @@ -203,11 +204,8 @@ void filter_arrays_impl_generic(const PaddedPODArray& src_elems, res_elems.resize(elems_size_old + chunk_size); memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T)); } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - copy_array(offsets_pos + bit_pos); - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { copy_array(offsets_pos + bit_pos); }, mask); } filt_pos += SIMD_BYTES; @@ -263,9 +261,10 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray& elems, const auto filter_end_aligned = filter_pos + size / SIMD_BYTES * SIMD_BYTES; while (filter_pos < filter_end_aligned) { - auto mask = simd::bytes32_mask_to_bits32_mask(filter_pos); - - if (mask == 0xffffffff) { + auto mask = simd::bytes_mask_to_bits_mask(filter_pos); + if (0 == mask) { + //pass + } else if (mask == simd::bits_mask_all()) { /// SIMD_BYTES consecutive rows pass the filter const auto first = offsets_pos == offsets_begin; @@ -281,12 +280,12 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray& elems, result_data += chunk_size; result_size += SIMD_BYTES; } else { - while (mask) { - const size_t bit_pos = __builtin_ctzll(mask); - copy_array(offsets_pos + bit_pos); - ++result_size; - mask = mask & (mask - 1); - } + simd::iterate_through_bits_mask( + [&](const size_t bit_pos) { + copy_array(offsets_pos + bit_pos); + ++result_size; + }, + mask); } filter_pos += SIMD_BYTES;