Skip to content

Commit

Permalink
upd
Browse files Browse the repository at this point in the history
  • Loading branch information
Mryange committed Jul 30, 2024
1 parent bb5b05b commit 1eeedfa
Show file tree
Hide file tree
Showing 5 changed files with 124 additions and 80 deletions.
30 changes: 14 additions & 16 deletions be/src/olap/rowset/segment_v2/segment_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2206,23 +2206,21 @@ uint16_t SegmentIterator::_evaluate_vectorization_predicate(uint16_t* sel_rowid_

uint32_t sel_pos = 0;
const uint32_t sel_end = sel_pos + selected_size;
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;

while (sel_pos < sel_end_simd) {
auto mask = simd::bytes32_mask_to_bits32_mask(_ret_flags.data() + sel_pos);
auto mask = simd::bytes_mask_to_bits_mask(_ret_flags.data() + sel_pos);
if (0 == mask) {
//pass
} else if (0xffffffff == mask) {
} else if (simd::bits_mask_all() == mask) {
for (uint32_t i = 0; i < SIMD_BYTES; i++) {
sel_rowid_idx[new_size++] = sel_pos + i;
}
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
sel_rowid_idx[new_size++] = sel_pos + bit_pos;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { sel_rowid_idx[new_size++] = sel_pos + bit_pos; },
mask);
}
sel_pos += SIMD_BYTES;
}
Expand Down Expand Up @@ -2692,23 +2690,23 @@ uint16_t SegmentIterator::_evaluate_common_expr_filter(uint16_t* sel_rowid_idx,
uint16_t new_size = 0;
uint32_t sel_pos = 0;
const uint32_t sel_end = selected_size;
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const uint32_t sel_end_simd = sel_pos + selected_size / SIMD_BYTES * SIMD_BYTES;

while (sel_pos < sel_end_simd) {
auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos + sel_pos);
auto mask = simd::bytes_mask_to_bits_mask(filt_pos + sel_pos);
if (0 == mask) {
//pass
} else if (0xffffffff == mask) {
} else if (simd::bits_mask_all() == mask) {
for (uint32_t i = 0; i < SIMD_BYTES; i++) {
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + i];
}
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos];
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) {
sel_rowid_idx[new_size++] = sel_rowid_idx[sel_pos + bit_pos];
},
mask);
}
sel_pos += SIMD_BYTES;
}
Expand Down
62 changes: 55 additions & 7 deletions be/src/util/simd/bits.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,42 @@
#include <cstring>
#include <vector>

#if defined(__ARM_NEON) && defined(__aarch64__)
#include <arm_neon.h>
#endif

#include "util/sse_util.hpp"

namespace doris {
namespace simd {

/// todo(zeno) Compile add avx512 parameter, modify it to bytes64_mask_to_bits64_mask
/// Transform 32-byte mask to 32-bit mask
inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
consteval inline auto bits_mask_length() {
#if defined(__ARM_NEON) && defined(__aarch64__)
return 16;
#else
return 32;
#endif
}

#if defined(__ARM_NEON) && defined(__aarch64__)
inline uint64_t get_nibble_mask(uint8x16_t values) {
// It produces 4-bit out of each byte, alternating between the high 4-bits and low 4-bits of the 16-byte vector.
// Given that the comparison operators give a 16-byte result of 0x00 or 0xff, the result is close to being a PMOVMSKB,
// the only difference is that every matching bit is repeated 4 times and is a 64-bit integer.
// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon?CommentId=af187ac6-ae00-4e4d-bbf0-e142187aa92e
return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vreinterpretq_u16_u8(values), 4)), 0);
}
#endif

/// Currently, transforming a 32-byte mask to a 32-bit mask has a faster processing method in the ARM version.
inline auto bytes_mask_to_bits_mask(const uint8_t* data) {
#ifdef __AVX2__
auto zero32 = _mm256_setzero_si256();
uint32_t mask = static_cast<uint32_t>(_mm256_movemask_epi8(
auto mask = static_cast<uint32_t>(_mm256_movemask_epi8(
_mm256_cmpgt_epi8(_mm256_loadu_si256(reinterpret_cast<const __m256i*>(data)), zero32)));
return mask;
#elif defined(__ARM_NEON) && defined(__aarch64__)
return get_nibble_mask(vmvnq_u8(vceqzq_u8(vld1q_u8(data))));
#elif defined(__SSE2__) || defined(__aarch64__)
auto zero16 = _mm_setzero_si128();
uint32_t mask =
Expand All @@ -42,17 +66,41 @@ inline uint32_t bytes32_mask_to_bits32_mask(const uint8_t* data) {
_mm_loadu_si128(reinterpret_cast<const __m128i*>(data + 16)), zero16)))
<< 16) &
0xffff0000);
return mask;
#else
uint32_t mask = 0;
for (std::size_t i = 0; i < 32; ++i) {
mask |= static_cast<uint32_t>(1 == *(data + i)) << i;
}
#endif
return mask;
#endif
}

inline constexpr auto bits_mask_all() {
#if defined(__ARM_NEON) && defined(__aarch64__)
return 0xffff'ffff'ffff'ffffULL;
#else
return 0xffffffff;
#endif
}

inline uint32_t bytes32_mask_to_bits32_mask(const bool* data) {
return bytes32_mask_to_bits32_mask(reinterpret_cast<const uint8_t*>(data));
template <typename Func>
void iterate_through_bits_mask(Func func, decltype(bytes_mask_to_bits_mask(nullptr)) mask) {
#if defined(__ARM_NEON) && defined(__aarch64__)
mask &= 0x8888'8888'8888'8888ULL;
while (mask) {
const auto index = __builtin_ctzll(mask) >> 2;
func(index);
mask &= mask - 1;
}

#else
while (mask) {
const auto bit_pos = __builtin_ctzll(mask);
func(bit_pos);
mask = mask & (mask - 1);
}
#endif
}

inline size_t count_zero_num(const int8_t* __restrict data, size_t size) {
Expand Down
37 changes: 18 additions & 19 deletions be/src/vec/columns/column_decimal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -335,20 +335,18 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter& filt, ssize_t result_s
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
res_data.insert(data_pos, data_pos + SIMD_BYTES);
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
res_data.push_back(data_pos[idx]);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { res_data.push_back(data_pos[bit_pos]); }, mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -380,22 +378,23 @@ size_t ColumnDecimal<T>::filter(const IColumn::Filter& filter) {
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES);
result_data += SIMD_BYTES;
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
*result_data = data_pos[idx];
++result_data;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) {
*result_data = data_pos[idx];
++result_data;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down
38 changes: 19 additions & 19 deletions be/src/vec/columns/column_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -404,20 +404,19 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter& filt, ssize_t result_si
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
res_data.insert(data_pos, data_pos + SIMD_BYTES);
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
res_data.push_back_without_reserve(data_pos[idx]);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) { res_data.push_back_without_reserve(data_pos[idx]); },
mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -451,22 +450,23 @@ size_t ColumnVector<T>::filter(const IColumn::Filter& filter) {
* completely pass or do not pass the filter.
* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
*/
static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const UInt8* filter_end_sse = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_sse) {
uint32_t mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (0xFFFFFFFF == mask) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (simd::bits_mask_all() == mask) {
memmove(result_data, data_pos, sizeof(T) * SIMD_BYTES);
result_data += SIMD_BYTES;
} else {
while (mask) {
const size_t idx = __builtin_ctzll(mask);
*result_data = data_pos[idx];
++result_data;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t idx) {
*result_data = data_pos[idx];
++result_data;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down
37 changes: 18 additions & 19 deletions be/src/vec/columns/columns_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,14 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>& src_elems,
memcpy(&res_elems[elems_size_old], &src_elems[arr_offset], arr_size * sizeof(T));
};

static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const auto filt_end_aligned = filt_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filt_pos < filt_end_aligned) {
auto mask = simd::bytes32_mask_to_bits32_mask(filt_pos);

if (mask == 0xffffffff) {
auto mask = simd::bytes_mask_to_bits_mask(filt_pos);
if (0 == mask) {
//pass
} else if (mask == simd::bits_mask_all()) {
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;

Expand All @@ -203,11 +204,8 @@ void filter_arrays_impl_generic(const PaddedPODArray<T>& src_elems,
res_elems.resize(elems_size_old + chunk_size);
memcpy(&res_elems[elems_size_old], &src_elems[chunk_offset], chunk_size * sizeof(T));
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
copy_array(offsets_pos + bit_pos);
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) { copy_array(offsets_pos + bit_pos); }, mask);
}

filt_pos += SIMD_BYTES;
Expand Down Expand Up @@ -259,13 +257,14 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray<T>& elems,
result_data += arr_size;
};

static constexpr size_t SIMD_BYTES = 32;
static constexpr size_t SIMD_BYTES = simd::bits_mask_length();
const auto filter_end_aligned = filter_pos + size / SIMD_BYTES * SIMD_BYTES;

while (filter_pos < filter_end_aligned) {
auto mask = simd::bytes32_mask_to_bits32_mask(filter_pos);

if (mask == 0xffffffff) {
auto mask = simd::bytes_mask_to_bits_mask(filter_pos);
if (0 == mask) {
//pass
} else if (mask == simd::bits_mask_all()) {
/// SIMD_BYTES consecutive rows pass the filter
const auto first = offsets_pos == offsets_begin;

Expand All @@ -281,12 +280,12 @@ size_t filter_arrays_impl_generic_without_reserving(PaddedPODArray<T>& elems,
result_data += chunk_size;
result_size += SIMD_BYTES;
} else {
while (mask) {
const size_t bit_pos = __builtin_ctzll(mask);
copy_array(offsets_pos + bit_pos);
++result_size;
mask = mask & (mask - 1);
}
simd::iterate_through_bits_mask(
[&](const size_t bit_pos) {
copy_array(offsets_pos + bit_pos);
++result_size;
},
mask);
}

filter_pos += SIMD_BYTES;
Expand Down

0 comments on commit 1eeedfa

Please sign in to comment.