From 241a5b4f16d447d39faa3c79db51f41d0b04349c Mon Sep 17 00:00:00 2001 From: Antoine Pitrou Date: Thu, 13 Jul 2023 16:47:11 +0200 Subject: [PATCH] Fix conditional compilation logic for runtime-selected AVX2 functions --- cpp/src/arrow/CMakeLists.txt | 22 +++++++++---------- cpp/src/arrow/acero/CMakeLists.txt | 6 ++--- cpp/src/arrow/acero/bloom_filter.cc | 8 +++---- cpp/src/arrow/acero/bloom_filter.h | 5 +++-- cpp/src/arrow/acero/bloom_filter_avx2.cc | 5 +---- cpp/src/arrow/acero/swiss_join_avx2.cc | 4 ---- cpp/src/arrow/acero/swiss_join_internal.h | 2 +- cpp/src/arrow/compute/key_hash.cc | 6 ++--- cpp/src/arrow/compute/key_hash.h | 4 ++-- cpp/src/arrow/compute/key_hash_avx2.cc | 4 ---- cpp/src/arrow/compute/key_map.cc | 4 ++-- cpp/src/arrow/compute/key_map.h | 2 +- cpp/src/arrow/compute/key_map_avx2.cc | 4 ---- cpp/src/arrow/compute/row/compare_internal.cc | 8 +++---- cpp/src/arrow/compute/row/compare_internal.h | 2 +- .../compute/row/compare_internal_avx2.cc | 4 ---- cpp/src/arrow/compute/row/encode_internal.cc | 10 ++++----- cpp/src/arrow/compute/row/encode_internal.h | 6 ++--- .../arrow/compute/row/encode_internal_avx2.cc | 4 ---- cpp/src/arrow/compute/util.cc | 10 ++++----- cpp/src/arrow/compute/util.h | 2 +- cpp/src/arrow/compute/util_avx2.cc | 8 ++----- 22 files changed, 52 insertions(+), 78 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index fccff6c8cf1a9..a398e790de14b 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -119,7 +119,7 @@ function(ADD_ARROW_BENCHMARK REL_TEST_NAME) ${ARG_UNPARSED_ARGUMENTS}) endfunction() -macro(append_avx2_src SRC) +macro(append_runtime_avx2_src SRC) if(ARROW_HAVE_RUNTIME_AVX2) list(APPEND ARROW_SRCS ${SRC}) set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) @@ -127,7 +127,7 @@ macro(append_avx2_src SRC) endif() endmacro() -macro(append_avx512_src SRC) +macro(append_runtime_avx512_src SRC) if(ARROW_HAVE_RUNTIME_AVX512) list(APPEND ARROW_SRCS ${SRC}) set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) @@ -254,8 +254,8 @@ if(ARROW_JEMALLOC) PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) endif() -append_avx2_src(util/bpacking_avx2.cc) -append_avx512_src(util/bpacking_avx512.cc) +append_runtime_avx2_src(util/bpacking_avx2.cc) +append_runtime_avx512_src(util/bpacking_avx512.cc) if(ARROW_HAVE_NEON) list(APPEND ARROW_SRCS util/bpacking_neon.cc) @@ -425,11 +425,11 @@ list(APPEND compute/row/row_internal.cc compute/util.cc) -append_avx2_src(compute/key_hash_avx2.cc) -append_avx2_src(compute/key_map_avx2.cc) -append_avx2_src(compute/row/compare_internal_avx2.cc) -append_avx2_src(compute/row/encode_internal_avx2.cc) -append_avx2_src(compute/util_avx2.cc) +append_runtime_avx2_src(compute/key_hash_avx2.cc) +append_runtime_avx2_src(compute/key_map_avx2.cc) +append_runtime_avx2_src(compute/row/compare_internal_avx2.cc) +append_runtime_avx2_src(compute/row/encode_internal_avx2.cc) +append_runtime_avx2_src(compute/util_avx2.cc) if(ARROW_COMPUTE) # Include the remaining kernels @@ -464,8 +464,8 @@ if(ARROW_COMPUTE) compute/kernels/vector_select_k.cc compute/kernels/vector_sort.cc) - append_avx2_src(compute/kernels/aggregate_basic_avx2.cc) - append_avx512_src(compute/kernels/aggregate_basic_avx512.cc) + append_runtime_avx2_src(compute/kernels/aggregate_basic_avx2.cc) + append_runtime_avx512_src(compute/kernels/aggregate_basic_avx512.cc) endif() if(ARROW_FILESYSTEM) diff --git a/cpp/src/arrow/acero/CMakeLists.txt b/cpp/src/arrow/acero/CMakeLists.txt index 287884432b9fe..c2c91db58d38a 100644 --- a/cpp/src/arrow/acero/CMakeLists.txt +++ b/cpp/src/arrow/acero/CMakeLists.txt @@ -19,7 +19,7 @@ add_custom_target(arrow_acero) arrow_install_all_headers("arrow/acero") -macro(append_acero_avx2_src SRC) +macro(append_acero_runtime_avx2_src SRC) if(ARROW_HAVE_RUNTIME_AVX2) list(APPEND ARROW_ACERO_SRCS ${SRC}) set_source_files_properties(${SRC} PROPERTIES SKIP_PRECOMPILE_HEADERS ON) @@ -56,8 +56,8 @@ set(ARROW_ACERO_SRCS union_node.cc util.cc) -append_acero_avx2_src(bloom_filter_avx2.cc) -append_acero_avx2_src(swiss_join_avx2.cc) +append_acero_runtime_avx2_src(bloom_filter_avx2.cc) +append_acero_runtime_avx2_src(swiss_join_avx2.cc) set(ARROW_ACERO_SHARED_LINK_LIBS) set(ARROW_ACERO_STATIC_LINK_LIBS) diff --git a/cpp/src/arrow/acero/bloom_filter.cc b/cpp/src/arrow/acero/bloom_filter.cc index ad5e66ded0613..b9855ee506d27 100644 --- a/cpp/src/arrow/acero/bloom_filter.cc +++ b/cpp/src/arrow/acero/bloom_filter.cc @@ -123,7 +123,7 @@ void BlockedBloomFilter::InsertImp(int64_t num_rows, const T* hashes) { void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows, const uint32_t* hashes) { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = Insert_avx2(num_rows, hashes); } @@ -134,7 +134,7 @@ void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows, void BlockedBloomFilter::Insert(int64_t hardware_flags, int64_t num_rows, const uint64_t* hashes) { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = Insert_avx2(num_rows, hashes); } @@ -181,7 +181,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags, int64_t num_rows, bool enable_prefetch) const { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (!(enable_prefetch && UsePrefetch()) && (hardware_flags & arrow::internal::CpuInfo::AVX2)) { num_processed = Find_avx2(num_rows, hashes, result_bit_vector); @@ -202,7 +202,7 @@ void BlockedBloomFilter::Find(int64_t hardware_flags, int64_t num_rows, bool enable_prefetch) const { int64_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (!(enable_prefetch && UsePrefetch()) && (hardware_flags & arrow::internal::CpuInfo::AVX2)) { num_processed = Find_avx2(num_rows, hashes, result_bit_vector); diff --git a/cpp/src/arrow/acero/bloom_filter.h b/cpp/src/arrow/acero/bloom_filter.h index b8f7f8cd256b1..50d07bfd948e0 100644 --- a/cpp/src/arrow/acero/bloom_filter.h +++ b/cpp/src/arrow/acero/bloom_filter.h @@ -17,13 +17,14 @@ #pragma once -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) #include #endif #include #include #include + #include "arrow/acero/partition_util.h" #include "arrow/acero/util.h" #include "arrow/memory_pool.h" @@ -203,7 +204,7 @@ class ARROW_ACERO_EXPORT BlockedBloomFilter { void SingleFold(int num_folds); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) inline __m256i mask_avx2(__m256i hash) const; inline __m256i block_id_avx2(__m256i hash) const; int64_t Insert_avx2(int64_t num_rows, const uint32_t* hashes); diff --git a/cpp/src/arrow/acero/bloom_filter_avx2.cc b/cpp/src/arrow/acero/bloom_filter_avx2.cc index b6c281276db8d..5816bb4fc0a32 100644 --- a/cpp/src/arrow/acero/bloom_filter_avx2.cc +++ b/cpp/src/arrow/acero/bloom_filter_avx2.cc @@ -16,14 +16,13 @@ // under the License. #include + #include "arrow/acero/bloom_filter.h" #include "arrow/util/bit_util.h" namespace arrow { namespace acero { -#if defined(ARROW_HAVE_AVX2) - inline __m256i BlockedBloomFilter::mask_avx2(__m256i hash) const { // AVX2 translation of mask() method // @@ -132,7 +131,5 @@ int64_t BlockedBloomFilter::Insert_avx2(int64_t num_rows, const uint64_t* hashes return InsertImp_avx2(num_rows, hashes); } -#endif - } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join_avx2.cc b/cpp/src/arrow/acero/swiss_join_avx2.cc index d5c0b7817f55f..0888dd8938455 100644 --- a/cpp/src/arrow/acero/swiss_join_avx2.cc +++ b/cpp/src/arrow/acero/swiss_join_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace acero { -#if defined(ARROW_HAVE_AVX2) - template int RowArrayAccessor::Visit_avx2(const RowTableImpl& rows, int column_id, int num_rows, const uint32_t* row_ids, @@ -191,7 +189,5 @@ int RowArrayAccessor::VisitNulls_avx2(const RowTableImpl& rows, int column_id, return num_rows - (num_rows % unroll); } -#endif - } // namespace acero } // namespace arrow diff --git a/cpp/src/arrow/acero/swiss_join_internal.h b/cpp/src/arrow/acero/swiss_join_internal.h index cd12b34a0c6dc..88b80f06f57f2 100644 --- a/cpp/src/arrow/acero/swiss_join_internal.h +++ b/cpp/src/arrow/acero/swiss_join_internal.h @@ -80,7 +80,7 @@ class RowArrayAccessor { const uint32_t* row_ids, PROCESS_VALUE_FN process_value_fn); private: -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) // This is equivalent to Visit method, but processing 8 rows at a time in a // loop. // Returns the number of processed rows, which may be less than requested (up diff --git a/cpp/src/arrow/compute/key_hash.cc b/cpp/src/arrow/compute/key_hash.cc index 3fcfbf3d8312d..f5867b405ec71 100644 --- a/cpp/src/arrow/compute/key_hash.cc +++ b/cpp/src/arrow/compute/key_hash.cc @@ -236,7 +236,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool combine_hashes, uint32_t const uint32_t* offsets, const uint8_t* concatenated_keys, uint32_t* hashes, uint32_t* hashes_temp_for_combine) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets, concatenated_keys, hashes, hashes_temp_for_combine); @@ -255,7 +255,7 @@ void Hashing32::HashVarLen(int64_t hardware_flags, bool combine_hashes, uint32_t const uint64_t* offsets, const uint8_t* concatenated_keys, uint32_t* hashes, uint32_t* hashes_temp_for_combine) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashVarLen_avx2(combine_hashes, num_rows, offsets, concatenated_keys, hashes, hashes_temp_for_combine); @@ -361,7 +361,7 @@ void Hashing32::HashFixed(int64_t hardware_flags, bool combine_hashes, uint32_t } uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { num_processed = HashFixedLen_avx2(combine_hashes, num_rows, length, keys, hashes, hashes_temp_for_combine); diff --git a/cpp/src/arrow/compute/key_hash.h b/cpp/src/arrow/compute/key_hash.h index e43d7b8df523d..b193716c9bdfd 100644 --- a/cpp/src/arrow/compute/key_hash.h +++ b/cpp/src/arrow/compute/key_hash.h @@ -17,7 +17,7 @@ #pragma once -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) #include #endif @@ -115,7 +115,7 @@ class ARROW_EXPORT Hashing32 { static void HashInt(bool combine_hashes, uint32_t num_keys, uint64_t length_key, const uint8_t* keys, uint32_t* hashes); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static inline __m256i Avalanche_avx2(__m256i hash); static inline __m256i CombineHashesImp_avx2(__m256i previous_hash, __m256i hash); template diff --git a/cpp/src/arrow/compute/key_hash_avx2.cc b/cpp/src/arrow/compute/key_hash_avx2.cc index f30c3460bda60..1b444b576784f 100644 --- a/cpp/src/arrow/compute/key_hash_avx2.cc +++ b/cpp/src/arrow/compute/key_hash_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - inline __m256i Hashing32::Avalanche_avx2(__m256i hash) { hash = _mm256_xor_si256(hash, _mm256_srli_epi32(hash, 15)); hash = _mm256_mullo_epi32(hash, _mm256_set1_epi32(PRIME32_2)); @@ -315,7 +313,5 @@ uint32_t Hashing32::HashVarLen_avx2(bool combine_hashes, uint32_t num_rows, } } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/key_map.cc b/cpp/src/arrow/compute/key_map.cc index fd5c404a07f8d..71ca56c91a9ff 100644 --- a/cpp/src/arrow/compute/key_map.cc +++ b/cpp/src/arrow/compute/key_map.cc @@ -133,7 +133,7 @@ void SwissTable::extract_group_ids(const int num_keys, const uint16_t* optional_ // Optimistically use simplified lookup involving only a start block to find // a single group id candidate for every input. -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) int num_group_id_bytes = num_group_id_bits / 8; if ((hardware_flags_ & arrow::internal::CpuInfo::AVX2) && !optional_selection) { num_processed = extract_group_ids_avx2(num_keys, hashes, local_slots, out_group_ids, @@ -301,7 +301,7 @@ void SwissTable::early_filter(const int num_keys, const uint32_t* hashes, // Optimistically use simplified lookup involving only a start block to find // a single group id candidate for every input. int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags_ & arrow::internal::CpuInfo::AVX2) { if (log_blocks_ <= 4) { num_processed = early_filter_imp_avx2_x32(num_keys, hashes, out_match_bitvector, diff --git a/cpp/src/arrow/compute/key_map.h b/cpp/src/arrow/compute/key_map.h index 7ab48470f21e4..95fb3be274288 100644 --- a/cpp/src/arrow/compute/key_map.h +++ b/cpp/src/arrow/compute/key_map.h @@ -163,7 +163,7 @@ class ARROW_EXPORT SwissTable { // void early_filter_imp(const int num_keys, const uint32_t* hashes, uint8_t* out_match_bitvector, uint8_t* out_local_slots) const; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) int early_filter_imp_avx2_x8(const int num_hashes, const uint32_t* hashes, uint8_t* out_match_bitvector, uint8_t* out_local_slots) const; diff --git a/cpp/src/arrow/compute/key_map_avx2.cc b/cpp/src/arrow/compute/key_map_avx2.cc index eb318ff188fbb..731553511044f 100644 --- a/cpp/src/arrow/compute/key_map_avx2.cc +++ b/cpp/src/arrow/compute/key_map_avx2.cc @@ -23,8 +23,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - // This is more or less translation of equivalent scalar code, adjusted for a // different instruction set (e.g. missing leading zero count instruction). // @@ -412,7 +410,5 @@ int SwissTable::extract_group_ids_avx2(const int num_keys, const uint32_t* hashe return num_keys - (num_keys % unroll); } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/compare_internal.cc b/cpp/src/arrow/compute/row/compare_internal.cc index 39ac33932b548..7c402e7a2384d 100644 --- a/cpp/src/arrow/compute/row/compare_internal.cc +++ b/cpp/src/arrow/compute/row/compare_internal.cc @@ -42,7 +42,7 @@ void KeyCompare::NullUpdateColumnToRow(uint32_t id_col, uint32_t num_rows_to_com return; } uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = NullUpdateColumnToRow_avx2(use_selection, id_col, num_rows_to_compare, sel_left_maybe_null, left_to_right_map, @@ -130,7 +130,7 @@ void KeyCompare::CompareBinaryColumnToRow(uint32_t offset_within_row, const RowTableImpl& rows, uint8_t* match_bytevector) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = CompareBinaryColumnToRow_avx2( use_selection, offset_within_row, num_rows_to_compare, sel_left_maybe_null, @@ -297,7 +297,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t id_varbinary_col, const RowTableImpl& rows, uint8_t* match_bytevector) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = CompareVarBinaryColumnToRow_avx2( use_selection, is_first_varbinary_col, id_varbinary_col, num_rows_to_compare, @@ -313,7 +313,7 @@ void KeyCompare::CompareVarBinaryColumnToRow(uint32_t id_varbinary_col, void KeyCompare::AndByteVectors(LightContext* ctx, uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B) { uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { num_processed = AndByteVectors_avx2(num_elements, bytevector_A, bytevector_B); } diff --git a/cpp/src/arrow/compute/row/compare_internal.h b/cpp/src/arrow/compute/row/compare_internal.h index 638b8c2ec721f..db953fbe11271 100644 --- a/cpp/src/arrow/compute/row/compare_internal.h +++ b/cpp/src/arrow/compute/row/compare_internal.h @@ -86,7 +86,7 @@ class ARROW_EXPORT KeyCompare { static void AndByteVectors(LightContext* ctx, uint32_t num_elements, uint8_t* bytevector_A, const uint8_t* bytevector_B); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) template static uint32_t NullUpdateColumnToRowImp_avx2( diff --git a/cpp/src/arrow/compute/row/compare_internal_avx2.cc b/cpp/src/arrow/compute/row/compare_internal_avx2.cc index 95f37ab617db5..ff407c51b83cb 100644 --- a/cpp/src/arrow/compute/row/compare_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/compare_internal_avx2.cc @@ -24,8 +24,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - inline __m256i set_first_n_bytes_avx2(int n) { constexpr uint64_t kByteSequence0To7 = 0x0706050403020100ULL; constexpr uint64_t kByteSequence8To15 = 0x0f0e0d0c0b0a0908ULL; @@ -670,7 +668,5 @@ uint32_t KeyCompare::CompareVarBinaryColumnToRow_avx2( return num_rows_to_compare; } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/row/encode_internal.cc b/cpp/src/arrow/compute/row/encode_internal.cc index 3a6a85b0272f8..01d552ef8270f 100644 --- a/cpp/src/arrow/compute/row/encode_internal.cc +++ b/cpp/src/arrow/compute/row/encode_internal.cc @@ -455,7 +455,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, bool is_row_fixed_length = rows.metadata().is_fixed_length; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { DecodeHelper_avx2(is_row_fixed_length, start_row, num_rows, offset_within_row, rows, col); @@ -466,7 +466,7 @@ void EncoderBinary::Decode(uint32_t start_row, uint32_t num_rows, } else { DecodeImp(start_row, num_rows, offset_within_row, rows, col); } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif @@ -524,7 +524,7 @@ void EncoderBinaryPair::Decode(uint32_t start_row, uint32_t num_rows, bool is_row_fixed_length = rows.metadata().is_fixed_length; uint32_t num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2() && col_width1 == col_width2) { num_processed = DecodeHelper_avx2(is_row_fixed_length, col_width1, start_row, num_rows, @@ -772,7 +772,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, KeyColumnArray* col, LightContext* ctx) { // Output column varbinary buffer needs an extra 32B // at the end in avx2 version and 8B otherwise. -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (ctx->has_avx2()) { DecodeHelper_avx2(start_row, num_rows, varbinary_col_id, rows, col); } else { @@ -782,7 +782,7 @@ void EncoderVarBinary::Decode(uint32_t start_row, uint32_t num_rows, } else { DecodeImp(start_row, num_rows, varbinary_col_id, rows, col); } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif } diff --git a/cpp/src/arrow/compute/row/encode_internal.h b/cpp/src/arrow/compute/row/encode_internal.h index b83767b694cfd..6091fb66982af 100644 --- a/cpp/src/arrow/compute/row/encode_internal.h +++ b/cpp/src/arrow/compute/row/encode_internal.h @@ -187,7 +187,7 @@ class EncoderBinary { template static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static void DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col); @@ -213,7 +213,7 @@ class EncoderBinaryPair { static void DecodeImp(uint32_t num_rows_to_skip, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col1, KeyColumnArray* col2); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static uint32_t DecodeHelper_avx2(bool is_row_fixed_length, uint32_t col_width, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, @@ -300,7 +300,7 @@ class EncoderVarBinary { template static void DecodeImp(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const RowTableImpl& rows, KeyColumnArray* col); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) static void DecodeHelper_avx2(uint32_t start_row, uint32_t num_rows, uint32_t varbinary_col_id, const RowTableImpl& rows, KeyColumnArray* col); diff --git a/cpp/src/arrow/compute/row/encode_internal_avx2.cc b/cpp/src/arrow/compute/row/encode_internal_avx2.cc index 02ba310bded20..50969c7bd6034 100644 --- a/cpp/src/arrow/compute/row/encode_internal_avx2.cc +++ b/cpp/src/arrow/compute/row/encode_internal_avx2.cc @@ -22,8 +22,6 @@ namespace arrow { namespace compute { -#if defined(ARROW_HAVE_AVX2) - void EncoderBinary::DecodeHelper_avx2(bool is_row_fixed_length, uint32_t start_row, uint32_t num_rows, uint32_t offset_within_row, const RowTableImpl& rows, KeyColumnArray* col) { @@ -230,7 +228,5 @@ void EncoderVarBinary::DecodeImp_avx2(uint32_t start_row, uint32_t num_rows, }); } -#endif - } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/util.cc b/cpp/src/arrow/compute/util.cc index f69f60a5af434..faf3e0c87e4d2 100644 --- a/cpp/src/arrow/compute/util.cc +++ b/cpp/src/arrow/compute/util.cc @@ -118,7 +118,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, // 64 bits at a time constexpr int unroll = 64; int tail = num_bits % unroll; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { if (filter_input_indexes) { avx2::bits_filter_indexes_avx2(bit_to_search, num_bits - tail, bits, input_indexes, @@ -141,7 +141,7 @@ void bits_to_indexes_internal(int64_t hardware_flags, const int num_bits, bits_to_indexes_helper(word, i * 64 + base_index, num_indexes, indexes); } } -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) } #endif // Optionally process the last partial word with masking out bits outside range @@ -253,7 +253,7 @@ void bits_to_bytes(int64_t hardware_flags, const int num_bits, const uint8_t* bi } int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { // The function call below processes whole 32 bit chunks together. num_processed = num_bits - (num_bits % 32); @@ -309,7 +309,7 @@ void bytes_to_bits(int64_t hardware_flags, const int num_bits, const uint8_t* by } int num_processed = 0; -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { // The function call below processes whole 32 bit chunks together. num_processed = num_bits - (num_bits % 32); @@ -339,7 +339,7 @@ void bytes_to_bits(int64_t hardware_flags, const int num_bits, const uint8_t* by bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, uint32_t num_bytes) { -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) if (hardware_flags & arrow::internal::CpuInfo::AVX2) { return avx2::are_all_bytes_zero_avx2(bytes, num_bytes); } diff --git a/cpp/src/arrow/compute/util.h b/cpp/src/arrow/compute/util.h index 489139eab87f2..730e59f346a52 100644 --- a/cpp/src/arrow/compute/util.h +++ b/cpp/src/arrow/compute/util.h @@ -168,7 +168,7 @@ ARROW_EXPORT void bytes_to_bits(int64_t hardware_flags, const int num_bits, ARROW_EXPORT bool are_all_bytes_zero(int64_t hardware_flags, const uint8_t* bytes, uint32_t num_bytes); -#if defined(ARROW_HAVE_AVX2) +#if defined(ARROW_HAVE_RUNTIME_AVX2) namespace avx2 { ARROW_EXPORT void bits_filter_indexes_avx2(int bit_to_search, const int num_bits, diff --git a/cpp/src/arrow/compute/util_avx2.cc b/cpp/src/arrow/compute/util_avx2.cc index 89ec6aa97a608..0191ab06f9532 100644 --- a/cpp/src/arrow/compute/util_avx2.cc +++ b/cpp/src/arrow/compute/util_avx2.cc @@ -21,9 +21,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/logging.h" -#if defined(ARROW_HAVE_AVX2) - -namespace arrow::util::avx2 { +namespace arrow::util::bit_util::avx2 { template void bits_to_indexes_imp_avx2(const int num_bits, const uint8_t* bits, int* num_indexes, @@ -211,6 +209,4 @@ bool are_all_bytes_zero_avx2(const uint8_t* bytes, uint32_t num_bytes) { return result_or32 == 0; } -} // namespace arrow::util::avx2 - -#endif // ARROW_HAVE_AVX2 +} // namespace arrow::util::bit_util::avx2