diff --git a/.github/workflows/cpp.yml b/.github/workflows/cpp.yml index 1110d359f72c0..63a16c8c114ba 100644 --- a/.github/workflows/cpp.yml +++ b/.github/workflows/cpp.yml @@ -54,7 +54,7 @@ env: jobs: docker: - name: ${{ matrix.title }} ${{ matrix.simd-level }} + name: ${{ matrix.title }} runs-on: ${{ matrix.runs-on }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 75 @@ -68,7 +68,7 @@ jobs: llvm: "14" runs-on: ubuntu-latest simd-level: AVX2 - title: AMD64 Conda C++ + title: AMD64 Conda C++ AVX2 ubuntu: "22.04" - arch: amd64 clang-tools: "14" @@ -226,7 +226,7 @@ jobs: ci/scripts/cpp_test.sh $(pwd) $(pwd)/build windows: - name: AMD64 ${{ matrix.name }} C++17 ${{ matrix.simd-level }} + name: ${{ matrix.title }} runs-on: ${{ matrix.os }} if: ${{ !contains(github.event.pull_request.title, 'WIP') }} timeout-minutes: 60 @@ -237,8 +237,8 @@ jobs: - windows-2019 include: - os: windows-2019 - name: Windows 2019 simd-level: AVX2 + title: AMD64 Windows 2019 C++17 AVX2 env: ARROW_BOOST_USE_SHARED: OFF ARROW_BUILD_BENCHMARKS: ON diff --git a/cpp/src/arrow/acero/bloom_filter_test.cc b/cpp/src/arrow/acero/bloom_filter_test.cc index eadd098cf85cb..95375e277e2b8 100644 --- a/cpp/src/arrow/acero/bloom_filter_test.cc +++ b/cpp/src/arrow/acero/bloom_filter_test.cc @@ -22,19 +22,18 @@ #include #include #include + #include "arrow/acero/bloom_filter.h" #include "arrow/acero/task_util.h" #include "arrow/acero/test_util_internal.h" #include "arrow/acero/util.h" #include "arrow/compute/key_hash.h" #include "arrow/util/bitmap_ops.h" -#include "arrow/util/cpu_info.h" namespace arrow { using compute::Hashing32; using compute::Hashing64; -using internal::CpuInfo; namespace acero { @@ -172,13 +171,7 @@ void TestBloomSmallHashHelper(int64_t num_input_hashes, const T* input_hashes, // Output FPR and build and probe cost. // void TestBloomSmall(BloomFilterBuildStrategy strategy, int64_t num_build, - int num_build_copies, bool use_avx2, bool enable_prefetch) { - int64_t hardware_flags = use_avx2 ? CpuInfo::AVX2 : 0; - if (hardware_flags && !CpuInfo::GetInstance()->IsSupported(hardware_flags)) { - // What else? - return; - } - + int num_build_copies, int64_t hardware_flags, bool enable_prefetch) { // Generate input keys // int64_t num_probe = 4 * num_build; @@ -329,14 +322,8 @@ void TestBloomLargeHashHelper(int64_t hardware_flags, int64_t block, // Test with larger size Bloom filters (use large prime with arithmetic // sequence modulo 2^64). // -void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, bool use_avx2, - bool enable_prefetch) { - int64_t hardware_flags = use_avx2 ? CpuInfo::AVX2 : 0; - if (hardware_flags && !CpuInfo::GetInstance()->IsSupported(hardware_flags)) { - // What else? - return; - } - +void TestBloomLarge(BloomFilterBuildStrategy strategy, int64_t num_build, + int64_t hardware_flags, bool enable_prefetch) { // Largest 63-bit prime constexpr uint64_t prime = 0x7FFFFFFFFFFFFFE7ULL; @@ -467,42 +454,40 @@ TEST(BloomFilter, Basic) { num_build.push_back(1LL << log_large); #endif - constexpr int num_param_sets = 3; - struct { - bool use_avx2; + struct TestParam { + int64_t hardware_flags; bool enable_prefetch; bool insert_multiple_copies; - } params[num_param_sets]; - for (int i = 0; i < num_param_sets; ++i) { - params[i].use_avx2 = (i == 1); - params[i].enable_prefetch = (i == 2); - params[i].insert_multiple_copies = (i == 3); + }; + std::vector test_params; + for (const auto hardware_flags : HardwareFlagsForTesting()) { + test_params.push_back({hardware_flags, false, false}); } + test_params.push_back({0, true, false}); + test_params.push_back({0, false, true}); - std::vector strategy; - strategy.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); + std::vector strategies; + strategies.push_back(BloomFilterBuildStrategy::SINGLE_THREADED); #ifndef ARROW_VALGRIND - strategy.push_back(BloomFilterBuildStrategy::PARALLEL); + strategies.push_back(BloomFilterBuildStrategy::PARALLEL); #endif static constexpr int64_t min_rows_for_large = 2 * 1024 * 1024; - for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) { - for (int iparam_set = 0; iparam_set < num_param_sets; ++iparam_set) { - ARROW_SCOPED_TRACE("%s ", params[iparam_set].use_avx2 ? "AVX2" - : params[iparam_set].enable_prefetch ? "PREFETCH" - : params[iparam_set].insert_multiple_copies ? "FOLDING" - : "REGULAR"); - for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build) { - ARROW_SCOPED_TRACE("num_build ", static_cast(num_build[inum_build])); - if (num_build[inum_build] >= min_rows_for_large) { - TestBloomLarge(strategy[istrategy], num_build[inum_build], - params[iparam_set].use_avx2, params[iparam_set].enable_prefetch); + for (const auto& strategy : strategies) { + for (const auto& test_param : test_params) { + ARROW_SCOPED_TRACE("hardware_flags = ", test_param.hardware_flags, + test_param.enable_prefetch ? " PREFETCH" : "", + test_param.insert_multiple_copies ? " FOLDING" : "REGULAR"); + for (const auto n : num_build) { + ARROW_SCOPED_TRACE("num_build ", n); + if (n >= min_rows_for_large) { + TestBloomLarge(strategy, n, test_param.hardware_flags, + test_param.enable_prefetch); } else { - TestBloomSmall(strategy[istrategy], num_build[inum_build], - params[iparam_set].insert_multiple_copies ? 8 : 1, - params[iparam_set].use_avx2, params[iparam_set].enable_prefetch); + TestBloomSmall(strategy, n, test_param.insert_multiple_copies ? 8 : 1, + test_param.hardware_flags, test_param.enable_prefetch); } } } @@ -515,19 +500,18 @@ TEST(BloomFilter, Scaling) { num_build.push_back(1000000); num_build.push_back(4000000); - std::vector strategy; - strategy.push_back(BloomFilterBuildStrategy::PARALLEL); - - for (bool use_avx2 : {false, true}) { - for (size_t istrategy = 0; istrategy < strategy.size(); ++istrategy) { - for (size_t inum_build = 0; inum_build < num_build.size(); ++inum_build) { - ARROW_SCOPED_TRACE("num_build = ", static_cast(num_build[inum_build])); - ARROW_SCOPED_TRACE("strategy = ", - strategy[istrategy] == BloomFilterBuildStrategy::PARALLEL - ? "PARALLEL" - : "SINGLE_THREADED"); - ARROW_SCOPED_TRACE("avx2 = ", use_avx2 ? "AVX2" : "SCALAR"); - TestBloomLarge(strategy[istrategy], num_build[inum_build], use_avx2, + std::vector strategies; + strategies.push_back(BloomFilterBuildStrategy::PARALLEL); + + for (const auto hardware_flags : HardwareFlagsForTesting()) { + for (const auto& strategy : strategies) { + for (const auto n : num_build) { + ARROW_SCOPED_TRACE("num_build = ", n); + ARROW_SCOPED_TRACE("strategy = ", strategy == BloomFilterBuildStrategy::PARALLEL + ? "PARALLEL" + : "SINGLE_THREADED"); + ARROW_SCOPED_TRACE("hardware_flags = ", hardware_flags); + TestBloomLarge(strategy, n, hardware_flags, /*enable_prefetch=*/false); } } diff --git a/cpp/src/arrow/acero/test_util_internal.cc b/cpp/src/arrow/acero/test_util_internal.cc index 2042650be6acb..f50ca92238dc4 100644 --- a/cpp/src/arrow/acero/test_util_internal.cc +++ b/cpp/src/arrow/acero/test_util_internal.cc @@ -45,8 +45,10 @@ #include "arrow/testing/builder.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/random.h" +#include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/async_generator.h" +#include "arrow/util/cpu_info.h" #include "arrow/util/iterator.h" #include "arrow/util/logging.h" #include "arrow/util/unreachable.h" @@ -54,6 +56,7 @@ namespace arrow { +using arrow::internal::CpuInfo; using arrow::internal::Executor; using compute::SortKey; @@ -62,6 +65,7 @@ using compute::Take; namespace acero { namespace { + void ValidateOutputImpl(const ArrayData& output) { ASSERT_OK(::arrow::internal::ValidateArrayFull(output)); TestInitialized(output); @@ -116,6 +120,11 @@ void ValidateOutput(const Datum& output) { } } +std::vector HardwareFlagsForTesting() { + // Acero currently only has AVX2 optimizations + return arrow::GetSupportedHardwareFlags({CpuInfo::AVX2}); +} + namespace { struct DummyNode : ExecNode { diff --git a/cpp/src/arrow/acero/test_util_internal.h b/cpp/src/arrow/acero/test_util_internal.h index 03f417028650b..569fb1254db4a 100644 --- a/cpp/src/arrow/acero/test_util_internal.h +++ b/cpp/src/arrow/acero/test_util_internal.h @@ -20,6 +20,7 @@ #include "arrow/testing/gtest_util.h" #include "arrow/util/vector.h" +#include #include #include #include @@ -33,12 +34,14 @@ #include "arrow/util/async_generator.h" #include "arrow/util/pcg_random.h" -namespace arrow { - -namespace acero { +namespace arrow::acero { void ValidateOutput(const Datum& output); +// Enumerate all hardware flags that can be tested on this platform +// and would lead to different code paths being tested in Acero. +std::vector HardwareFlagsForTesting(); + using StartProducingFunc = std::function; using StopProducingFunc = std::function; @@ -204,5 +207,4 @@ struct TableGenerationProperties { Result> MakeRandomTimeSeriesTable( const TableGenerationProperties& properties); -} // namespace acero -} // namespace arrow +} // namespace arrow::acero diff --git a/cpp/src/arrow/compute/kernels/CMakeLists.txt b/cpp/src/arrow/compute/kernels/CMakeLists.txt index a17d6275a763a..0bd6fe86134ab 100644 --- a/cpp/src/arrow/compute/kernels/CMakeLists.txt +++ b/cpp/src/arrow/compute/kernels/CMakeLists.txt @@ -18,11 +18,20 @@ # ---------------------------------------------------------------------- # Tests that don't require the full kernel library +# Define arrow_compute_testing object library for common test files +if(ARROW_TESTING) + add_library(arrow_compute_kernels_testing OBJECT test_util.cc) + # Even though this is still just an object library we still need to "link" our + # dependencies so that include paths are configured correctly + target_link_libraries(arrow_compute_kernels_testing ${ARROW_GTEST_GTEST}) +endif() + add_arrow_test(scalar_cast_test ${ARROW_COMPUTE_TEST_ARGS} SOURCES scalar_cast_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) # ---------------------------------------------------------------------- # Scalar kernels @@ -32,25 +41,36 @@ add_arrow_compute_test(scalar_type_test scalar_boolean_test.cc scalar_nested_test.cc scalar_string_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(scalar_if_else_test SOURCES scalar_if_else_test.cc test_util.cc) +add_arrow_compute_test(scalar_if_else_test + SOURCES + scalar_if_else_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(scalar_temporal_test SOURCES scalar_temporal_test.cc test_util.cc) +add_arrow_compute_test(scalar_temporal_test + SOURCES + scalar_temporal_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_compute_test(scalar_math_test SOURCES scalar_arithmetic_test.cc scalar_compare_test.cc scalar_round_arithmetic_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_compute_test(scalar_utility_test SOURCES scalar_random_test.cc scalar_set_lookup_test.cc scalar_validity_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_benchmark(scalar_arithmetic_benchmark PREFIX "arrow-compute") add_arrow_benchmark(scalar_boolean_benchmark PREFIX "arrow-compute") @@ -75,12 +95,20 @@ add_arrow_compute_test(vector_test vector_replace_test.cc vector_run_end_encode_test.cc select_k_test.cc - test_util.cc) + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(vector_sort_test SOURCES vector_sort_test.cc test_util.cc) +add_arrow_compute_test(vector_sort_test + SOURCES + vector_sort_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) -add_arrow_compute_test(vector_selection_test SOURCES vector_selection_test.cc - test_util.cc) +add_arrow_compute_test(vector_selection_test + SOURCES + vector_selection_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) add_arrow_benchmark(vector_hash_benchmark PREFIX "arrow-compute") add_arrow_benchmark(vector_sort_benchmark PREFIX "arrow-compute") @@ -94,7 +122,11 @@ add_arrow_benchmark(vector_selection_benchmark PREFIX "arrow-compute") # Aggregates -add_arrow_compute_test(aggregate_test SOURCES aggregate_test.cc test_util.cc) +add_arrow_compute_test(aggregate_test + SOURCES + aggregate_test.cc + EXTRA_LINK_LIBS + arrow_compute_kernels_testing) # ---------------------------------------------------------------------- # Utilities diff --git a/cpp/src/arrow/compute/key_hash_test.cc b/cpp/src/arrow/compute/key_hash_test.cc index f7c1834030c55..3e6d41525cf44 100644 --- a/cpp/src/arrow/compute/key_hash_test.cc +++ b/cpp/src/arrow/compute/key_hash_test.cc @@ -21,9 +21,11 @@ #include #include #include + #include "arrow/array/builder_binary.h" #include "arrow/compute/key_hash.h" #include "arrow/testing/gtest_util.h" +#include "arrow/testing/util.h" #include "arrow/util/cpu_info.h" #include "arrow/util/pcg_random.h" @@ -34,6 +36,11 @@ using internal::CpuInfo; namespace compute { +std::vector HardwareFlagsForTesting() { + // Our key-hash and key-map routines currently only have AVX2 optimizations + return GetSupportedHardwareFlags({CpuInfo::AVX2}); +} + class TestVectorHash { private: template ::ArrayType> @@ -132,74 +139,62 @@ class TestVectorHash { const offset_t* key_offsets = reinterpret_cast(keys_array->raw_value_offsets()); - std::vector hashes_scalar32; - std::vector hashes_scalar64; - hashes_scalar32.resize(num_rows); - hashes_scalar64.resize(num_rows); - std::vector hashes_simd32; - std::vector hashes_simd64; - hashes_simd32.resize(num_rows); - hashes_simd64.resize(num_rows); - - const int64_t hardware_flags_scalar = 0LL; - const int64_t hardware_flags_simd = CpuInfo::AVX2; - const bool simd_supported = CpuInfo::GetInstance()->IsSupported(hardware_flags_simd); + // For each tested hardware flags, we will compute the hashes and check + // them for consistency. + const auto hardware_flags_for_testing = HardwareFlagsForTesting(); + ASSERT_GT(hardware_flags_for_testing.size(), 0); + std::vector> hashes32(hardware_flags_for_testing.size()); + std::vector> hashes64(hardware_flags_for_testing.size()); + for (auto& h : hashes32) { + h.resize(num_rows); + } + for (auto& h : hashes64) { + h.resize(num_rows); + } constexpr int mini_batch_size = 1024; std::vector temp_buffer; temp_buffer.resize(mini_batch_size * 4); - for (bool use_simd : {false, true}) { - if (use_simd && !simd_supported) { - // XXX what else? - continue; - } + for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + const auto hardware_flags = hardware_flags_for_testing[i]; if (use_32bit_hash) { if (!use_varlen_input) { - Hashing32::HashFixed(use_simd ? hardware_flags_simd : hardware_flags_scalar, + Hashing32::HashFixed(hardware_flags, /*combine_hashes=*/false, num_rows, fixed_length, keys, - use_simd ? hashes_simd32.data() : hashes_scalar32.data(), - temp_buffer.data()); + hashes32[i].data(), temp_buffer.data()); } else { for (int first_row = 0; first_row < num_rows;) { int batch_size_next = std::min(num_rows - first_row, mini_batch_size); - Hashing32::HashVarLen( - use_simd ? hardware_flags_simd : hardware_flags_scalar, - /*combine_hashes=*/false, batch_size_next, key_offsets + first_row, keys, - (use_simd ? hashes_simd32.data() : hashes_scalar32.data()) + first_row, - temp_buffer.data()); + Hashing32::HashVarLen(hardware_flags, + /*combine_hashes=*/false, batch_size_next, + key_offsets + first_row, keys, + hashes32[i].data() + first_row, temp_buffer.data()); first_row += batch_size_next; } } + for (int j = 0; j < num_rows; ++j) { + hashes64[i][j] = hashes32[i][j]; + } } else { if (!use_varlen_input) { Hashing64::HashFixed( - /*combine_hashes=*/false, num_rows, fixed_length, keys, - use_simd ? hashes_simd64.data() : hashes_scalar64.data()); + /*combine_hashes=*/false, num_rows, fixed_length, keys, hashes64[i].data()); } else { Hashing64::HashVarLen( - /*combine_hashes=*/false, num_rows, key_offsets, keys, - use_simd ? hashes_simd64.data() : hashes_scalar64.data()); + /*combine_hashes=*/false, num_rows, key_offsets, keys, hashes64[i].data()); } } } - if (use_32bit_hash) { - for (int i = 0; i < num_rows; ++i) { - hashes_scalar64[i] = hashes_scalar32[i]; - if (simd_supported) { - hashes_simd64[i] = hashes_simd32[i]; - } - } - } - - // Verify that both scalar and AVX2 implementations give the same hashes + // Verify that all implementations (scalar, SIMD) give the same hashes // - if (simd_supported) { - for (int i = 0; i < num_rows; ++i) { - ASSERT_EQ(hashes_scalar64[i], hashes_simd64[i]) + const auto& hashes_scalar64 = hashes64[0]; + for (int i = 0; i < static_cast(hardware_flags_for_testing.size()); ++i) { + for (int j = 0; j < num_rows; ++j) { + ASSERT_EQ(hashes64[i][j], hashes_scalar64[j]) << "scalar and simd approaches yielded different hashes"; } } @@ -208,18 +203,15 @@ class TestVectorHash { // each time. Measure the number of unique hashes and compare to the number // of unique keys. // - std::map unique_key_to_hash; - std::set unique_hashes; + std::unordered_map unique_key_to_hash; + std::unordered_set unique_hashes; for (int i = 0; i < num_rows; ++i) { - std::map::iterator iter = unique_key_to_hash.find(row_ids[i]); - if (iter == unique_key_to_hash.end()) { - unique_key_to_hash.insert(std::make_pair(row_ids[i], hashes_scalar64[i])); - } else { - ASSERT_EQ(iter->second, hashes_scalar64[i]); - } - if (unique_hashes.find(hashes_scalar64[i]) == unique_hashes.end()) { - unique_hashes.insert(hashes_scalar64[i]); + auto [it, inserted] = + unique_key_to_hash.try_emplace(row_ids[i], hashes_scalar64[i]); + if (!inserted) { + ASSERT_EQ(it->second, hashes_scalar64[i]); } + unique_hashes.insert(hashes_scalar64[i]); } float percent_hash_collisions = 100.0f * static_cast(num_unique - unique_hashes.size()) / diff --git a/cpp/src/arrow/testing/util.cc b/cpp/src/arrow/testing/util.cc index b59854480765b..e8a782575e278 100644 --- a/cpp/src/arrow/testing/util.cc +++ b/cpp/src/arrow/testing/util.cc @@ -43,6 +43,7 @@ #include "arrow/table.h" #include "arrow/testing/random.h" #include "arrow/type.h" +#include "arrow/util/cpu_info.h" #include "arrow/util/io_util.h" #include "arrow/util/logging.h" #include "arrow/util/pcg_random.h" @@ -211,4 +212,18 @@ const std::vector>& all_dictionary_index_types() { return types; } +std::vector GetSupportedHardwareFlags( + const std::vector& candidate_flags) { + std::vector hardware_flags; + // Always test fallback codepaths + hardware_flags.push_back(0); + for (const int64_t candidate_flag : candidate_flags) { + if (candidate_flag != 0 && + internal::CpuInfo::GetInstance()->IsSupported(candidate_flag)) { + hardware_flags.push_back(candidate_flag); + } + } + return hardware_flags; +} + } // namespace arrow diff --git a/cpp/src/arrow/testing/util.h b/cpp/src/arrow/testing/util.h index 4f4b03438fd58..b4b2785a36292 100644 --- a/cpp/src/arrow/testing/util.h +++ b/cpp/src/arrow/testing/util.h @@ -131,4 +131,10 @@ ARROW_TESTING_EXPORT std::string GetListenAddress(); ARROW_TESTING_EXPORT const std::vector>& all_dictionary_index_types(); +// Get a list of supported hardware flags from the given candidates. +// The result will always contain 0, meaning no optional CPU feature enabled at all. +ARROW_TESTING_EXPORT +std::vector GetSupportedHardwareFlags( + const std::vector& candidate_flags); + } // namespace arrow