From 4b8e0e1cca2cbc0773771a9f074ec3f46b9270d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cezary=20Skrzy=C5=84ski?= Date: Thu, 22 Sep 2022 16:39:29 +0200 Subject: [PATCH] Port Atomic MinMax tests --- core/perf_test/CMakeLists.txt | 17 + core/perf_test/test_atomic_minmax_simple.cpp | 495 +++++++++++-------- 2 files changed, 306 insertions(+), 206 deletions(-) diff --git a/core/perf_test/CMakeLists.txt b/core/perf_test/CMakeLists.txt index 0fb2419867..6566ca9a76 100644 --- a/core/perf_test/CMakeLists.txt +++ b/core/perf_test/CMakeLists.txt @@ -89,6 +89,14 @@ IF(KOKKOS_ENABLE_TESTS) ENDIF() # Find or download google/benchmark library +IF(NOT Kokkos_ENABLE_BENCHMARKS) + RETURN() +ENDIF() + +IF (KOKKOS_HAS_TRILINOS) + message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") +ENDIF() + find_package(benchmark QUIET) IF(benchmark_FOUND) MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") @@ -209,3 +217,12 @@ KOKKOS_ADD_BENCHMARK( PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES} ) + +IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) + KOKKOS_ADD_BENCHMARK( + Benchmark_Atomic_MinMax + SOURCES + Benchmark_Context.cpp + test_atomic_minmax_simple.cpp + ) +ENDIF() diff --git a/core/perf_test/test_atomic_minmax_simple.cpp b/core/perf_test/test_atomic_minmax_simple.cpp index 4c2ae5c2d1..f4fde65848 100644 --- a/core/perf_test/test_atomic_minmax_simple.cpp +++ b/core/perf_test/test_atomic_minmax_simple.cpp @@ -21,240 +21,323 @@ // core/src/libkokkoscore.a -ldl && OMP_NUM_THREADS=1 // ./test_atomic_minmax_simple.x 10000000 -#include -#include - -#include -#include +#include +#include +#include #include -#include using exec_space = Kokkos::DefaultExecutionSpace; +constexpr int LENGTH = 1000000; + +template +Kokkos::View prepare_input(const int length, const T value) { + Kokkos::View input("input", length); + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { input(i) = value; }); + Kokkos::fence(); + return input; +} + +int get_length(benchmark::State& state) { + return (Test::command_line_num_args() == 2) + ? std::stoi(Test::command_line_arg(1)) + : state.range(0); +} + template -void test(const int length) { +int check_errors_replacement(Kokkos::View view) { + int errors = 0; + Kokkos::parallel_reduce( + view.size(), + KOKKOS_LAMBDA(const int i, int& inner) { inner += (view(i) != (T)i); }, + errors); + Kokkos::fence(); + return errors; +} + +template +double atomic_min_replacement(Kokkos::View input) { + const int length = input.size(); Kokkos::Timer timer; + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_fetch_min(&(input(i)), (T)i); + }); + Kokkos::fence(); + return timer.seconds(); +} - using vector = Kokkos::View; - - vector inp("input", length); - T max = std::numeric_limits::max(); - T min = std::numeric_limits::lowest(); - - // input is max values - all min atomics will replace - { - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { - (void)Kokkos::atomic_fetch_min(&(inp(i)), (T)i); - }); - Kokkos::fence(); - double time = timer.seconds(); - - int errors(0); - Kokkos::parallel_reduce( - length, - KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, - errors); - Kokkos::fence(); - - if (errors) { - std::cerr << "Error in 100% min replacements: " << errors << std::endl; - std::cerr << "inp(0)=" << inp(0) << std::endl; +template +static void Atomic_MinReplacements(benchmark::State& state) { + const int length = get_length(state); + auto inp = prepare_input(length, std::numeric_limits::max()); + + for (auto _ : state) { + const auto time = atomic_min_replacement(inp); + const auto errors = check_errors_replacement(inp); + + // report results + state.SetIterationTime(time); + if (errors > 0) { + state.counters["Errors"] = benchmark::Counter(errors); } - std::cout << "Time for 100% min replacements: " << time << std::endl; } +} + +template +double atomic_max_replacement(Kokkos::View input) { + const int length = input.size(); + Kokkos::Timer timer; + Kokkos::parallel_for( + length, KOKKOS_LAMBDA(const int i) { + (void)Kokkos::atomic_max_fetch(&(input(i)), (T)i); + }); + Kokkos::fence(); + return timer.seconds(); +} - // input is min values - all max atomics will replace - { - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { - (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); - }); - Kokkos::fence(); - double time = timer.seconds(); - - int errors(0); - Kokkos::parallel_reduce( - length, - KOKKOS_LAMBDA(const int i, int& inner) { inner += (inp(i) != (T)i); }, - errors); - Kokkos::fence(); - - if (errors) { - std::cerr << "Error in 100% max replacements: " << errors << std::endl; - std::cerr << "inp(0)=" << inp(0) << std::endl; +template +static void Atomic_MaxReplacements(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(length, std::numeric_limits::lowest()); + + for (auto _ : state) { + const auto time = atomic_max_replacement(inp); + const auto errors = check_errors_replacement(inp); + + // report results + state.SetIterationTime(time); + if (errors > 0) { + state.counters["Errors"] = benchmark::Counter(errors); } - std::cout << "Time for 100% max replacements: " << time << std::endl; } +} - // input is max values - all max atomics will early exit - { - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { - (void)Kokkos::atomic_max_fetch(&(inp(i)), (T)i); - }); - Kokkos::fence(); - double time = timer.seconds(); - - int errors(0); - Kokkos::parallel_reduce( - length, - KOKKOS_LAMBDA(const int i, int& inner) { - T ref = max; - inner += (inp(i) != ref); - }, - errors); - Kokkos::fence(); - - if (errors) { - std::cerr << "Error in 100% max early exits: " << errors << std::endl; - std::cerr << "inp(0)=" << inp(0) << std::endl; +template +int check_errors_early_exit(Kokkos::View view, const T ref) { + int errors = 0; + Kokkos::parallel_reduce( + view.size(), + KOKKOS_LAMBDA(const int i, int& inner) { inner += (view(i) != ref); }, + errors); + Kokkos::fence(); + return errors; +} + +template +static void Atomic_MaxEarlyExits(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(length, std::numeric_limits::max()); + + for (auto _ : state) { + const auto time = atomic_max_replacement(inp); + const auto errors = + check_errors_early_exit(inp, std::numeric_limits::max()); + + // report results + state.SetIterationTime(time); + if (errors > 0) { + state.counters["Errors"] = benchmark::Counter(errors); } - std::cout << "Time for 100% max early exits: " << time << std::endl; } +} - // input is min values - all min atomics will early exit - { - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); - Kokkos::fence(); - - timer.reset(); - Kokkos::parallel_for( - length, KOKKOS_LAMBDA(const int i) { - (void)Kokkos::atomic_min_fetch(&(inp(i)), (T)i); - }); - Kokkos::fence(); - double time = timer.seconds(); - - int errors(0); - Kokkos::parallel_reduce( - length, - KOKKOS_LAMBDA(const int i, int& inner) { - T ref = min; - inner += (inp(i) != ref); - }, - errors); - Kokkos::fence(); - - if (errors) { - std::cerr << "Error in 100% min early exits: " << errors << std::endl; - std::cerr << "inp(0)=" << inp(0) << std::endl; - if (length > 9) std::cout << "inp(9)=" << inp(9) << std::endl; +template +static void Atomic_MinEarlyExits(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(length, std::numeric_limits::lowest()); + + for (auto _ : state) { + const auto time = atomic_min_replacement(inp); + const auto errors = + check_errors_early_exit(inp, std::numeric_limits::lowest()); + + // report results + state.SetIterationTime(time); + if (errors > 0) { + state.counters["Errors"] = benchmark::Counter(errors); } - std::cout << "Time for 100% min early exits: " << time << std::endl; } +} - // limit iterations for contentious test, takes ~50x longer for same length - auto con_length = length / 5; - // input is min values - some max atomics will replace - { - Kokkos::parallel_for( - 1, KOKKOS_LAMBDA(const int i) { inp(i) = min; }); - Kokkos::fence(); - - T current(0); - timer.reset(); - Kokkos::parallel_reduce( - con_length, - KOKKOS_LAMBDA(const int i, T& inner) { - inner = Kokkos::atomic_max_fetch(&(inp(0)), inner + 1); - if (i == con_length - 1) { - Kokkos::atomic_max_fetch(&(inp(0)), max); - inner = max; - } - }, - Kokkos::Max(current)); - Kokkos::fence(); - double time = timer.seconds(); - - if (current < max) { - std::cerr << "Error in contentious max replacements: " << std::endl; - std::cerr << "final=" << current << " inp(0)=" << inp(0) << " max=" << max - << std::endl; - } - std::cout << "Time for contentious max " << con_length - << " replacements: " << time << std::endl; +template +void report_errors_contentious_replacement(benchmark::State& state, + const T final, const T first, + const T expected) { + state.counters["Errors"] = benchmark::Counter(1); + state.counters["Final"] = benchmark::Counter(final); + state.counters["First"] = benchmark::Counter(first); + state.counters["Expected"] = benchmark::Counter(expected); +} + +template +double atomic_contentious_max_replacement(benchmark::State& state, + Kokkos::View input, + const int con_length) { + const auto max = std::numeric_limits::max(); + T current = 0; + + Kokkos::Timer timer; + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner = Kokkos::atomic_max_fetch(&(input(0)), inner + 1); + if (i == con_length - 1) { + Kokkos::atomic_max_fetch(&(input(0)), max); + inner = max; + } + }, + Kokkos::Max(current)); + Kokkos::fence(); + const auto time = timer.seconds(); + + if (current < max) { + report_errors_contentious_replacement(state, current, input(0), max); } - // input is max values - some min atomics will replace - { - Kokkos::parallel_for( - 1, KOKKOS_LAMBDA(const int i) { inp(i) = max; }); - Kokkos::fence(); - - timer.reset(); - T current(100000000); - Kokkos::parallel_reduce( - con_length, - KOKKOS_LAMBDA(const int i, T& inner) { - inner = Kokkos::atomic_min_fetch(&(inp(0)), inner - 1); - if (i == con_length - 1) { - Kokkos::atomic_min_fetch(&(inp(0)), min); - inner = min; - } - }, - Kokkos::Min(current)); - Kokkos::fence(); - double time = timer.seconds(); - - if (current > min) { - std::cerr << "Error in contentious min replacements: " << std::endl; - std::cerr << "final=" << current << " inp(0)=" << inp(0) << " min=" << min - << std::endl; - } - std::cout << "Time for contentious min " << con_length - << " replacements: " << time << std::endl; + return time; +} + +template +static void Atomic_ContentiousMaxReplacements(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(1, std::numeric_limits::lowest()); + + for (auto _ : state) { + const auto time = atomic_contentious_max_replacement(state, inp, length); + + state.SetIterationTime(time); } } -int main(int argc, char* argv[]) { - Kokkos::initialize(argc, argv); - { - int length = 1000000; - if (argc == 2) { - length = std::stoi(argv[1]); - } +template +double atomic_contentious_min_replacement(benchmark::State& state, + Kokkos::View input, + const int con_length) { + const auto min = std::numeric_limits::lowest(); + T current = 0; - if (length < 1) { - throw std::invalid_argument(""); - } + Kokkos::Timer timer; + Kokkos::parallel_reduce( + con_length, + KOKKOS_LAMBDA(const int i, T& inner) { + inner = Kokkos::atomic_min_fetch(&(input(0)), inner - 1); + if (i == con_length - 1) { + Kokkos::atomic_min_fetch(&(input(0)), min); + inner = min; + } + }, + Kokkos::Min(current)); + Kokkos::fence(); + const auto time = timer.seconds(); + + if (current > min) { + report_errors_contentious_replacement(state, current, input(0), min); + } + + return time; +} - std::cout << "================ int" << std::endl; - test(length); - std::cout << "================ long" << std::endl; - test(length); - std::cout << "================ long long" << std::endl; - test(length); - - std::cout << "================ unsigned int" << std::endl; - test(length); - std::cout << "================ unsigned long" << std::endl; - test(length); - std::cout << "================ unsigned long long" << std::endl; - test(length); - - std::cout << "================ float" << std::endl; - test(length); - std::cout << "================ double" << std::endl; - test(length); +template +static void Atomic_ContentiousMinReplacements(benchmark::State& state) { + const auto length = get_length(state); + auto inp = prepare_input(1, std::numeric_limits::max()); + + for (auto _ : state) { + const auto time = atomic_contentious_max_replacement(state, inp, length); + + state.SetIterationTime(time); } +} + +BENCHMARK(Atomic_MinReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MaxEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_MinEarlyExits) + ->ArgName("Length") + ->Arg(LENGTH) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMaxReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +BENCHMARK(Atomic_ContentiousMinReplacements) + ->ArgName("Length") + ->Arg(LENGTH / 5) + ->UseManualTime() + ->Iterations(10); + +// FIXME: duplicated +namespace Test { +int command_line_num_args(int n) { + static int n_args = 0; + if (n > 0) n_args = n; + return n_args; +} + +const char* command_line_arg(int k, char** input_args) { + static char** args; + if (input_args != nullptr) args = input_args; + if (command_line_num_args() > k) + return args[k]; + else + return nullptr; +} +} // namespace Test + +int main(int argc, char* argv[]) { + Kokkos::initialize(argc, argv); + benchmark::Initialize(&argc, argv); + benchmark::SetDefaultTimeUnit(benchmark::kSecond); + KokkosBenchmark::add_benchmark_context(true); + + (void)Test::command_line_num_args(argc); + (void)Test::command_line_arg(0, argv); + + benchmark::RunSpecifiedBenchmarks(); + + // std::cout << "================ int" << std::endl; + // test(length); + // std::cout << "================ long" << std::endl; + // test(length); + // std::cout << "================ long long" << std::endl; + // test(length); + + // std::cout << "================ unsigned int" << std::endl; + // test(length); + // std::cout << "================ unsigned long" << std::endl; + // test(length); + // std::cout << "================ unsigned long long" << std::endl; + // test(length); + + // std::cout << "================ float" << std::endl; + // test(length); + // std::cout << "================ double" << std::endl; + // test(length); + // } + + benchmark::Shutdown(); Kokkos::finalize(); return 0; }