From 056f1521c48684de6a2716965545dd6daa9a9991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Tue, 11 Jul 2023 19:22:36 +0200 Subject: [PATCH 1/7] Fix OMP col reduction kernel The kernel requested more memory than necessary in most scenarios because of a faulty temporary storage estimation. --- omp/base/kernel_launch_reduction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index d8d081e323b..a46ce970421 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -327,7 +327,7 @@ void run_kernel_col_reduction_sized_impl( const auto reduction_size = ceildiv(reduction_kernel_oversubscription * num_threads, cols); const auto rows_per_thread = ceildiv(rows, reduction_size); - const auto required_storage = sizeof(ValueType) * rows * reduction_size; + const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } From 75a4e14ea780e976ad394e574c096cdfb94084f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Thu, 13 Jul 2023 16:01:44 +0200 Subject: [PATCH 2/7] Add specific tests for OMP reductions --- test/base/kernel_launch_generic.cpp | 168 ++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 3dd1570c5f8..cf07f867c82 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include #include #include @@ -364,6 +365,39 @@ void run1d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } +void run1d_reduction_cached(std::shared_ptr exec, + std::vector sizes) +{ + gko::array output{exec, 1}; + gko::array temp(exec); + for (const auto& size : sizes) { + temp.clear(); + gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + size, temp); + + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + static_cast(size)); + // The temporary storage (used for partial sums) must be smaller than + // the input array + ASSERT_LT(temp.get_num_elems() / sizeof(int64), size); + } +} + +TEST_F(KernelLaunch, Reduction1DCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run1d_reduction_cached(exec, {1000, 1000000, 1234567, 7654321}); +} + + void run2d_reduction(std::shared_ptr exec) { gko::array output{exec, {-1l}}; @@ -432,6 +466,47 @@ void run2d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } +void run2d_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + gko::array output{exec, 1}; + gko::array temp(exec); + for (const auto& dim : dims) { + temp.clear(); + gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + dim, temp); + + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + static_cast(dim[0] + dim[1])); + // The temporary storage (used for 
partial sums) must be smaller than + // the input array + ASSERT_LT(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + } +} + +TEST_F(KernelLaunch, Reduction2DCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} + + void run2d_row_reduction(std::shared_ptr exec) { for (auto num_rows : {0, 100, 1000, 10000}) { @@ -481,6 +556,53 @@ void run2d_row_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } +void run2d_row_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + // This assumes at most 256 OpenMP Threads + constexpr int64_t max_tmp_elems = 4 * 256; + const size_type result_stride = 1; + gko::array temp(exec); + for (const auto& dim : dims) { + gko::array host_ref{exec->get_master(), dim[0]}; + gko::array output{exec, host_ref}; + temp.clear(); + for (int64 i = 0; i < host_ref.get_num_elems(); ++i) { + host_ref.get_data()[i] = dim[1] + i + 1; + } + + gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + result_stride, dim, temp); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), + max_tmp_elems * max_tmp_elems); + } +} + +TEST_F(KernelLaunch, ReductionRowCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_row_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} + + void run2d_col_reduction(std::shared_ptr exec) { // empty, most threads idle, 
most threads busy, multiple blocks @@ -530,3 +652,49 @@ void run2d_col_reduction(std::shared_ptr exec) } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + +void run2d_col_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + gko::array temp(exec); + for (const auto& dim : dims) { + gko::array host_ref{exec->get_master(), dim[1]}; + gko::array output{exec, host_ref}; + temp.clear(); + for (int64 i = 0; i < host_ref.get_num_elems(); ++i) { + host_ref.get_data()[i] = dim[0] + i + 1; + } + + gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + dim, temp); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + // This assumes at most 256 OpenMP Threads + const size_type temp_elem_limit = + std::max(size_type{4 * 256}, dim[0] * dim[1]); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); + } +} + +TEST_F(KernelLaunch, ReductionColCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_col_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} From 288546d664079ca7f5b342431bcb83401a046790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Fri, 21 Jul 2023 15:26:07 +0200 Subject: [PATCH 3/7] Update reduction tests to all scale with size --- test/base/kernel_launch_generic.cpp | 68 +++++++++++++---------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index cf07f867c82..57bab96d9c0 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -373,11 +373,7 @@ void 
run1d_reduction_cached(std::shared_ptr exec, for (const auto& size : sizes) { temp.clear(); gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( - exec, - [] GKO_KERNEL(auto i) { - static_assert(is_same::value, "index"); - return i + 1; - }, + exec, [] GKO_KERNEL(auto i) { return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), size, temp); @@ -469,17 +465,13 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_reduction_cached(std::shared_ptr exec, std::vector> dims) { + constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array output{exec, 1}; gko::array temp(exec); for (const auto& dim : dims) { temp.clear(); gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), dim, temp); @@ -487,19 +479,23 @@ void run2d_reduction_cached(std::shared_ptr exec, ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than - // the input array - ASSERT_LT(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + // the input array (or smaller than a set minimum) + const size_type max_tmp_elems = + std::max(dim[0] * dim[1], min_allowed_tmp_elems); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); } } TEST_F(KernelLaunch, Reduction2DCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, @@ -559,8 +555,9 @@ 
TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_row_reduction_cached(std::shared_ptr exec, std::vector> dims) { - // This assumes at most 256 OpenMP Threads - constexpr int64_t max_tmp_elems = 4 * 256; + // The 2D row reduction potentially needs a lot of memory for small input + // sizes + constexpr size_type min_allowed_tmp_elems = 4 * 256 * 4 * 256; const size_type result_stride = 1; gko::array temp(exec); for (const auto& dim : dims) { @@ -572,30 +569,30 @@ void run2d_row_reduction_cached(std::shared_ptr exec, } gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), result_stride, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), - max_tmp_elems * max_tmp_elems); + // The temporary storage (used for partial sums) must be smaller than + // the input array (or smaller than a set minimum) + const size_type max_tmp_elems = + std::max(dim[0] * dim[1], min_allowed_tmp_elems); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); } } TEST_F(KernelLaunch, ReductionRowCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_row_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, @@ -657,6 +654,7 @@ TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } void run2d_col_reduction_cached(std::shared_ptr exec, std::vector> dims) { + constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array temp(exec); for (const auto& dim : dims) { gko::array host_ref{exec->get_master(), 
dim[1]}; @@ -667,32 +665,28 @@ void run2d_col_reduction_cached(std::shared_ptr exec, } gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - // This assumes at most 256 OpenMP Threads const size_type temp_elem_limit = - std::max(size_type{4 * 256}, dim[0] * dim[1]); + std::max(min_allowed_tmp_elems, dim[0] * dim[1]); ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); } } TEST_F(KernelLaunch, ReductionColCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_col_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, From 17b50ce6d563c41ea4fbc70862e38ec119196c00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Tue, 25 Jul 2023 18:02:46 +0200 Subject: [PATCH 4/7] Change OMP reduction implementation At most allocate as much as the input vector for OMP reductions. --- omp/base/kernel_launch_reduction.hpp | 45 +++++++++++++++++----------- test/base/kernel_launch_generic.cpp | 27 +++++------------ 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index a46ce970421..5dfbd5ba6c0 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -62,8 +62,9 @@ void run_kernel_reduction_impl(std::shared_ptr exec, ValueType* result, size_type size, array& tmp, MappedKernelArgs... 
args) { - const auto num_threads = static_cast(omp_get_max_threads()); const auto ssize = static_cast(size); + // Limit the number of threads to the number of columns + const auto num_threads = std::min(omp_get_max_threads(), ssize); const auto work_per_thread = ceildiv(ssize, num_threads); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { @@ -82,8 +83,8 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } partial[thread_id] = local_partial; } - *result = - finalize(std::accumulate(partial, partial + num_threads, identity, op)); + *result = finalize(std::accumulate( + partial, partial + required_storage / sizeof(ValueType), identity, op)); } @@ -99,7 +100,8 @@ void run_kernel_reduction_sized_impl(syn::value_list, { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + // Limit the number of threads to the number of columns + const auto num_threads = std::min(omp_get_max_threads(), rows); const auto work_per_thread = ceildiv(rows, num_threads); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { @@ -109,7 +111,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, static_assert(remainder_cols < block_size, "remainder too large"); const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + remainder_cols == cols); -#pragma omp parallel +#pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); const auto begin = thread_id * work_per_thread; @@ -147,8 +149,8 @@ void run_kernel_reduction_sized_impl(syn::value_list, } partial[thread_id] = local_partial; } - *result = - finalize(std::accumulate(partial, partial + num_threads, identity, op)); + *result = finalize(std::accumulate( + partial, partial + required_storage / sizeof(ValueType), identity, op)); } 
GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -210,12 +212,12 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, constexpr int block_size = 8; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + const auto available_threads = static_cast(omp_get_max_threads()); if (rows <= 0) { return; } // enough work to keep all threads busy or only very small reduction sizes - if (rows >= reduction_kernel_oversubscription * num_threads || + if (rows >= reduction_kernel_oversubscription * available_threads || cols < rows) { #pragma omp parallel for for (int64 row = 0; row < rows; row++) { @@ -229,8 +231,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, } } else { // small number of rows and large reduction sizes: do partial sum first + const auto num_threads = std::min(available_threads, cols); const auto work_per_thread = ceildiv(cols, num_threads); - const auto required_storage = sizeof(ValueType) * rows * num_threads; + const auto temp_elems_per_row = num_threads; + const auto required_storage = + sizeof(ValueType) * rows * temp_elems_per_row; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -247,7 +252,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, return fn(row, col, args...); }()); } - partial[row * num_threads + thread_id] = local_partial; + partial[row * temp_elems_per_row + thread_id] = local_partial; } } // then accumulate the partial sums and write to result @@ -255,10 +260,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, for (int64 row = 0; row < rows; row++) { [&] { auto local_partial = identity; - for (int64 thread_id = 0; thread_id < num_threads; + for (int64 thread_id = 0; thread_id < temp_elems_per_row; thread_id++) { - local_partial = op(local_partial, - partial[row * num_threads + thread_id]); + local_partial = + op(local_partial, + partial[row * 
temp_elems_per_row + thread_id]); } result[row * result_stride] = finalize(local_partial); }(); @@ -302,12 +308,12 @@ void run_kernel_col_reduction_sized_impl( { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + const auto available_threads = static_cast(omp_get_max_threads()); static_assert(remainder_cols < block_size, "remainder too large"); GKO_ASSERT(cols % block_size == remainder_cols); const auto num_col_blocks = ceildiv(cols, block_size); // enough work to keep all threads busy or only very small reduction sizes - if (cols >= reduction_kernel_oversubscription * num_threads || + if (cols >= reduction_kernel_oversubscription * available_threads || rows < cols) { #pragma omp parallel for for (int64 col_block = 0; col_block < num_col_blocks; col_block++) { @@ -324,8 +330,11 @@ void run_kernel_col_reduction_sized_impl( } } else { // number of blocks that need to be reduced afterwards - const auto reduction_size = - ceildiv(reduction_kernel_oversubscription * num_threads, cols); + // This reduction_size definition ensures we don't use more temporary + // storage than the input vector + const auto reduction_size = std::min( + rows, ceildiv(reduction_kernel_oversubscription * available_threads, + cols)); const auto rows_per_thread = ceildiv(rows, reduction_size); const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 57bab96d9c0..bc4119d2806 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -382,15 +382,13 @@ void run1d_reduction_cached(std::shared_ptr exec, static_cast(size)); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LT(temp.get_num_elems() / sizeof(int64), size); + ASSERT_LE(temp.get_num_elems() / sizeof(int64), 
size); } } TEST_F(KernelLaunch, Reduction1DCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores - run1d_reduction_cached(exec, {1000, 1000000, 1234567, 7654321}); + run1d_reduction_cached(exec, {10, 1000, 1000000, 1234567, 7654321}); } @@ -465,7 +463,6 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_reduction_cached(std::shared_ptr exec, std::vector> dims) { - constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array output{exec, 1}; gko::array temp(exec); for (const auto& dim : dims) { @@ -479,10 +476,8 @@ void run2d_reduction_cached(std::shared_ptr exec, ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than - // the input array (or smaller than a set minimum) - const size_type max_tmp_elems = - std::max(dim[0] * dim[1], min_allowed_tmp_elems); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); + // the input array + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } @@ -555,9 +550,6 @@ TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_row_reduction_cached(std::shared_ptr exec, std::vector> dims) { - // The 2D row reduction potentially needs a lot of memory for small input - // sizes - constexpr size_type min_allowed_tmp_elems = 4 * 256 * 4 * 256; const size_type result_stride = 1; gko::array temp(exec); for (const auto& dim : dims) { @@ -576,10 +568,8 @@ void run2d_row_reduction_cached(std::shared_ptr exec, GKO_ASSERT_ARRAY_EQ(host_ref, output); // The temporary storage (used for partial sums) must be smaller than - // the input array (or smaller than a set minimum) - const size_type max_tmp_elems = - std::max(dim[0] * dim[1], min_allowed_tmp_elems); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); + // the input array + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } @@ -654,7 
+644,6 @@ TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } void run2d_col_reduction_cached(std::shared_ptr exec, std::vector> dims) { - constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array temp(exec); for (const auto& dim : dims) { gko::array host_ref{exec->get_master(), dim[1]}; @@ -671,9 +660,7 @@ void run2d_col_reduction_cached(std::shared_ptr exec, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - const size_type temp_elem_limit = - std::max(min_allowed_tmp_elems, dim[0] * dim[1]); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } From 792edb90e3202db203b9939c661f4597bd392e41 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:14:33 +0200 Subject: [PATCH 5/7] fix warning --- include/ginkgo/core/base/range.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 5ba07aa834f..1e4c7a5d00e 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -864,7 +864,7 @@ GKO_BIND_UNARY_RANGE_OPERATION_TO_OPERATOR(transpose_operation, transpose); #define GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(_deprecated_name, _name) \ - struct [[deprecated("Please use " #_name)]] _deprecated_name : _name{}; + struct [[deprecated("Please use " #_name)]] _deprecated_name : _name {} #define GKO_DEFINE_SIMPLE_BINARY_OPERATION(_name, ...) 
\ struct _name { \ From ee4cd1773d5f63a641d41a558834b8d6a59ac492 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:15:03 +0200 Subject: [PATCH 6/7] fix divisions by zero and num_threads == 0 --- omp/base/kernel_launch_reduction.hpp | 112 +++++++++++++++------------ 1 file changed, 62 insertions(+), 50 deletions(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 5dfbd5ba6c0..ef57803ad31 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -65,7 +65,8 @@ void run_kernel_reduction_impl(std::shared_ptr exec, const auto ssize = static_cast(size); // Limit the number of threads to the number of columns const auto num_threads = std::min(omp_get_max_threads(), ssize); - const auto work_per_thread = ceildiv(ssize, num_threads); + const auto work_per_thread = + ceildiv(ssize, std::max(num_threads, 1)); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); @@ -74,17 +75,20 @@ void run_kernel_reduction_impl(std::shared_ptr exec, #pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); - const auto begin = thread_id * work_per_thread; - const auto end = std::min(ssize, begin + work_per_thread); + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(ssize, begin + work_per_thread); - auto local_partial = identity; - for (auto i = begin; i < end; i++) { - local_partial = op(local_partial, fn(i, map_to_device(args)...)); + auto local_partial = identity; + for (auto i = begin; i < end; i++) { + local_partial = + op(local_partial, fn(i, map_to_device(args)...)); + } + partial[thread_id] = local_partial; } - partial[thread_id] = local_partial; } - *result = finalize(std::accumulate( - partial, partial + required_storage / sizeof(ValueType), identity, op)); + *result = + 
finalize(std::accumulate(partial, partial + num_threads, identity, op)); } @@ -102,7 +106,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, const auto cols = static_cast(size[1]); // Limit the number of threads to the number of columns const auto num_threads = std::min(omp_get_max_threads(), rows); - const auto work_per_thread = ceildiv(rows, num_threads); + const auto work_per_thread = ceildiv(rows, std::max(num_threads, 1)); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); @@ -114,43 +118,46 @@ void run_kernel_reduction_sized_impl(syn::value_list, #pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); - const auto begin = thread_id * work_per_thread; - const auto end = std::min(rows, begin + work_per_thread); - - auto local_partial = identity; - if (rounded_cols == 0 || cols == block_size) { - // we group all sizes <= block_size here and unroll explicitly - constexpr auto local_cols = - remainder_cols == 0 ? block_size : remainder_cols; - for (auto row = begin; row < end; row++) { -#pragma unroll - for (int64 col = 0; col < local_cols; col++) { - local_partial = op(local_partial, fn(row, col, args...)); - } - } - } else { - // we operate in block_size blocks plus an explicitly unrolled - // remainder - for (auto row = begin; row < end; row++) { - for (int64 base_col = 0; base_col < rounded_cols; - base_col += block_size) { + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(rows, begin + work_per_thread); + + auto local_partial = identity; + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? 
block_size : remainder_cols; + for (auto row = begin; row < end; row++) { #pragma unroll - for (int64 i = 0; i < block_size; i++) { + for (int64 col = 0; col < local_cols; col++) { local_partial = - op(local_partial, fn(row, base_col + i, args...)); + op(local_partial, fn(row, col, args...)); } } + } else { + // we operate in block_size blocks plus an explicitly unrolled + // remainder + for (auto row = begin; row < end; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { #pragma unroll - for (int64 i = 0; i < remainder_cols; i++) { - local_partial = - op(local_partial, fn(row, rounded_cols + i, args...)); + for (int64 i = 0; i < block_size; i++) { + local_partial = op(local_partial, + fn(row, base_col + i, args...)); + } + } +#pragma unroll + for (int64 i = 0; i < remainder_cols; i++) { + local_partial = op(local_partial, + fn(row, rounded_cols + i, args...)); + } } } + partial[thread_id] = local_partial; } - partial[thread_id] = local_partial; } - *result = finalize(std::accumulate( - partial, partial + required_storage / sizeof(ValueType), identity, op)); + *result = + finalize(std::accumulate(partial, partial + num_threads, identity, op)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -232,7 +239,8 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, } else { // small number of rows and large reduction sizes: do partial sum first const auto num_threads = std::min(available_threads, cols); - const auto work_per_thread = ceildiv(cols, num_threads); + const auto work_per_thread = + ceildiv(cols, std::max(num_threads, 1)); const auto temp_elems_per_row = num_threads; const auto required_storage = sizeof(ValueType) * rows * temp_elems_per_row; @@ -243,16 +251,19 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, #pragma omp parallel num_threads(num_threads) { const auto thread_id = static_cast(omp_get_thread_num()); - const auto begin = thread_id * work_per_thread; - const auto end = 
std::min(begin + work_per_thread, cols); - for (int64 row = 0; row < rows; row++) { - auto local_partial = identity; - for (int64 col = begin; col < end; col++) { - local_partial = op(local_partial, [&]() { - return fn(row, col, args...); - }()); + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(begin + work_per_thread, cols); + for (int64 row = 0; row < rows; row++) { + auto local_partial = identity; + for (int64 col = begin; col < end; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + partial[row * temp_elems_per_row + thread_id] = + local_partial; } - partial[row * temp_elems_per_row + thread_id] = local_partial; } } // then accumulate the partial sums and write to result @@ -334,8 +345,9 @@ void run_kernel_col_reduction_sized_impl( // storage than the input vector const auto reduction_size = std::min( rows, ceildiv(reduction_kernel_oversubscription * available_threads, - cols)); - const auto rows_per_thread = ceildiv(rows, reduction_size); + std::max(cols, 1))); + const auto rows_per_thread = + ceildiv(rows, std::max(reduction_size, 1)); const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); From a3b2be05739c3ccd24dac5709ac233e20d8ebf29 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:15:16 +0200 Subject: [PATCH 7/7] simplify size calculations --- test/base/kernel_launch_generic.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index bc4119d2806..d4a0f83c819 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -382,7 +382,7 @@ void run1d_reduction_cached(std::shared_ptr exec, static_cast(size)); // The temporary storage (used for partial sums) must be smaller than // the input array - 
ASSERT_LE(temp.get_num_elems() / sizeof(int64), size); + ASSERT_LE(temp.get_num_elems(), size * sizeof(int64)); } } @@ -477,7 +477,7 @@ void run2d_reduction_cached(std::shared_ptr exec, static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } } @@ -569,7 +569,7 @@ void run2d_row_reduction_cached(std::shared_ptr exec, GKO_ASSERT_ARRAY_EQ(host_ref, output); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } } @@ -660,7 +660,7 @@ void run2d_col_reduction_cached(std::shared_ptr exec, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } }