From 056f1521c48684de6a2716965545dd6daa9a9991 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Tue, 11 Jul 2023 19:22:36 +0200 Subject: [PATCH 1/7] Fix OMP col reduction kernel The kernel requested more memory than necessary in most scenarios because of a faulty temporary storage estimation. --- omp/base/kernel_launch_reduction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index d8d081e323b..a46ce970421 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -327,7 +327,7 @@ void run_kernel_col_reduction_sized_impl( const auto reduction_size = ceildiv(reduction_kernel_oversubscription * num_threads, cols); const auto rows_per_thread = ceildiv(rows, reduction_size); - const auto required_storage = sizeof(ValueType) * rows * reduction_size; + const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } From 75a4e14ea780e976ad394e574c096cdfb94084f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Thu, 13 Jul 2023 16:01:44 +0200 Subject: [PATCH 2/7] Add specific tests for OMP reductions --- test/base/kernel_launch_generic.cpp | 168 ++++++++++++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 3dd1570c5f8..cf07f867c82 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -33,6 +33,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "common/unified/base/kernel_launch.hpp" +#include #include #include @@ -364,6 +365,39 @@ void run1d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction1D) { run1d_reduction(exec); } +void run1d_reduction_cached(std::shared_ptr exec, + std::vector sizes) +{ + gko::array output{exec, 1}; + gko::array temp(exec); + for (const auto& size : sizes) { + temp.clear(); + gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + exec, + [] GKO_KERNEL(auto i) { + static_assert(is_same::value, "index"); + return i + 1; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + size, temp); + + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + static_cast(size)); + // The temporary storage (used for partial sums) must be smaller than + // the input array + ASSERT_LT(temp.get_num_elems() / sizeof(int64), size); + } +} + +TEST_F(KernelLaunch, Reduction1DCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run1d_reduction_cached(exec, {1000, 1000000, 1234567, 7654321}); +} + + void run2d_reduction(std::shared_ptr exec) { gko::array output{exec, {-1l}}; @@ -432,6 +466,47 @@ void run2d_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } +void run2d_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + gko::array output{exec, 1}; + gko::array temp(exec); + for (const auto& dim : dims) { + temp.clear(); + gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + dim, temp); + + ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), + static_cast(dim[0] + dim[1])); + // The temporary storage (used for 
partial sums) must be smaller than + // the input array + ASSERT_LT(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + } +} + +TEST_F(KernelLaunch, Reduction2DCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} + + void run2d_row_reduction(std::shared_ptr exec) { for (auto num_rows : {0, 100, 1000, 10000}) { @@ -481,6 +556,53 @@ void run2d_row_reduction(std::shared_ptr exec) TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } +void run2d_row_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + // This assumes at most 256 OpenMP Threads + constexpr int64_t max_tmp_elems = 4 * 256; + const size_type result_stride = 1; + gko::array temp(exec); + for (const auto& dim : dims) { + gko::array host_ref{exec->get_master(), dim[0]}; + gko::array output{exec, host_ref}; + temp.clear(); + for (int64 i = 0; i < host_ref.get_num_elems(); ++i) { + host_ref.get_data()[i] = dim[1] + i + 1; + } + + gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + result_stride, dim, temp); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), + max_tmp_elems * max_tmp_elems); + } +} + +TEST_F(KernelLaunch, ReductionRowCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_row_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} + + void run2d_col_reduction(std::shared_ptr exec) { // empty, most threads idle, 
most threads busy, multiple blocks @@ -530,3 +652,49 @@ void run2d_col_reduction(std::shared_ptr exec) } TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } + + +void run2d_col_reduction_cached(std::shared_ptr exec, + std::vector> dims) +{ + gko::array temp(exec); + for (const auto& dim : dims) { + gko::array host_ref{exec->get_master(), dim[1]}; + gko::array output{exec, host_ref}; + temp.clear(); + for (int64 i = 0; i < host_ref.get_num_elems(); ++i) { + host_ref.get_data()[i] = dim[0] + i + 1; + } + + gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j) { + static_assert(is_same::value, "index"); + static_assert(is_same::value, "index"); + return i + j + 2; + }, + [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, + [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), + dim, temp); + + GKO_ASSERT_ARRAY_EQ(host_ref, output); + // This assumes at most 256 OpenMP Threads + const size_type temp_elem_limit = + std::max(size_type{4 * 256}, dim[0] * dim[1]); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); + } +} + +TEST_F(KernelLaunch, ReductionColCached) +{ + // Note: Start with at least 200 elements in case the machine has a lot of + // cores + run2d_col_reduction_cached(exec, {{20, 10}, + {10, 3000}, + {1000, 5}, + {30, 50}, + {1, 100000}, + {100000, 1}, + {500000, 20}, + {20, 500000}}); +} From 288546d664079ca7f5b342431bcb83401a046790 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Fri, 21 Jul 2023 15:26:07 +0200 Subject: [PATCH 3/7] Update reduction tests to all scale with size --- test/base/kernel_launch_generic.cpp | 68 +++++++++++++---------------- 1 file changed, 31 insertions(+), 37 deletions(-) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index cf07f867c82..57bab96d9c0 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -373,11 +373,7 @@ void 
run1d_reduction_cached(std::shared_ptr exec, for (const auto& size : sizes) { temp.clear(); gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( - exec, - [] GKO_KERNEL(auto i) { - static_assert(is_same::value, "index"); - return i + 1; - }, + exec, [] GKO_KERNEL(auto i) { return i + 1; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), size, temp); @@ -469,17 +465,13 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_reduction_cached(std::shared_ptr exec, std::vector> dims) { + constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array output{exec, 1}; gko::array temp(exec); for (const auto& dim : dims) { temp.clear(); gko::kernels::EXEC_NAMESPACE::run_kernel_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), dim, temp); @@ -487,19 +479,23 @@ void run2d_reduction_cached(std::shared_ptr exec, ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than - // the input array - ASSERT_LT(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + // the input array (or smaller than a set minimum) + const size_type max_tmp_elems = + std::max(dim[0] * dim[1], min_allowed_tmp_elems); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); } } TEST_F(KernelLaunch, Reduction2DCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, @@ -559,8 +555,9 @@ 
TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_row_reduction_cached(std::shared_ptr exec, std::vector> dims) { - // This assumes at most 256 OpenMP Threads - constexpr int64_t max_tmp_elems = 4 * 256; + // The 2D row reduction potentially needs a lot of memory for small input + // sizes + constexpr size_type min_allowed_tmp_elems = 4 * 256 * 4 * 256; const size_type result_stride = 1; gko::array temp(exec); for (const auto& dim : dims) { @@ -572,30 +569,30 @@ void run2d_row_reduction_cached(std::shared_ptr exec, } gko::kernels::EXEC_NAMESPACE::run_kernel_row_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), result_stride, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), - max_tmp_elems * max_tmp_elems); + // The temporary storage (used for partial sums) must be smaller than + // the input array (or smaller than a set minimum) + const size_type max_tmp_elems = + std::max(dim[0] * dim[1], min_allowed_tmp_elems); + ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); } } TEST_F(KernelLaunch, ReductionRowCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_row_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, @@ -657,6 +654,7 @@ TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } void run2d_col_reduction_cached(std::shared_ptr exec, std::vector> dims) { + constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array temp(exec); for (const auto& dim : dims) { gko::array host_ref{exec->get_master(), 
dim[1]}; @@ -667,32 +665,28 @@ void run2d_col_reduction_cached(std::shared_ptr exec, } gko::kernels::EXEC_NAMESPACE::run_kernel_col_reduction_cached( - exec, - [] GKO_KERNEL(auto i, auto j) { - static_assert(is_same::value, "index"); - static_assert(is_same::value, "index"); - return i + j + 2; - }, + exec, [] GKO_KERNEL(auto i, auto j) { return i + j + 2; }, [] GKO_KERNEL(auto i, auto j) { return std::max(i, j); }, [] GKO_KERNEL(auto j) { return j; }, int64{}, output.get_data(), dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - // This assumes at most 256 OpenMP Threads const size_type temp_elem_limit = - std::max(size_type{4 * 256}, dim[0] * dim[1]); + std::max(min_allowed_tmp_elems, dim[0] * dim[1]); ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); } } TEST_F(KernelLaunch, ReductionColCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores run2d_col_reduction_cached(exec, {{20, 10}, {10, 3000}, {1000, 5}, {30, 50}, + {600, 500}, + {500, 600}, + {1000, 900}, + {900, 1000}, {1, 100000}, {100000, 1}, {500000, 20}, From 17b50ce6d563c41ea4fbc70862e38ec119196c00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Gr=C3=BCtzmacher?= Date: Tue, 25 Jul 2023 18:02:46 +0200 Subject: [PATCH 4/7] Change OMP reduction implementation At most allocate as much as the input vector for OMP reductions. --- omp/base/kernel_launch_reduction.hpp | 45 +++++++++++++++++----------- test/base/kernel_launch_generic.cpp | 27 +++++------------ 2 files changed, 34 insertions(+), 38 deletions(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index a46ce970421..5dfbd5ba6c0 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -62,8 +62,9 @@ void run_kernel_reduction_impl(std::shared_ptr exec, ValueType* result, size_type size, array& tmp, MappedKernelArgs... 
args) { - const auto num_threads = static_cast(omp_get_max_threads()); const auto ssize = static_cast(size); + // Limit the number of threads to the number of columns + const auto num_threads = std::min(omp_get_max_threads(), ssize); const auto work_per_thread = ceildiv(ssize, num_threads); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { @@ -82,8 +83,8 @@ void run_kernel_reduction_impl(std::shared_ptr exec, } partial[thread_id] = local_partial; } - *result = - finalize(std::accumulate(partial, partial + num_threads, identity, op)); + *result = finalize(std::accumulate( + partial, partial + required_storage / sizeof(ValueType), identity, op)); } @@ -99,7 +100,8 @@ void run_kernel_reduction_sized_impl(syn::value_list, { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + // Limit the number of threads to the number of columns + const auto num_threads = std::min(omp_get_max_threads(), rows); const auto work_per_thread = ceildiv(rows, num_threads); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { @@ -109,7 +111,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, static_assert(remainder_cols < block_size, "remainder too large"); const auto rounded_cols = cols / block_size * block_size; GKO_ASSERT(rounded_cols + remainder_cols == cols); -#pragma omp parallel +#pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); const auto begin = thread_id * work_per_thread; @@ -147,8 +149,8 @@ void run_kernel_reduction_sized_impl(syn::value_list, } partial[thread_id] = local_partial; } - *result = - finalize(std::accumulate(partial, partial + num_threads, identity, op)); + *result = finalize(std::accumulate( + partial, partial + required_storage / sizeof(ValueType), identity, op)); } 
GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -210,12 +212,12 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, constexpr int block_size = 8; const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + const auto available_threads = static_cast(omp_get_max_threads()); if (rows <= 0) { return; } // enough work to keep all threads busy or only very small reduction sizes - if (rows >= reduction_kernel_oversubscription * num_threads || + if (rows >= reduction_kernel_oversubscription * available_threads || cols < rows) { #pragma omp parallel for for (int64 row = 0; row < rows; row++) { @@ -229,8 +231,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, } } else { // small number of rows and large reduction sizes: do partial sum first + const auto num_threads = std::min(available_threads, cols); const auto work_per_thread = ceildiv(cols, num_threads); - const auto required_storage = sizeof(ValueType) * rows * num_threads; + const auto temp_elems_per_row = num_threads; + const auto required_storage = + sizeof(ValueType) * rows * temp_elems_per_row; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); } @@ -247,7 +252,7 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, return fn(row, col, args...); }()); } - partial[row * num_threads + thread_id] = local_partial; + partial[row * temp_elems_per_row + thread_id] = local_partial; } } // then accumulate the partial sums and write to result @@ -255,10 +260,11 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, for (int64 row = 0; row < rows; row++) { [&] { auto local_partial = identity; - for (int64 thread_id = 0; thread_id < num_threads; + for (int64 thread_id = 0; thread_id < temp_elems_per_row; thread_id++) { - local_partial = op(local_partial, - partial[row * num_threads + thread_id]); + local_partial = + op(local_partial, + partial[row * 
temp_elems_per_row + thread_id]); } result[row * result_stride] = finalize(local_partial); }(); @@ -302,12 +308,12 @@ void run_kernel_col_reduction_sized_impl( { const auto rows = static_cast(size[0]); const auto cols = static_cast(size[1]); - const auto num_threads = static_cast(omp_get_max_threads()); + const auto available_threads = static_cast(omp_get_max_threads()); static_assert(remainder_cols < block_size, "remainder too large"); GKO_ASSERT(cols % block_size == remainder_cols); const auto num_col_blocks = ceildiv(cols, block_size); // enough work to keep all threads busy or only very small reduction sizes - if (cols >= reduction_kernel_oversubscription * num_threads || + if (cols >= reduction_kernel_oversubscription * available_threads || rows < cols) { #pragma omp parallel for for (int64 col_block = 0; col_block < num_col_blocks; col_block++) { @@ -324,8 +330,11 @@ void run_kernel_col_reduction_sized_impl( } } else { // number of blocks that need to be reduced afterwards - const auto reduction_size = - ceildiv(reduction_kernel_oversubscription * num_threads, cols); + // This reduction_size definition ensures we don't use more temporary + // storage than the input vector + const auto reduction_size = std::min( + rows, ceildiv(reduction_kernel_oversubscription * available_threads, + cols)); const auto rows_per_thread = ceildiv(rows, reduction_size); const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index 57bab96d9c0..bc4119d2806 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -382,15 +382,13 @@ void run1d_reduction_cached(std::shared_ptr exec, static_cast(size)); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LT(temp.get_num_elems() / sizeof(int64), size); + ASSERT_LE(temp.get_num_elems() / sizeof(int64), 
size); } } TEST_F(KernelLaunch, Reduction1DCached) { - // Note: Start with at least 200 elements in case the machine has a lot of - // cores - run1d_reduction_cached(exec, {1000, 1000000, 1234567, 7654321}); + run1d_reduction_cached(exec, {10, 1000, 1000000, 1234567, 7654321}); } @@ -465,7 +463,6 @@ TEST_F(KernelLaunch, Reduction2D) { run2d_reduction(exec); } void run2d_reduction_cached(std::shared_ptr exec, std::vector> dims) { - constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array output{exec, 1}; gko::array temp(exec); for (const auto& dim : dims) { @@ -479,10 +476,8 @@ void run2d_reduction_cached(std::shared_ptr exec, ASSERT_EQ(exec->copy_val_to_host(output.get_const_data()), static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than - // the input array (or smaller than a set minimum) - const size_type max_tmp_elems = - std::max(dim[0] * dim[1], min_allowed_tmp_elems); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); + // the input array + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } @@ -555,9 +550,6 @@ TEST_F(KernelLaunch, ReductionRow2D) { run2d_row_reduction(exec); } void run2d_row_reduction_cached(std::shared_ptr exec, std::vector> dims) { - // The 2D row reduction potentially needs a lot of memory for small input - // sizes - constexpr size_type min_allowed_tmp_elems = 4 * 256 * 4 * 256; const size_type result_stride = 1; gko::array temp(exec); for (const auto& dim : dims) { @@ -576,10 +568,8 @@ void run2d_row_reduction_cached(std::shared_ptr exec, GKO_ASSERT_ARRAY_EQ(host_ref, output); // The temporary storage (used for partial sums) must be smaller than - // the input array (or smaller than a set minimum) - const size_type max_tmp_elems = - std::max(dim[0] * dim[1], min_allowed_tmp_elems); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), max_tmp_elems); + // the input array + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } @@ -654,7 
+644,6 @@ TEST_F(KernelLaunch, ReductionCol2D) { run2d_col_reduction(exec); } void run2d_col_reduction_cached(std::shared_ptr exec, std::vector> dims) { - constexpr size_type min_allowed_tmp_elems = 4 * 256; gko::array temp(exec); for (const auto& dim : dims) { gko::array host_ref{exec->get_master(), dim[1]}; @@ -671,9 +660,7 @@ void run2d_col_reduction_cached(std::shared_ptr exec, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - const size_type temp_elem_limit = - std::max(min_allowed_tmp_elems, dim[0] * dim[1]); - ASSERT_LT(temp.get_num_elems() / sizeof(int64), temp_elem_limit); + ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); } } From 792edb90e3202db203b9939c661f4597bd392e41 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:14:33 +0200 Subject: [PATCH 5/7] fix warning --- include/ginkgo/core/base/range.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/ginkgo/core/base/range.hpp b/include/ginkgo/core/base/range.hpp index 5ba07aa834f..1e4c7a5d00e 100644 --- a/include/ginkgo/core/base/range.hpp +++ b/include/ginkgo/core/base/range.hpp @@ -864,7 +864,7 @@ GKO_BIND_UNARY_RANGE_OPERATION_TO_OPERATOR(transpose_operation, transpose); #define GKO_DEPRECATED_SIMPLE_BINARY_OPERATION(_deprecated_name, _name) \ - struct [[deprecated("Please use " #_name)]] _deprecated_name : _name{}; + struct [[deprecated("Please use " #_name)]] _deprecated_name : _name {} #define GKO_DEFINE_SIMPLE_BINARY_OPERATION(_name, ...) 
\ struct _name { \ From ee4cd1773d5f63a641d41a558834b8d6a59ac492 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:15:03 +0200 Subject: [PATCH 6/7] fix divisions by zero and num_threads == 0 --- omp/base/kernel_launch_reduction.hpp | 112 +++++++++++++++------------ 1 file changed, 62 insertions(+), 50 deletions(-) diff --git a/omp/base/kernel_launch_reduction.hpp b/omp/base/kernel_launch_reduction.hpp index 5dfbd5ba6c0..ef57803ad31 100644 --- a/omp/base/kernel_launch_reduction.hpp +++ b/omp/base/kernel_launch_reduction.hpp @@ -65,7 +65,8 @@ void run_kernel_reduction_impl(std::shared_ptr exec, const auto ssize = static_cast(size); // Limit the number of threads to the number of columns const auto num_threads = std::min(omp_get_max_threads(), ssize); - const auto work_per_thread = ceildiv(ssize, num_threads); + const auto work_per_thread = + ceildiv(ssize, std::max(num_threads, 1)); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); @@ -74,17 +75,20 @@ void run_kernel_reduction_impl(std::shared_ptr exec, #pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); - const auto begin = thread_id * work_per_thread; - const auto end = std::min(ssize, begin + work_per_thread); + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(ssize, begin + work_per_thread); - auto local_partial = identity; - for (auto i = begin; i < end; i++) { - local_partial = op(local_partial, fn(i, map_to_device(args)...)); + auto local_partial = identity; + for (auto i = begin; i < end; i++) { + local_partial = + op(local_partial, fn(i, map_to_device(args)...)); + } + partial[thread_id] = local_partial; } - partial[thread_id] = local_partial; } - *result = finalize(std::accumulate( - partial, partial + required_storage / sizeof(ValueType), identity, op)); + *result = + 
finalize(std::accumulate(partial, partial + num_threads, identity, op)); } @@ -102,7 +106,7 @@ void run_kernel_reduction_sized_impl(syn::value_list, const auto cols = static_cast(size[1]); // Limit the number of threads to the number of columns const auto num_threads = std::min(omp_get_max_threads(), rows); - const auto work_per_thread = ceildiv(rows, num_threads); + const auto work_per_thread = ceildiv(rows, std::max(num_threads, 1)); const auto required_storage = sizeof(ValueType) * num_threads; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); @@ -114,43 +118,46 @@ void run_kernel_reduction_sized_impl(syn::value_list, #pragma omp parallel num_threads(num_threads) { const auto thread_id = omp_get_thread_num(); - const auto begin = thread_id * work_per_thread; - const auto end = std::min(rows, begin + work_per_thread); - - auto local_partial = identity; - if (rounded_cols == 0 || cols == block_size) { - // we group all sizes <= block_size here and unroll explicitly - constexpr auto local_cols = - remainder_cols == 0 ? block_size : remainder_cols; - for (auto row = begin; row < end; row++) { -#pragma unroll - for (int64 col = 0; col < local_cols; col++) { - local_partial = op(local_partial, fn(row, col, args...)); - } - } - } else { - // we operate in block_size blocks plus an explicitly unrolled - // remainder - for (auto row = begin; row < end; row++) { - for (int64 base_col = 0; base_col < rounded_cols; - base_col += block_size) { + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(rows, begin + work_per_thread); + + auto local_partial = identity; + if (rounded_cols == 0 || cols == block_size) { + // we group all sizes <= block_size here and unroll explicitly + constexpr auto local_cols = + remainder_cols == 0 ? 
block_size : remainder_cols; + for (auto row = begin; row < end; row++) { #pragma unroll - for (int64 i = 0; i < block_size; i++) { + for (int64 col = 0; col < local_cols; col++) { local_partial = - op(local_partial, fn(row, base_col + i, args...)); + op(local_partial, fn(row, col, args...)); } } + } else { + // we operate in block_size blocks plus an explicitly unrolled + // remainder + for (auto row = begin; row < end; row++) { + for (int64 base_col = 0; base_col < rounded_cols; + base_col += block_size) { #pragma unroll - for (int64 i = 0; i < remainder_cols; i++) { - local_partial = - op(local_partial, fn(row, rounded_cols + i, args...)); + for (int64 i = 0; i < block_size; i++) { + local_partial = op(local_partial, + fn(row, base_col + i, args...)); + } + } +#pragma unroll + for (int64 i = 0; i < remainder_cols; i++) { + local_partial = op(local_partial, + fn(row, rounded_cols + i, args...)); + } } } + partial[thread_id] = local_partial; } - partial[thread_id] = local_partial; } - *result = finalize(std::accumulate( - partial, partial + required_storage / sizeof(ValueType), identity, op)); + *result = + finalize(std::accumulate(partial, partial + num_threads, identity, op)); } GKO_ENABLE_IMPLEMENTATION_SELECTION(select_run_kernel_reduction_sized, @@ -232,7 +239,8 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, } else { // small number of rows and large reduction sizes: do partial sum first const auto num_threads = std::min(available_threads, cols); - const auto work_per_thread = ceildiv(cols, num_threads); + const auto work_per_thread = + ceildiv(cols, std::max(num_threads, 1)); const auto temp_elems_per_row = num_threads; const auto required_storage = sizeof(ValueType) * rows * temp_elems_per_row; @@ -243,16 +251,19 @@ void run_kernel_row_reduction_impl(std::shared_ptr exec, #pragma omp parallel num_threads(num_threads) { const auto thread_id = static_cast(omp_get_thread_num()); - const auto begin = thread_id * work_per_thread; - const auto end = 
std::min(begin + work_per_thread, cols); - for (int64 row = 0; row < rows; row++) { - auto local_partial = identity; - for (int64 col = begin; col < end; col++) { - local_partial = op(local_partial, [&]() { - return fn(row, col, args...); - }()); + if (thread_id < num_threads) { + const auto begin = thread_id * work_per_thread; + const auto end = std::min(begin + work_per_thread, cols); + for (int64 row = 0; row < rows; row++) { + auto local_partial = identity; + for (int64 col = begin; col < end; col++) { + local_partial = op(local_partial, [&]() { + return fn(row, col, args...); + }()); + } + partial[row * temp_elems_per_row + thread_id] = + local_partial; } - partial[row * temp_elems_per_row + thread_id] = local_partial; } } // then accumulate the partial sums and write to result @@ -334,8 +345,9 @@ void run_kernel_col_reduction_sized_impl( // storage than the input vector const auto reduction_size = std::min( rows, ceildiv(reduction_kernel_oversubscription * available_threads, - cols)); - const auto rows_per_thread = ceildiv(rows, reduction_size); + std::max(cols, 1))); + const auto rows_per_thread = + ceildiv(rows, std::max(reduction_size, 1)); const auto required_storage = sizeof(ValueType) * cols * reduction_size; if (tmp.get_num_elems() < required_storage) { tmp.resize_and_reset(required_storage); From a3b2be05739c3ccd24dac5709ac233e20d8ebf29 Mon Sep 17 00:00:00 2001 From: Tobias Ribizel Date: Wed, 2 Aug 2023 14:15:16 +0200 Subject: [PATCH 7/7] simplify size calculations --- test/base/kernel_launch_generic.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/base/kernel_launch_generic.cpp b/test/base/kernel_launch_generic.cpp index bc4119d2806..d4a0f83c819 100644 --- a/test/base/kernel_launch_generic.cpp +++ b/test/base/kernel_launch_generic.cpp @@ -382,7 +382,7 @@ void run1d_reduction_cached(std::shared_ptr exec, static_cast(size)); // The temporary storage (used for partial sums) must be smaller than // the input array - 
ASSERT_LE(temp.get_num_elems() / sizeof(int64), size); + ASSERT_LE(temp.get_num_elems(), size * sizeof(int64)); } } @@ -477,7 +477,7 @@ void run2d_reduction_cached(std::shared_ptr exec, static_cast(dim[0] + dim[1])); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } } @@ -569,7 +569,7 @@ void run2d_row_reduction_cached(std::shared_ptr exec, GKO_ASSERT_ARRAY_EQ(host_ref, output); // The temporary storage (used for partial sums) must be smaller than // the input array - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } } @@ -660,7 +660,7 @@ void run2d_col_reduction_cached(std::shared_ptr exec, dim, temp); GKO_ASSERT_ARRAY_EQ(host_ref, output); - ASSERT_LE(temp.get_num_elems() / sizeof(int64), dim[0] * dim[1]); + ASSERT_LE(temp.get_num_elems(), dim[0] * dim[1] * sizeof(int64)); } }