From db2ffaf7f662b962ce25c08f17ea53d98a66846d Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Thu, 19 Dec 2019 10:56:22 +0100
Subject: [PATCH 1/4] extract prefix sum computations

* introduce a new `prefix_sum` kernel that is used instead
* remove a few unnecessary `Array::clear` invocations
* fix reference ParILU initialization for zero diagonal entries.
---
 core/matrix/common_kernels.hpp                | 92 +++++++++++++++++++
 cuda/CMakeLists.txt                           |  1 +
 cuda/factorization/par_ilu_kernels.cu         | 17 +---
 cuda/matrix/common_kernels.cu                 | 74 +++++++++++++++
 cuda/matrix/csr_kernels.cu                    | 36 +-------
 cuda/matrix/dense_kernels.cu                  | 53 ++---------
 cuda/matrix/ell_kernels.cu                    | 16 +---
 cuda/matrix/hybrid_kernels.cu                 | 13 +--
 cuda/matrix/sellp_kernels.cu                  | 13 +--
 cuda/matrix/sparsity_csr_kernels.cu           |  1 -
 hip/CMakeLists.txt                            |  1 +
 hip/factorization/par_ilu_kernels.hip.cpp     | 21 +----
 hip/matrix/common_kernels.hip.cpp             | 75 +++++++++++++++
 hip/matrix/csr_kernels.hip.cpp                | 53 +----------
 hip/matrix/dense_kernels.hip.cpp              | 56 +----------
 hip/matrix/ell_kernels.hip.cpp                | 15 +--
 hip/matrix/hybrid_kernels.hip.cpp             | 15 +--
 hip/matrix/sellp_kernels.hip.cpp              | 18 +---
 hip/matrix/sparsity_csr_kernels.hip.cpp       |  1 -
 omp/CMakeLists.txt                            |  1 +
 omp/factorization/par_ilu_kernels.cpp         | 23 ++---
 omp/matrix/common_kernels.cpp                 | 65 +++++++++++++
 omp/matrix/csr_kernels.cpp                    | 15 ++-
 reference/CMakeLists.txt                      |  1 +
 reference/factorization/par_ilu_kernels.cpp   |  9 +-
 reference/matrix/common_kernels.cpp           | 66 +++++++++++++
 reference/matrix/csr_kernels.cpp              | 15 ++-
 .../test/factorization/par_ilu_kernels.cpp    | 56 +++++++++--
 28 files changed, 492 insertions(+), 330 deletions(-)
 create mode 100644 core/matrix/common_kernels.hpp
 create mode 100644 cuda/matrix/common_kernels.cu
 create mode 100644 hip/matrix/common_kernels.hip.cpp
 create mode 100644 omp/matrix/common_kernels.cpp
 create mode 100644 reference/matrix/common_kernels.cpp
diff --git a/core/matrix/common_kernels.hpp b/core/matrix/common_kernels.hpp
new file mode 100644
index 00000000000..dd44730a4be
--- /dev/null
+++ b/core/matrix/common_kernels.hpp
@@ -0,0 +1,92 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_MATRIX_COMMON_KERNELS_HPP_
+#define GKO_CORE_MATRIX_COMMON_KERNELS_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType)                 \
+    void prefix_sum(std::shared_ptr<const DefaultExecutor> exec, \
+                    IndexType *counts, size_type num_entries)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES \
+    template <typename IndexType>    \
+    GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType)
+
+
+namespace omp {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace omp
+
+
+namespace cuda {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace cuda
+
+
+namespace reference {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace reference
+
+
+namespace hip {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CORE_MATRIX_COMMON_KERNELS_HPP_
\ No newline at end of file
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 49c3b6ddc3c..96d7ed44a13 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -64,6 +64,7 @@ target_sources(ginkgo_cuda
         base/version.cpp
         components/zero_array.cu
         factorization/par_ilu_kernels.cu
+        matrix/common_kernels.cu
         matrix/coo_kernels.cu
         matrix/csr_kernels.cu
         matrix/dense_kernels.cu
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
index a7dbb4cc6b4..1cd41b6b1c0 100644
--- a/cuda/factorization/par_ilu_kernels.cu
+++ b/cuda/factorization/par_ilu_kernels.cu
@@ -38,9 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/coo.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
-#include "cuda/components/prefix_sum.cuh"
 
 
 namespace gko {
@@ -67,7 +67,6 @@ void initialize_row_ptrs_l_u(
     IndexType *l_row_ptrs, IndexType *u_row_ptrs)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const size_type num_row_ptrs{num_rows + 1};
 
     const dim3 block_size{default_block_size, 1, 1};
     const uint32 number_blocks =
@@ -80,18 +79,8 @@ void initialize_row_ptrs_l_u(
         as_cuda_type(system_matrix->get_const_values()),
         as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs));
 
-    Array<IndexType> block_sum(exec, grid_dim.x);
-    auto block_sum_ptr = block_sum.get_data();
-
-    start_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr));
-    finalize_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr));
-
-    start_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr));
-    finalize_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr));
+    prefix_sum(exec, l_row_ptrs, num_rows + 1);
+    prefix_sum(exec, u_row_ptrs, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/cuda/matrix/common_kernels.cu b/cuda/matrix/common_kernels.cu
new file mode 100644
index 00000000000..6952e1d4f21
--- /dev/null
+++ b/cuda/matrix/common_kernels.cu
@@ -0,0 +1,74 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+#include "core/matrix/common_kernels.hpp"
+
+
+#include "cuda/components/prefix_sum.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+constexpr int prefix_sum_block_size = 512;
+
+
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
+                size_type num_entries)
+{
+    auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
+    Array<IndexType> block_sum_array(exec, num_blocks);
+    auto block_sums = block_sum_array.get_data();
+    start_prefix_sum<prefix_sum_block_size>
+        <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
+                                                block_sums);
+    finalize_prefix_sum<prefix_sum_block_size>
+        <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
+                                                block_sums);
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+
+// explicitly instantiate for size_type as well, as this is used in the SellP
+// format
+template void prefix_sum<size_type>(std::shared_ptr<const CudaExecutor> exec,
+                                    size_type *counts, size_type num_entries);
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index a08c96b69c3..6f4280ced10 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
@@ -56,7 +57,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/segment_scan.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -677,17 +677,7 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
         as_cuda_type(slice_sets));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(slice_num + 1, default_block_size));
-    grid_dim = ceildiv(slice_num + 1, default_block_size);
-
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_const_data()));
+    prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
     kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
@@ -695,9 +685,6 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs),
         as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
         as_cuda_type(result_col_idxs), as_cuda_type(result_values));
-
-    nnz_per_row.clear();
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -779,11 +766,6 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-
-    block_results.clear();
-    nnz_per_row.clear();
-    max_nnz_per_slice.clear();
-    d_result.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -917,10 +899,6 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-
-    nnz_per_row.clear();
-    block_results.clear();
-    d_result.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -952,15 +930,7 @@ void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
         num_rows, max_nnz_per_row, as_cuda_type(source->get_const_row_ptrs()),
         as_cuda_type(coo_offset.get_data()));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(num_rows, default_block_size));
-    grid_dim = ceildiv(num_rows, default_block_size);
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(coo_offset.get_data()),
-        as_cuda_type(add_values.get_data()));
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(coo_offset.get_data()),
-        as_cuda_type(add_values.get_const_data()));
+    prefix_sum(exec, coo_offset.get_data(), num_rows);
 
     grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
     kernel::fill_in_hybrid<<<grid_dim, default_block_size>>>(
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 4579691c6cd..4b6a6c3cbbc 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -42,11 +42,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "cuda/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/pointer_mode_guard.hpp"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/uninitialized_array.hpp"
 
@@ -251,25 +251,15 @@ void convert_to_coo(std::shared_ptr<const CudaExecutor> exec,
     auto nnz_prefix_sum = Array<size_type>(exec, num_rows);
     calculate_nonzeros_per_row(exec, source, &nnz_prefix_sum);
 
-    const size_type grid_dim = ceildiv(num_rows, default_block_size);
-    auto add_values = Array<size_type>(exec, grid_dim);
+    prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows);
 
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(nnz_prefix_sum.get_data()),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(nnz_prefix_sum.get_data()),
-        as_cuda_type(add_values.get_data()));
+    size_type grid_dim = ceildiv(num_rows, default_block_size);
 
     kernel::fill_in_coo<<<grid_dim, default_block_size>>>(
         num_rows, num_cols, stride,
         as_cuda_type(nnz_prefix_sum.get_const_data()),
         as_cuda_type(source->get_const_values()), as_cuda_type(row_idxs),
         as_cuda_type(col_idxs), as_cuda_type(values));
-
-    nnz_prefix_sum.clear();
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -297,22 +287,13 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
         num_rows, num_cols, stride, as_cuda_type(source->get_const_values()),
         as_cuda_type(row_ptrs));
 
-    size_type grid_dim = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_dim);
+    prefix_sum(exec, row_ptrs, num_rows + 1);
 
-    start_prefix_sum<default_block_size>
-        <<<grid_dim, default_block_size>>>(num_rows + 1, as_cuda_type(row_ptrs),
-                                           as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows + 1, as_cuda_type(row_ptrs),
-        as_cuda_type(add_values.get_const_data()));
+    size_type grid_dim = ceildiv(num_rows, default_block_size);
 
     kernel::fill_in_csr<<<grid_dim, default_block_size>>>(
         num_rows, num_cols, stride, as_cuda_type(source->get_const_values()),
         as_cuda_type(row_ptrs), as_cuda_type(col_idxs), as_cuda_type(values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -387,26 +368,13 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
         as_cuda_type(slice_sets));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(slice_num + 1, default_block_size));
-    grid_dim = ceildiv(slice_num + 1, default_block_size);
-
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_const_data()));
+    prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
     kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
         num_rows, num_cols, slice_size, stride,
         as_cuda_type(source->get_const_values()), as_cuda_type(slice_lengths),
         as_cuda_type(slice_sets), as_cuda_type(col_idxs), as_cuda_type(vals));
-
-    add_values.clear();
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -433,7 +401,6 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL);
@@ -469,9 +436,6 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-    d_result.clear();
-    block_results.clear();
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -537,11 +501,6 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-
-    block_results.clear();
-    nnz_per_row.clear();
-    max_nnz_per_slice.clear();
-    d_result.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 5c58cfc41ef..89ce0f9d27e 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -43,6 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/config.hpp"
@@ -51,7 +52,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/format_conversion.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/zero_array.hpp"
 
@@ -307,24 +307,15 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
         num_rows, max_nnz_per_row, stride,
         as_cuda_type(source->get_const_values()), as_cuda_type(row_ptrs));
 
-    size_type grid_dim = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_dim);
+    prefix_sum(exec, row_ptrs, num_rows + 1);
 
-    start_prefix_sum<default_block_size>
-        <<<grid_dim, default_block_size>>>(num_rows + 1, as_cuda_type(row_ptrs),
-                                           as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows + 1, as_cuda_type(row_ptrs),
-        as_cuda_type(add_values.get_const_data()));
+    size_type grid_dim = ceildiv(num_rows, default_block_size);
 
     kernel::fill_in_csr<<<grid_dim, default_block_size>>>(
         num_rows, max_nnz_per_row, stride,
         as_cuda_type(source->get_const_values()),
         as_cuda_type(source->get_const_col_idxs()), as_cuda_type(row_ptrs),
         as_cuda_type(col_idxs), as_cuda_type(values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -342,7 +333,6 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu
index abf9525baec..fc9bdc4c1dd 100644
--- a/cuda/matrix/hybrid_kernels.cu
+++ b/cuda/matrix/hybrid_kernels.cu
@@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "core/matrix/coo_kernels.hpp"
 #include "core/matrix/ell_kernels.hpp"
 #include "cuda/base/config.hpp"
@@ -44,7 +45,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
 #include "cuda/components/format_conversion.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/segment_scan.cuh"
 #include "cuda/components/zero_array.hpp"
@@ -128,16 +128,7 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
         num_rows, as_cuda_type(row_ptrs),
         as_cuda_type(coo_row_ptrs.get_const_data()));
 
-    grid_num = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_num);
-
-    start_prefix_sum<default_block_size>
-        <<<grid_num, default_block_size>>>(num_rows + 1, as_cuda_type(row_ptrs),
-                                           as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_num, default_block_size>>>(
-        num_rows + 1, as_cuda_type(row_ptrs),
-        as_cuda_type(add_values.get_const_data()));
+    prefix_sum(exec, row_ptrs, num_rows + 1);
 
     // Fill the value
     grid_num = ceildiv(num_rows, default_block_size);
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
index b34b4a68291..024ed44b38e 100644
--- a/cuda/matrix/sellp_kernels.cu
+++ b/cuda/matrix/sellp_kernels.cu
@@ -40,10 +40,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "cuda/base/config.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/types.hpp"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
 
 
@@ -175,13 +175,7 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
     grid_dim = ceildiv(num_rows + 1, default_block_size);
     auto add_values = Array<IndexType>(exec, grid_dim);
 
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows + 1, as_cuda_type(result_row_ptrs),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows + 1, as_cuda_type(result_row_ptrs),
-        as_cuda_type(add_values.get_const_data()));
+    prefix_sum(exec, result_row_ptrs, num_rows + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
 
@@ -190,8 +184,6 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(source_col_idxs), as_cuda_type(source_values),
         as_cuda_type(result_row_ptrs), as_cuda_type(result_col_idxs),
         as_cuda_type(result_values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -217,7 +209,6 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(nnz_per_row.get_data()));
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/cuda/matrix/sparsity_csr_kernels.cu b/cuda/matrix/sparsity_csr_kernels.cu
index 0a0dc3f0a63..ac060ebe213 100644
--- a/cuda/matrix/sparsity_csr_kernels.cu
+++ b/cuda/matrix/sparsity_csr_kernels.cu
@@ -48,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/segment_scan.cuh"
 #include "cuda/components/uninitialized_array.hpp"
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index bd703702ccf..a0b1b938167 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -96,6 +96,7 @@ set(GINKGO_HIP_SOURCES
     base/version.hip.cpp
     components/zero_array.hip.cpp
     factorization/par_ilu_kernels.hip.cpp
+    matrix/common_kernels.hip.cpp
     matrix/coo_kernels.hip.cpp
     matrix/csr_kernels.hip.cpp
     matrix/dense_kernels.hip.cpp
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
index 8b6637d0622..86170cda799 100644
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ b/hip/factorization/par_ilu_kernels.hip.cpp
@@ -41,9 +41,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/coo.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
 
 
 namespace gko {
@@ -70,7 +70,6 @@ void initialize_row_ptrs_l_u(
     IndexType *l_row_ptrs, IndexType *u_row_ptrs)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const size_type num_row_ptrs{num_rows + 1};
 
     const dim3 block_size{default_block_size, 1, 1};
     const uint32 number_blocks =
@@ -84,22 +83,8 @@ void initialize_row_ptrs_l_u(
                        as_hip_type(system_matrix->get_const_values()),
                        as_hip_type(l_row_ptrs), as_hip_type(u_row_ptrs));
 
-    Array<IndexType> block_sum(exec, grid_dim.x);
-    auto block_sum_ptr = block_sum.get_data();
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(block_size), 0, 0, num_row_ptrs,
-                       as_hip_type(l_row_ptrs), as_hip_type(block_sum_ptr));
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(block_size), 0, 0, num_row_ptrs,
-                       as_hip_type(l_row_ptrs), as_hip_type(block_sum_ptr));
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(block_size), 0, 0, num_row_ptrs,
-                       as_hip_type(u_row_ptrs), as_hip_type(block_sum_ptr));
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(block_size), 0, 0, num_row_ptrs,
-                       as_hip_type(u_row_ptrs), as_hip_type(block_sum_ptr));
+    prefix_sum(exec, l_row_ptrs, num_rows + 1);
+    prefix_sum(exec, u_row_ptrs, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/hip/matrix/common_kernels.hip.cpp b/hip/matrix/common_kernels.hip.cpp
new file mode 100644
index 00000000000..a3cc319727c
--- /dev/null
+++ b/hip/matrix/common_kernels.hip.cpp
@@ -0,0 +1,75 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+#include "core/matrix/common_kernels.hpp"
+
+
+#include "hip/components/prefix_sum.hip.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace hip {
+
+
+constexpr int prefix_sum_block_size = 512;
+
+
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const HipExecutor> exec, IndexType *counts,
+                size_type num_entries)
+{
+    auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
+    Array<IndexType> block_sum_array(exec, num_blocks);
+    auto block_sums = block_sum_array.get_data();
+    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<prefix_sum_block_size>),
+                       dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0,
+                       num_entries, counts, block_sums);
+    hipLaunchKernelGGL(
+        HIP_KERNEL_NAME(finalize_prefix_sum<prefix_sum_block_size>),
+        dim3(num_blocks), dim3(prefix_sum_block_size), 0, 0, num_entries,
+        counts, block_sums);
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+
+// explicitly instantiate for size_type as well, as this is used in the SellP
+// format
+template void prefix_sum<size_type>(std::shared_ptr<const HipExecutor> exec,
+                                    size_type *counts, size_type num_entries);
+
+
+}  // namespace hip
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index 4099979d182..c1ff237074e 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
@@ -59,7 +60,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/segment_scan.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
@@ -611,18 +611,7 @@ void advanced_spgemm(std::shared_ptr<const HipExecutor> exec,
                            d_col_idxs, m, c_row_ptrs);
 
         // build row pointers
-        auto num_row_ptrs = m + 1;
-        auto num_blocks_prefixsum = ceildiv(num_row_ptrs, default_block_size);
-        Array<IndexType> block_sums_array(exec, num_blocks_prefixsum);
-        auto block_sums = block_sums_array.get_data();
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-            dim3(num_blocks_prefixsum), dim3(default_block_size), 0, 0,
-            num_row_ptrs, c_row_ptrs, block_sums);
-        hipLaunchKernelGGL(
-            HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-            dim3(num_blocks_prefixsum), dim3(default_block_size), 0, 0,
-            num_row_ptrs, c_row_ptrs, block_sums);
+        prefix_sum(exec, c_row_ptrs, m + 1);
 
         // accumulate non-zeros for alpha * A * B + beta * D
         ValueType vbeta{};
@@ -749,19 +738,7 @@ void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
                        stride_factor, as_hip_type(nnz_per_row.get_const_data()),
                        as_hip_type(slice_lengths), as_hip_type(slice_sets));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(slice_num + 1, default_block_size));
-    grid_dim = ceildiv(slice_num + 1, default_block_size);
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       slice_num + 1, as_hip_type(slice_sets),
-                       as_hip_type(add_values.get_data()));
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       slice_num + 1, as_hip_type(slice_sets),
-                       as_hip_type(add_values.get_const_data()));
+    prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
     hipLaunchKernelGGL(kernel::fill_in_sellp, dim3(grid_dim),
@@ -770,9 +747,6 @@ void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
                        as_hip_type(source_col_idxs), as_hip_type(slice_lengths),
                        as_hip_type(slice_sets), as_hip_type(result_col_idxs),
                        as_hip_type(result_values));
-
-    nnz_per_row.clear();
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -860,11 +834,6 @@ void calculate_total_cols(std::shared_ptr<const HipExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-
-    block_results.clear();
-    nnz_per_row.clear();
-    max_nnz_per_slice.clear();
-    d_result.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1001,10 +970,6 @@ void calculate_max_nnz_per_row(std::shared_ptr<const HipExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-
-    nnz_per_row.clear();
-    block_results.clear();
-    d_result.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -1039,17 +1004,7 @@ void convert_to_hybrid(std::shared_ptr<const HipExecutor> exec,
                        as_hip_type(source->get_const_row_ptrs()),
                        as_hip_type(coo_offset.get_data()));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(num_rows, default_block_size));
-    grid_dim = ceildiv(num_rows, default_block_size);
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0, num_rows,
-                       as_hip_type(coo_offset.get_data()),
-                       as_hip_type(add_values.get_data()));
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0, num_rows,
-                       as_hip_type(coo_offset.get_data()),
-                       as_hip_type(add_values.get_const_data()));
+    prefix_sum(exec, coo_offset.get_data(), num_rows);
 
     grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
     hipLaunchKernelGGL(kernel::fill_in_hybrid, dim3(grid_dim),
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 5654c2b5c7e..99600409a86 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -45,11 +45,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "hip/base/config.hip.hpp"
 #include "hip/base/hipblas_bindings.hip.hpp"
 #include "hip/base/pointer_mode_guard.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
 
@@ -264,15 +264,7 @@ void convert_to_coo(std::shared_ptr<const HipExecutor> exec,
     const size_type grid_dim = ceildiv(num_rows, default_block_size);
     auto add_values = Array<size_type>(exec, grid_dim);
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0, num_rows,
-                       as_hip_type(nnz_prefix_sum.get_data()),
-                       as_hip_type(add_values.get_data()));
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0, num_rows,
-                       as_hip_type(nnz_prefix_sum.get_data()),
-                       as_hip_type(add_values.get_data()));
+    prefix_sum(exec, nnz_prefix_sum.get_data(), num_rows);
 
     hipLaunchKernelGGL(kernel::fill_in_coo, dim3(grid_dim),
                        dim3(default_block_size), 0, 0, num_rows, num_cols,
@@ -280,9 +272,6 @@ void convert_to_coo(std::shared_ptr<const HipExecutor> exec,
                        as_hip_type(source->get_const_values()),
                        as_hip_type(row_idxs), as_hip_type(col_idxs),
                        as_hip_type(values));
-
-    nnz_prefix_sum.clear();
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -311,25 +300,14 @@ void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
                        stride, as_hip_type(source->get_const_values()),
                        as_hip_type(row_ptrs));
 
-    size_type grid_dim = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_dim);
+    prefix_sum(exec, row_ptrs, num_rows + 1);
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       num_rows + 1, as_hip_type(row_ptrs),
-                       as_hip_type(add_values.get_data()));
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       num_rows + 1, as_hip_type(row_ptrs),
-                       as_hip_type(add_values.get_const_data()));
+    size_type grid_dim = ceildiv(num_rows, default_block_size);
 
     hipLaunchKernelGGL(
         kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0,
         num_rows, num_cols, stride, as_hip_type(source->get_const_values()),
         as_hip_type(row_ptrs), as_hip_type(col_idxs), as_hip_type(values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -406,19 +384,7 @@ void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
                        as_hip_type(nnz_per_row.get_const_data()),
                        as_hip_type(slice_lengths), as_hip_type(slice_sets));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(slice_num + 1, default_block_size));
-    grid_dim = ceildiv(slice_num + 1, default_block_size);
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       slice_num + 1, as_hip_type(slice_sets),
-                       as_hip_type(add_values.get_data()));
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       slice_num + 1, as_hip_type(slice_sets),
-                       as_hip_type(add_values.get_const_data()));
+    prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
     hipLaunchKernelGGL(
@@ -426,9 +392,6 @@ void convert_to_sellp(std::shared_ptr<const HipExecutor> exec,
         num_rows, num_cols, slice_size, stride,
         as_hip_type(source->get_const_values()), as_hip_type(slice_lengths),
         as_hip_type(slice_sets), as_hip_type(col_idxs), as_hip_type(vals));
-
-    add_values.clear();
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -455,7 +418,6 @@ void count_nonzeros(std::shared_ptr<const HipExecutor> exec,
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COUNT_NONZEROS_KERNEL);
@@ -493,9 +455,6 @@ void calculate_max_nnz_per_row(std::shared_ptr<const HipExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-    d_result.clear();
-    block_results.clear();
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -564,11 +523,6 @@ void calculate_total_cols(std::shared_ptr<const HipExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-
-    block_results.clear();
-    nnz_per_row.clear();
-    max_nnz_per_slice.clear();
-    d_result.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index d451f9b99f5..d2a17d6e525 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
@@ -54,7 +55,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "hip/components/atomic.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/format_conversion.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/zero_array.hip.hpp"
 
@@ -316,15 +316,7 @@ void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
     size_type grid_dim = ceildiv(num_rows + 1, default_block_size);
     auto add_values = Array<IndexType>(exec, grid_dim);
 
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       num_rows + 1, as_hip_type(row_ptrs),
-                       as_hip_type(add_values.get_data()));
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       num_rows + 1, as_hip_type(row_ptrs),
-                       as_hip_type(add_values.get_const_data()));
+    prefix_sum(exec, row_ptrs, num_rows + 1);
 
     hipLaunchKernelGGL(
         kernel::fill_in_csr, dim3(grid_dim), dim3(default_block_size), 0, 0,
@@ -332,8 +324,6 @@ void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
         as_hip_type(source->get_const_values()),
         as_hip_type(source->get_const_col_idxs()), as_hip_type(row_ptrs),
         as_hip_type(col_idxs), as_hip_type(values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -351,7 +341,6 @@ void count_nonzeros(std::shared_ptr<const HipExecutor> exec,
     calculate_nonzeros_per_row(exec, source, &nnz_per_row);
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp
index 04a6be08785..0e45df63125 100644
--- a/hip/matrix/hybrid_kernels.hip.cpp
+++ b/hip/matrix/hybrid_kernels.hip.cpp
@@ -40,6 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "core/matrix/coo_kernels.hpp"
 #include "core/matrix/ell_kernels.hpp"
 #include "hip/base/config.hip.hpp"
@@ -47,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "hip/components/atomic.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
 #include "hip/components/format_conversion.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/segment_scan.hip.hpp"
 #include "hip/components/zero_array.hip.hpp"
@@ -134,18 +134,7 @@ void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
                        0, num_rows, as_hip_type(row_ptrs),
                        as_hip_type(coo_row_ptrs.get_const_data()));
 
-    grid_num = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_num);
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_num), dim3(default_block_size), 0, 0,
-                       num_rows + 1, as_hip_type(row_ptrs),
-                       as_hip_type(add_values.get_data()));
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_num), dim3(default_block_size), 0, 0,
-                       num_rows + 1, as_hip_type(row_ptrs),
-                       as_hip_type(add_values.get_const_data()));
+    prefix_sum(exec, row_ptrs, num_rows + 1);
 
     // Fill the value
     grid_num = ceildiv(num_rows, default_block_size);
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
index 0df2ef62634..10a513d01a7 100644
--- a/hip/matrix/sellp_kernels.hip.cpp
+++ b/hip/matrix/sellp_kernels.hip.cpp
@@ -43,10 +43,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
 #include "hip/base/config.hip.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/types.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 
 
@@ -179,18 +179,7 @@ void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
         0, num_rows, slice_size, as_hip_type(source_slice_sets),
         as_hip_type(source_values), as_hip_type(result_row_ptrs));
 
-    grid_dim = ceildiv(num_rows + 1, default_block_size);
-    auto add_values = Array<IndexType>(exec, grid_dim);
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(start_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       num_rows + 1, as_hip_type(result_row_ptrs),
-                       as_hip_type(add_values.get_data()));
-
-    hipLaunchKernelGGL(HIP_KERNEL_NAME(finalize_prefix_sum<default_block_size>),
-                       dim3(grid_dim), dim3(default_block_size), 0, 0,
-                       num_rows + 1, as_hip_type(result_row_ptrs),
-                       as_hip_type(add_values.get_const_data()));
+    prefix_sum(exec, result_row_ptrs, num_rows + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
 
@@ -200,8 +189,6 @@ void convert_to_csr(std::shared_ptr<const HipExecutor> exec,
         as_hip_type(source_col_idxs), as_hip_type(source_values),
         as_hip_type(result_row_ptrs), as_hip_type(result_col_idxs),
         as_hip_type(result_values));
-
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -228,7 +215,6 @@ void count_nonzeros(std::shared_ptr<const HipExecutor> exec,
                        as_hip_type(nnz_per_row.get_data()));
 
     *result = reduce_add_array(exec, num_rows, nnz_per_row.get_const_data());
-    nnz_per_row.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/hip/matrix/sparsity_csr_kernels.hip.cpp b/hip/matrix/sparsity_csr_kernels.hip.cpp
index 65915b518e6..54ab4a7c69f 100644
--- a/hip/matrix/sparsity_csr_kernels.hip.cpp
+++ b/hip/matrix/sparsity_csr_kernels.hip.cpp
@@ -48,7 +48,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "hip/base/types.hip.hpp"
 #include "hip/components/atomic.hip.hpp"
 #include "hip/components/cooperative_groups.hip.hpp"
-#include "hip/components/prefix_sum.hip.hpp"
 #include "hip/components/reduction.hip.hpp"
 #include "hip/components/segment_scan.hip.hpp"
 #include "hip/components/uninitialized_array.hip.hpp"
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index dbca65286c0..d1c585bd6c7 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -5,6 +5,7 @@ target_sources(ginkgo_omp
     PRIVATE
         base/version.cpp
         factorization/par_ilu_kernels.cpp
+        matrix/common_kernels.cpp
         matrix/coo_kernels.cpp
         matrix/csr_kernels.cpp
         matrix/dense_kernels.cpp
diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp
index 8512d8e5670..1a35445d0e6 100644
--- a/omp/factorization/par_ilu_kernels.cpp
+++ b/omp/factorization/par_ilu_kernels.cpp
@@ -38,6 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace omp {
@@ -55,14 +58,13 @@ void initialize_row_ptrs_l_u(
     const matrix::Csr<ValueType, IndexType> *system_matrix,
     IndexType *l_row_ptrs, IndexType *u_row_ptrs)
 {
+    auto num_rows = system_matrix->get_size()[0];
     auto row_ptrs = system_matrix->get_const_row_ptrs();
     auto col_idxs = system_matrix->get_const_col_idxs();
 
-    l_row_ptrs[0] = 0;
-    u_row_ptrs[0] = 0;
 // Calculate the NNZ per row first
 #pragma omp parallel for
-    for (size_type row = 0; row < system_matrix->get_size()[0]; ++row) {
+    for (size_type row = 0; row < num_rows; ++row) {
         size_type l_nnz{};
         size_type u_nnz{};
         bool has_diagonal{};
@@ -76,20 +78,13 @@ void initialize_row_ptrs_l_u(
             }
             has_diagonal |= col == row;
         }
-        l_row_ptrs[row + 1] = l_nnz + !has_diagonal;
-        u_row_ptrs[row + 1] = u_nnz + !has_diagonal;
+        l_row_ptrs[row] = l_nnz + !has_diagonal;
+        u_row_ptrs[row] = u_nnz + !has_diagonal;
     }
 
     // Now, compute the prefix-sum, to get proper row_ptrs for L and U
-    IndexType l_previous_nnz{};
-    IndexType u_previous_nnz{};
-    for (size_type row = 1; row < system_matrix->get_size()[0] + 1; ++row) {
-        l_previous_nnz += l_row_ptrs[row];
-        u_previous_nnz += u_row_ptrs[row];
-
-        l_row_ptrs[row] = l_previous_nnz;
-        u_row_ptrs[row] = u_previous_nnz;
-    }
+    prefix_sum(exec, l_row_ptrs, num_rows + 1);
+    prefix_sum(exec, u_row_ptrs, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
diff --git a/omp/matrix/common_kernels.cpp b/omp/matrix/common_kernels.cpp
new file mode 100644
index 00000000000..f5dabe7db3e
--- /dev/null
+++ b/omp/matrix/common_kernels.cpp
@@ -0,0 +1,65 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+#include "core/matrix/common_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace omp {
+
+
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const OmpExecutor> exec, IndexType *counts,
+                size_type num_entries)
+{
+    IndexType partial_sum{};
+    for (IndexType i = 0; i < num_entries; ++i) {
+        auto nnz = counts[i];
+        counts[i] = partial_sum;
+        partial_sum += nnz;
+    }
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+
+// explicitly instantiate for size_type as well, as this is used in the SellP
+// format
+template void prefix_sum<size_type>(std::shared_ptr<const OmpExecutor> exec,
+                                    size_type *counts, size_type num_entries);
+
+
+}  // namespace omp
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index eea207e914a..dad2c64ce03 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -53,6 +53,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/base/iterator_factory.hpp"
+#include "core/matrix/common_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "omp/components/format_conversion.hpp"
 
@@ -221,12 +222,11 @@ void spgemm(std::shared_ptr<const OmpExecutor> exec,
     for (size_type a_row = 0; a_row < num_rows; ++a_row) {
         local_col_idxs.clear();
         spgemm_insert_row2(local_col_idxs, a, b, a_row);
-        c_row_ptrs[a_row + 1] = local_col_idxs.size();
+        c_row_ptrs[a_row] = local_col_idxs.size();
     }
 
-    // build row pointers: exclusive scan (thus the + 1)
-    c_row_ptrs[0] = 0;
-    std::partial_sum(c_row_ptrs + 1, c_row_ptrs + num_rows + 1, c_row_ptrs + 1);
+    // build row pointers
+    prefix_sum(exec, c_row_ptrs, num_rows + 1);
 
     // second sweep: accumulate non-zeros
     auto new_nnz = c_row_ptrs[num_rows];
@@ -282,12 +282,11 @@ void advanced_spgemm(std::shared_ptr<const OmpExecutor> exec,
         if (valpha != zero(valpha)) {
             spgemm_insert_row2(local_col_idxs, a, b, a_row);
         }
-        c_row_ptrs[a_row + 1] = local_col_idxs.size();
+        c_row_ptrs[a_row] = local_col_idxs.size();
     }
 
-    // build row pointers: exclusive scan (thus the + 1)
-    c_row_ptrs[0] = 0;
-    std::partial_sum(c_row_ptrs + 1, c_row_ptrs + num_rows + 1, c_row_ptrs + 1);
+    // build row pointers
+    prefix_sum(exec, c_row_ptrs, num_rows + 1);
 
     // second sweep: accumulate non-zeros
     auto new_nnz = c_row_ptrs[num_rows];
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 7516fc8641f..2b94ef0ebad 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -3,6 +3,7 @@ target_sources(ginkgo_reference
     PRIVATE
         base/version.cpp
         factorization/par_ilu_kernels.cpp
+        matrix/common_kernels.cpp
         matrix/coo_kernels.cpp
         matrix/csr_kernels.cpp
         matrix/dense_kernels.cpp
diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp
index c84ca97183f..9f68a550961 100644
--- a/reference/factorization/par_ilu_kernels.cpp
+++ b/reference/factorization/par_ilu_kernels.cpp
@@ -38,6 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 
 
+#include "core/matrix/common_kernels.hpp"
+
+
 namespace gko {
 namespace kernels {
 namespace reference {
@@ -74,8 +77,10 @@ void initialize_row_ptrs_l_u(
             }
             has_diagonal |= col == row;
         }
-        l_row_ptrs[row + 1] = l_nnz + !has_diagonal;
-        u_row_ptrs[row + 1] = u_nnz + !has_diagonal;
+        l_nnz += !has_diagonal;
+        u_nnz += !has_diagonal;
+        l_row_ptrs[row + 1] = l_nnz;
+        u_row_ptrs[row + 1] = u_nnz;
     }
 }
 
diff --git a/reference/matrix/common_kernels.cpp b/reference/matrix/common_kernels.cpp
new file mode 100644
index 00000000000..9b721cdb3f2
--- /dev/null
+++ b/reference/matrix/common_kernels.cpp
@@ -0,0 +1,66 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+
+#include "core/matrix/common_kernels.hpp"
+
+
+namespace gko {
+namespace kernels {
+namespace reference {
+
+
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const ReferenceExecutor> exec,
+                IndexType *counts, size_type num_entries)
+{
+    IndexType partial_sum{};
+    for (IndexType i = 0; i < num_entries; ++i) {
+        auto nnz = counts[i];
+        counts[i] = partial_sum;
+        partial_sum += nnz;
+    }
+}
+
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+
+// explicitly instantiate for size_type as well, as this is used in the SellP
+// format
+template void prefix_sum<size_type>(
+    std::shared_ptr<const ReferenceExecutor> exec, size_type *counts,
+    size_type num_entries);
+
+
+}  // namespace reference
+}  // namespace kernels
+}  // namespace gko
\ No newline at end of file
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 0e0a587b9ae..2d22ddb58e9 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -54,6 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/base/iterator_factory.hpp"
+#include "core/matrix/common_kernels.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "reference/components/format_conversion.hpp"
 
@@ -219,12 +220,11 @@ void spgemm(std::shared_ptr<const ReferenceExecutor> exec,
     for (size_type a_row = 0; a_row < num_rows; ++a_row) {
         local_col_idxs.clear();
         spgemm_insert_row2(local_col_idxs, a, b, a_row);
-        c_row_ptrs[a_row + 1] = local_col_idxs.size();
+        c_row_ptrs[a_row] = local_col_idxs.size();
     }
 
-    // build row pointers: exclusive scan (thus the + 1)
-    c_row_ptrs[0] = 0;
-    std::partial_sum(c_row_ptrs + 1, c_row_ptrs + num_rows + 1, c_row_ptrs + 1);
+    // build row pointers
+    prefix_sum(exec, c_row_ptrs, num_rows + 1);
 
     // second sweep: accumulate non-zeros
     auto new_nnz = c_row_ptrs[num_rows];
@@ -278,12 +278,11 @@ void advanced_spgemm(std::shared_ptr<const ReferenceExecutor> exec,
         if (valpha != zero(valpha)) {
             spgemm_insert_row2(local_col_idxs, a, b, a_row);
         }
-        c_row_ptrs[a_row + 1] = local_col_idxs.size();
+        c_row_ptrs[a_row] = local_col_idxs.size();
     }
 
-    // build row pointers: exclusive scan (thus the + 1)
-    c_row_ptrs[0] = 0;
-    std::partial_sum(c_row_ptrs + 1, c_row_ptrs + num_rows + 1, c_row_ptrs + 1);
+    // build row pointers
+    prefix_sum(exec, c_row_ptrs, num_rows + 1);
 
     // second sweep: accumulate non-zeros
     auto new_nnz = c_row_ptrs[num_rows];
diff --git a/reference/test/factorization/par_ilu_kernels.cpp b/reference/test/factorization/par_ilu_kernels.cpp
index 4e697c61e6a..ea7126641f5 100644
--- a/reference/test/factorization/par_ilu_kernels.cpp
+++ b/reference/test/factorization/par_ilu_kernels.cpp
@@ -83,6 +83,10 @@ class ParIlu : public ::testing::Test {
         : ref(gko::ReferenceExecutor::create()),
           exec(std::static_pointer_cast<const gko::Executor>(ref)),
           // clang-format off
+          empty_csr(gko::initialize<Csr>(
+              {{0., 0., 0.},
+               {0., 0., 0.},
+               {0., 0., 0.}}, exec)),
           identity(gko::initialize<Dense>(
               {{1., 0., 0.},
                {0., 1., 0.},
@@ -139,16 +143,16 @@ class ParIlu : public ::testing::Test {
           big_nodiag_l_expected(gko::initialize<Dense>({{1., 0., 0., 0., 0., 0.},
                                                         {1., 1., 0., 0., 0., 0.},
                                                         {0., 2., 1., 0., 0., 0.},
-                                                        {2., 0., 0., 1., 0., 0.},
-                                                        {1., 1., 0., 1., 1., 0.},
-                                                        {0., 2., 1., 0.25, -0.5, 1.}},
+                                                        {1., 0., 2., 1., 0., 0.},
+                                                        {1., 1., 0., -2., 1., 0.},
+                                                        {0., 2., 1., -0.5, 2.5, 1.}},
                                                 exec)),
           big_nodiag_u_expected(gko::initialize<Dense>({{1., 1., 1., 0., 1., 3.},
                                                         {0., 1., 1., 0., 1., 0.},
-                                                        {0., 0., 1., 3., 1., -2.},
-                                                        {0., 0., 0., 4., 2., 0.},
-                                                        {0., 0., 0., 0., -3., 3.},
-                                                        {0., 0., 0., 0., 0., 11.5}},
+                                                        {0., 0., 1., 3., 1., 5.},
+                                                        {0., 0., 0., -2., 1., -9.},
+                                                        {0., 0., 0., 0., 1., -15.},
+                                                        {0., 0., 0., 0., 0., 36.}},
                                                 exec)),
           // clang-format on
           ilu_factory_skip(
@@ -163,6 +167,7 @@ class ParIlu : public ::testing::Test {
 
     std::shared_ptr<const gko::ReferenceExecutor> ref;
     std::shared_ptr<const gko::Executor> exec;
+    std::shared_ptr<const Csr> empty_csr;
     std::shared_ptr<const Dense> identity;
     std::shared_ptr<const Dense> lower_triangular;
     std::shared_ptr<const Dense> upper_triangular;
@@ -203,6 +208,28 @@ TEST_F(ParIlu, KernelInitializeRowPtrsLU)
 }
 
 
+TEST_F(ParIlu, KernelInitializeRowPtrsLUZeroMatrix)
+{
+    auto empty_csr_l_expected = Csr::create(ref);
+    identity->convert_to(gko::lend(empty_csr_l_expected));
+    auto empty_csr_u_expected = Csr::create(ref);
+    identity->convert_to(gko::lend(empty_csr_u_expected));
+    auto num_row_ptrs = empty_csr->get_size()[0] + 1;
+    std::vector<index_type> l_row_ptrs_vector(num_row_ptrs);
+    std::vector<index_type> u_row_ptrs_vector(num_row_ptrs);
+    auto l_row_ptrs = l_row_ptrs_vector.data();
+    auto u_row_ptrs = u_row_ptrs_vector.data();
+
+    gko::kernels::reference::par_ilu_factorization::initialize_row_ptrs_l_u(
+        ref, gko::lend(empty_csr), l_row_ptrs, u_row_ptrs);
+
+    ASSERT_TRUE(std::equal(l_row_ptrs, l_row_ptrs + num_row_ptrs,
+                           empty_csr_l_expected->get_const_row_ptrs()));
+    ASSERT_TRUE(std::equal(u_row_ptrs, u_row_ptrs + num_row_ptrs,
+                           empty_csr_u_expected->get_const_row_ptrs()));
+}
+
+
 TEST_F(ParIlu, KernelInitializeLU)
 {
     // clang-format off
@@ -233,6 +260,21 @@ TEST_F(ParIlu, KernelInitializeLU)
 }
 
 
+TEST_F(ParIlu, KernelInitializeLUZeroMatrix)
+{
+    auto actual_l = Csr::create(ref);
+    auto actual_u = Csr::create(ref);
+    actual_l->copy_from(identity.get());
+    actual_u->copy_from(identity.get());
+
+    gko::kernels::reference::par_ilu_factorization::initialize_l_u(
+        ref, gko::lend(empty_csr), gko::lend(actual_l), gko::lend(actual_u));
+
+    GKO_ASSERT_MTX_NEAR(actual_l, identity, 1e-14);
+    GKO_ASSERT_MTX_NEAR(actual_u, identity, 1e-14);
+}
+
+
 TEST_F(ParIlu, KernelComputeLU)
 {
     // clang-format off

From ffbbf8aead226a60d736054691c39361309f5e90 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Wed, 8 Jan 2020 11:15:20 +0100
Subject: [PATCH 2/4] move prefix_sum kernel to components

---
 .../common_kernels.hpp => components/prefix_sum.hpp}        | 6 +++---
 cuda/CMakeLists.txt                                         | 2 +-
 cuda/{matrix/common_kernels.cu => components/prefix_sum.cu} | 2 +-
 cuda/factorization/par_ilu_kernels.cu                       | 2 +-
 cuda/matrix/csr_kernels.cu                                  | 2 +-
 cuda/matrix/dense_kernels.cu                                | 2 +-
 cuda/matrix/ell_kernels.cu                                  | 2 +-
 cuda/matrix/hybrid_kernels.cu                               | 2 +-
 cuda/matrix/sellp_kernels.cu                                | 2 +-
 hip/CMakeLists.txt                                          | 2 +-
 .../prefix_sum.hip.cpp}                                     | 2 +-
 hip/factorization/par_ilu_kernels.hip.cpp                   | 2 +-
 hip/matrix/csr_kernels.hip.cpp                              | 2 +-
 hip/matrix/dense_kernels.hip.cpp                            | 2 +-
 hip/matrix/ell_kernels.hip.cpp                              | 2 +-
 hip/matrix/hybrid_kernels.hip.cpp                           | 2 +-
 hip/matrix/sellp_kernels.hip.cpp                            | 2 +-
 omp/CMakeLists.txt                                          | 2 +-
 .../common_kernels.cpp => components/prefix_sum.cpp}        | 2 +-
 omp/factorization/par_ilu_kernels.cpp                       | 2 +-
 omp/matrix/csr_kernels.cpp                                  | 2 +-
 reference/CMakeLists.txt                                    | 2 +-
 .../common_kernels.cpp => components/prefix_sum.cpp}        | 2 +-
 reference/factorization/par_ilu_kernels.cpp                 | 2 +-
 reference/matrix/csr_kernels.cpp                            | 2 +-
 25 files changed, 27 insertions(+), 27 deletions(-)
 rename core/{matrix/common_kernels.hpp => components/prefix_sum.hpp} (94%)
 rename cuda/{matrix/common_kernels.cu => components/prefix_sum.cu} (98%)
 rename hip/{matrix/common_kernels.hip.cpp => components/prefix_sum.hip.cpp} (98%)
 rename omp/{matrix/common_kernels.cpp => components/prefix_sum.cpp} (98%)
 rename reference/{matrix/common_kernels.cpp => components/prefix_sum.cpp} (98%)

diff --git a/core/matrix/common_kernels.hpp b/core/components/prefix_sum.hpp
similarity index 94%
rename from core/matrix/common_kernels.hpp
rename to core/components/prefix_sum.hpp
index dd44730a4be..a2e3bdc808b 100644
--- a/core/matrix/common_kernels.hpp
+++ b/core/components/prefix_sum.hpp
@@ -30,8 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-#ifndef GKO_CORE_MATRIX_COMMON_KERNELS_HPP_
-#define GKO_CORE_MATRIX_COMMON_KERNELS_HPP_
+#ifndef GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
+#define GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
 
 
 #include <memory>
@@ -89,4 +89,4 @@ GKO_DECLARE_ALL_AS_TEMPLATES;
 }  // namespace kernels
 }  // namespace gko
 
-#endif  // GKO_CORE_MATRIX_COMMON_KERNELS_HPP_
\ No newline at end of file
+#endif  // GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
\ No newline at end of file
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
index 96d7ed44a13..9a9bf302586 100644
--- a/cuda/CMakeLists.txt
+++ b/cuda/CMakeLists.txt
@@ -62,9 +62,9 @@ target_sources(ginkgo_cuda
         base/exception.cpp
         base/executor.cpp
         base/version.cpp
+        components/prefix_sum.cu
         components/zero_array.cu
         factorization/par_ilu_kernels.cu
-        matrix/common_kernels.cu
         matrix/coo_kernels.cu
         matrix/csr_kernels.cu
         matrix/dense_kernels.cu
diff --git a/cuda/matrix/common_kernels.cu b/cuda/components/prefix_sum.cu
similarity index 98%
rename from cuda/matrix/common_kernels.cu
rename to cuda/components/prefix_sum.cu
index 6952e1d4f21..83a01233d6c 100644
--- a/cuda/matrix/common_kernels.cu
+++ b/cuda/components/prefix_sum.cu
@@ -31,7 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 
 
 #include "cuda/components/prefix_sum.cuh"
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
index 1cd41b6b1c0..14894a9ef1d 100644
--- a/cuda/factorization/par_ilu_kernels.cu
+++ b/cuda/factorization/par_ilu_kernels.cu
@@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/coo.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
 
diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
index 6f4280ced10..fdf0bf25e1c 100644
--- a/cuda/matrix/csr_kernels.cu
+++ b/cuda/matrix/csr_kernels.cu
@@ -46,7 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
diff --git a/cuda/matrix/dense_kernels.cu b/cuda/matrix/dense_kernels.cu
index 4b6a6c3cbbc..b7e656e858d 100644
--- a/cuda/matrix/dense_kernels.cu
+++ b/cuda/matrix/dense_kernels.cu
@@ -42,7 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "cuda/base/config.hpp"
 #include "cuda/base/cublas_bindings.hpp"
 #include "cuda/base/pointer_mode_guard.hpp"
diff --git a/cuda/matrix/ell_kernels.cu b/cuda/matrix/ell_kernels.cu
index 89ce0f9d27e..4dba873649c 100644
--- a/cuda/matrix/ell_kernels.cu
+++ b/cuda/matrix/ell_kernels.cu
@@ -43,7 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "cuda/base/config.hpp"
diff --git a/cuda/matrix/hybrid_kernels.cu b/cuda/matrix/hybrid_kernels.cu
index fc9bdc4c1dd..5071f51ee2a 100644
--- a/cuda/matrix/hybrid_kernels.cu
+++ b/cuda/matrix/hybrid_kernels.cu
@@ -37,7 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/coo_kernels.hpp"
 #include "core/matrix/ell_kernels.hpp"
 #include "cuda/base/config.hpp"
diff --git a/cuda/matrix/sellp_kernels.cu b/cuda/matrix/sellp_kernels.cu
index 024ed44b38e..2cc8ad2cc22 100644
--- a/cuda/matrix/sellp_kernels.cu
+++ b/cuda/matrix/sellp_kernels.cu
@@ -40,7 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "cuda/base/config.hpp"
 #include "cuda/base/cusparse_bindings.hpp"
 #include "cuda/base/types.hpp"
diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt
index a0b1b938167..5bedaf1eea8 100644
--- a/hip/CMakeLists.txt
+++ b/hip/CMakeLists.txt
@@ -94,9 +94,9 @@ set(GINKGO_HIP_SOURCES
     base/exception.hip.cpp
     base/executor.hip.cpp
     base/version.hip.cpp
+    components/prefix_sum.hip.cpp
     components/zero_array.hip.cpp
     factorization/par_ilu_kernels.hip.cpp
-    matrix/common_kernels.hip.cpp
     matrix/coo_kernels.hip.cpp
     matrix/csr_kernels.hip.cpp
     matrix/dense_kernels.hip.cpp
diff --git a/hip/matrix/common_kernels.hip.cpp b/hip/components/prefix_sum.hip.cpp
similarity index 98%
rename from hip/matrix/common_kernels.hip.cpp
rename to hip/components/prefix_sum.hip.cpp
index a3cc319727c..f0283b6c80e 100644
--- a/hip/matrix/common_kernels.hip.cpp
+++ b/hip/components/prefix_sum.hip.cpp
@@ -31,7 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 
 
 #include "hip/components/prefix_sum.hip.hpp"
diff --git a/hip/factorization/par_ilu_kernels.hip.cpp b/hip/factorization/par_ilu_kernels.hip.cpp
index 86170cda799..58d957c45b6 100644
--- a/hip/factorization/par_ilu_kernels.hip.cpp
+++ b/hip/factorization/par_ilu_kernels.hip.cpp
@@ -41,7 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/coo.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "hip/base/math.hip.hpp"
 #include "hip/base/types.hip.hpp"
 
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index c1ff237074e..2aa7eb3c055 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -49,7 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
diff --git a/hip/matrix/dense_kernels.hip.cpp b/hip/matrix/dense_kernels.hip.cpp
index 99600409a86..46540b33a52 100644
--- a/hip/matrix/dense_kernels.hip.cpp
+++ b/hip/matrix/dense_kernels.hip.cpp
@@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sparsity_csr.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "hip/base/config.hip.hpp"
 #include "hip/base/hipblas_bindings.hip.hpp"
 #include "hip/base/pointer_mode_guard.hip.hpp"
diff --git a/hip/matrix/ell_kernels.hip.cpp b/hip/matrix/ell_kernels.hip.cpp
index d2a17d6e525..835a4a6fc34 100644
--- a/hip/matrix/ell_kernels.hip.cpp
+++ b/hip/matrix/ell_kernels.hip.cpp
@@ -46,7 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
 #include "hip/base/config.hip.hpp"
diff --git a/hip/matrix/hybrid_kernels.hip.cpp b/hip/matrix/hybrid_kernels.hip.cpp
index 0e45df63125..397e62fe2b7 100644
--- a/hip/matrix/hybrid_kernels.hip.cpp
+++ b/hip/matrix/hybrid_kernels.hip.cpp
@@ -40,7 +40,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/ell.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/coo_kernels.hpp"
 #include "core/matrix/ell_kernels.hpp"
 #include "hip/base/config.hip.hpp"
diff --git a/hip/matrix/sellp_kernels.hip.cpp b/hip/matrix/sellp_kernels.hip.cpp
index 10a513d01a7..49f6ace3c15 100644
--- a/hip/matrix/sellp_kernels.hip.cpp
+++ b/hip/matrix/sellp_kernels.hip.cpp
@@ -43,7 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/dense.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "hip/base/config.hip.hpp"
 #include "hip/base/hipsparse_bindings.hip.hpp"
 #include "hip/base/types.hip.hpp"
diff --git a/omp/CMakeLists.txt b/omp/CMakeLists.txt
index d1c585bd6c7..2206f74659c 100644
--- a/omp/CMakeLists.txt
+++ b/omp/CMakeLists.txt
@@ -4,8 +4,8 @@ add_library(ginkgo_omp $<TARGET_OBJECTS:ginkgo_omp_device> "")
 target_sources(ginkgo_omp
     PRIVATE
         base/version.cpp
+        components/prefix_sum.cpp
         factorization/par_ilu_kernels.cpp
-        matrix/common_kernels.cpp
         matrix/coo_kernels.cpp
         matrix/csr_kernels.cpp
         matrix/dense_kernels.cpp
diff --git a/omp/matrix/common_kernels.cpp b/omp/components/prefix_sum.cpp
similarity index 98%
rename from omp/matrix/common_kernels.cpp
rename to omp/components/prefix_sum.cpp
index f5dabe7db3e..7f487a552fb 100644
--- a/omp/matrix/common_kernels.cpp
+++ b/omp/components/prefix_sum.cpp
@@ -31,7 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 
 
 namespace gko {
diff --git a/omp/factorization/par_ilu_kernels.cpp b/omp/factorization/par_ilu_kernels.cpp
index 1a35445d0e6..0a6f69d19f6 100644
--- a/omp/factorization/par_ilu_kernels.cpp
+++ b/omp/factorization/par_ilu_kernels.cpp
@@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 
 
 namespace gko {
diff --git a/omp/matrix/csr_kernels.cpp b/omp/matrix/csr_kernels.cpp
index dad2c64ce03..3552046a491 100644
--- a/omp/matrix/csr_kernels.cpp
+++ b/omp/matrix/csr_kernels.cpp
@@ -53,7 +53,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/base/iterator_factory.hpp"
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "omp/components/format_conversion.hpp"
 
diff --git a/reference/CMakeLists.txt b/reference/CMakeLists.txt
index 2b94ef0ebad..bb7c9ac906a 100644
--- a/reference/CMakeLists.txt
+++ b/reference/CMakeLists.txt
@@ -2,8 +2,8 @@ add_library(ginkgo_reference $<TARGET_OBJECTS:ginkgo_reference_device> "")
 target_sources(ginkgo_reference
     PRIVATE
         base/version.cpp
+        components/prefix_sum.cpp
         factorization/par_ilu_kernels.cpp
-        matrix/common_kernels.cpp
         matrix/coo_kernels.cpp
         matrix/csr_kernels.cpp
         matrix/dense_kernels.cpp
diff --git a/reference/matrix/common_kernels.cpp b/reference/components/prefix_sum.cpp
similarity index 98%
rename from reference/matrix/common_kernels.cpp
rename to reference/components/prefix_sum.cpp
index 9b721cdb3f2..b1dadd229d0 100644
--- a/reference/matrix/common_kernels.cpp
+++ b/reference/components/prefix_sum.cpp
@@ -31,7 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 
 
 namespace gko {
diff --git a/reference/factorization/par_ilu_kernels.cpp b/reference/factorization/par_ilu_kernels.cpp
index 9f68a550961..01e9a1d5ae7 100644
--- a/reference/factorization/par_ilu_kernels.cpp
+++ b/reference/factorization/par_ilu_kernels.cpp
@@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/csr.hpp>
 
 
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 
 
 namespace gko {
diff --git a/reference/matrix/csr_kernels.cpp b/reference/matrix/csr_kernels.cpp
index 2d22ddb58e9..de0abeef99f 100644
--- a/reference/matrix/csr_kernels.cpp
+++ b/reference/matrix/csr_kernels.cpp
@@ -54,7 +54,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 
 #include "core/base/iterator_factory.hpp"
-#include "core/matrix/common_kernels.hpp"
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "reference/components/format_conversion.hpp"
 

From bfa9a807d8f48c5af100396e6f4b0a4186ee3b4f Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Wed, 8 Jan 2020 12:00:41 +0100
Subject: [PATCH 3/4] add prefix_sum tests

---
 cuda/test/components/CMakeLists.txt      |  1 +
 cuda/test/components/prefix_sum.cu       | 94 ++++++++++++++++++++++++
 hip/test/components/CMakeLists.txt       |  1 +
 hip/test/components/prefix_sum.hip.cpp   | 94 ++++++++++++++++++++++++
 omp/test/CMakeLists.txt                  |  1 +
 omp/test/components/CMakeLists.txt       |  1 +
 omp/test/components/prefix_sum.cpp       | 93 +++++++++++++++++++++++
 reference/test/CMakeLists.txt            |  1 +
 reference/test/components/CMakeLists.txt |  1 +
 reference/test/components/prefix_sum.cpp | 70 ++++++++++++++++++
 10 files changed, 357 insertions(+)
 create mode 100644 cuda/test/components/prefix_sum.cu
 create mode 100644 hip/test/components/prefix_sum.hip.cpp
 create mode 100644 omp/test/components/CMakeLists.txt
 create mode 100644 omp/test/components/prefix_sum.cpp
 create mode 100644 reference/test/components/CMakeLists.txt
 create mode 100644 reference/test/components/prefix_sum.cpp

diff --git a/cuda/test/components/CMakeLists.txt b/cuda/test/components/CMakeLists.txt
index 5d0301b6552..4b501e93b4c 100644
--- a/cuda/test/components/CMakeLists.txt
+++ b/cuda/test/components/CMakeLists.txt
@@ -1 +1,2 @@
+ginkgo_create_cuda_test(prefix_sum)
 ginkgo_create_cuda_test(sorting)
\ No newline at end of file
diff --git a/cuda/test/components/prefix_sum.cu b/cuda/test/components/prefix_sum.cu
new file mode 100644
index 00000000000..df24d3f646d
--- /dev/null
+++ b/cuda/test/components/prefix_sum.cu
@@ -0,0 +1,94 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+namespace {
+
+
+class PrefixSum : public ::testing::Test {
+protected:
+    using index_type = gko::int32;
+    PrefixSum()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::CudaExecutor::create(0, ref)),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          dvals(exec)
+    {
+        std::uniform_int_distribution<index_type> dist(0, 1000);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+        }
+        dvals = vals;
+    }
+
+    void test(gko::size_type size)
+    {
+        gko::kernels::reference::prefix_sum(ref, vals.get_data(), size);
+        gko::kernels::cuda::prefix_sum(exec, dvals.get_data(), size);
+
+        gko::Array<index_type> dresult(ref, dvals);
+        auto dptr = dresult.get_const_data();
+        auto ptr = vals.get_const_data();
+        ASSERT_TRUE(std::equal(ptr, ptr + size, dptr));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::CudaExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    gko::Array<index_type> vals;
+    gko::Array<index_type> dvals;
+};
+
+
+TEST_F(PrefixSum, SmallEqualsReference) { test(100); }
+
+
+TEST_F(PrefixSum, BigEqualsReference) { test(total_size); }
+
+
+}  // namespace
diff --git a/hip/test/components/CMakeLists.txt b/hip/test/components/CMakeLists.txt
index e3c6ca06f01..9824e97743f 100644
--- a/hip/test/components/CMakeLists.txt
+++ b/hip/test/components/CMakeLists.txt
@@ -1 +1,2 @@
+ginkgo_create_hip_test(prefix_sum)
 ginkgo_create_hip_test(sorting)
\ No newline at end of file
diff --git a/hip/test/components/prefix_sum.hip.cpp b/hip/test/components/prefix_sum.hip.cpp
new file mode 100644
index 00000000000..690ec1847b0
--- /dev/null
+++ b/hip/test/components/prefix_sum.hip.cpp
@@ -0,0 +1,94 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+namespace {
+
+
+class PrefixSum : public ::testing::Test {
+protected:
+    using index_type = gko::int32;
+    PrefixSum()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::HipExecutor::create(0, ref)),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          dvals(exec)
+    {
+        std::uniform_int_distribution<index_type> dist(0, 1000);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+        }
+        dvals = vals;
+    }
+
+    void test(gko::size_type size)
+    {
+        gko::kernels::reference::prefix_sum(ref, vals.get_data(), size);
+        gko::kernels::hip::prefix_sum(exec, dvals.get_data(), size);
+
+        gko::Array<index_type> dresult(ref, dvals);
+        auto dptr = dresult.get_const_data();
+        auto ptr = vals.get_const_data();
+        ASSERT_TRUE(std::equal(ptr, ptr + size, dptr));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::HipExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    gko::Array<index_type> vals;
+    gko::Array<index_type> dvals;
+};
+
+
+TEST_F(PrefixSum, SmallEqualsReference) { test(100); }
+
+
+TEST_F(PrefixSum, BigEqualsReference) { test(total_size); }
+
+
+}  // namespace
diff --git a/omp/test/CMakeLists.txt b/omp/test/CMakeLists.txt
index f3fb866163a..d746413f53f 100644
--- a/omp/test/CMakeLists.txt
+++ b/omp/test/CMakeLists.txt
@@ -1,5 +1,6 @@
 include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake)
 
+add_subdirectory(components)
 add_subdirectory(factorization)
 add_subdirectory(matrix)
 add_subdirectory(preconditioner)
diff --git a/omp/test/components/CMakeLists.txt b/omp/test/components/CMakeLists.txt
new file mode 100644
index 00000000000..faffe4e94e1
--- /dev/null
+++ b/omp/test/components/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_create_test(prefix_sum)
\ No newline at end of file
diff --git a/omp/test/components/prefix_sum.cpp b/omp/test/components/prefix_sum.cpp
new file mode 100644
index 00000000000..b0c24ab7a63
--- /dev/null
+++ b/omp/test/components/prefix_sum.cpp
@@ -0,0 +1,93 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include <memory>
+#include <random>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+#include <ginkgo/core/base/array.hpp>
+
+
+namespace {
+
+
+class PrefixSum : public ::testing::Test {
+protected:
+    using index_type = gko::int32;
+    PrefixSum()
+        : ref(gko::ReferenceExecutor::create()),
+          exec(gko::OmpExecutor::create()),
+          rand(293),
+          total_size(42793),
+          vals(ref, total_size),
+          dvals(exec)
+    {
+        std::uniform_int_distribution<index_type> dist(0, 1000);
+        for (gko::size_type i = 0; i < total_size; ++i) {
+            vals.get_data()[i] = dist(rand);
+        }
+        dvals = vals;
+    }
+
+    void test(gko::size_type size)
+    {
+        gko::kernels::reference::prefix_sum(ref, vals.get_data(), size);
+        gko::kernels::omp::prefix_sum(exec, dvals.get_data(), size);
+
+        auto dptr = dvals.get_const_data();
+        auto ptr = vals.get_const_data();
+        ASSERT_TRUE(std::equal(ptr, ptr + size, dptr));
+    }
+
+    std::shared_ptr<gko::ReferenceExecutor> ref;
+    std::shared_ptr<gko::OmpExecutor> exec;
+    std::default_random_engine rand;
+    gko::size_type total_size;
+    gko::Array<index_type> vals;
+    gko::Array<index_type> dvals;
+};
+
+
+TEST_F(PrefixSum, SmallEqualsReference) { test(100); }
+
+
+TEST_F(PrefixSum, BigEqualsReference) { test(total_size); }
+
+
+}  // namespace
diff --git a/reference/test/CMakeLists.txt b/reference/test/CMakeLists.txt
index f0e39d5c568..b359d8146ed 100644
--- a/reference/test/CMakeLists.txt
+++ b/reference/test/CMakeLists.txt
@@ -1,6 +1,7 @@
 include(${CMAKE_SOURCE_DIR}/cmake/create_test.cmake)
 
 add_subdirectory(base)
+add_subdirectory(components)
 add_subdirectory(factorization)
 add_subdirectory(log)
 add_subdirectory(matrix)
diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt
new file mode 100644
index 00000000000..faffe4e94e1
--- /dev/null
+++ b/reference/test/components/CMakeLists.txt
@@ -0,0 +1 @@
+ginkgo_create_test(prefix_sum)
\ No newline at end of file
diff --git a/reference/test/components/prefix_sum.cpp b/reference/test/components/prefix_sum.cpp
new file mode 100644
index 00000000000..2dfa31d5330
--- /dev/null
+++ b/reference/test/components/prefix_sum.cpp
@@ -0,0 +1,70 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+
+#include <gtest/gtest.h>
+
+
+namespace {
+
+
+class PrefixSum : public ::testing::Test {
+protected:
+    using index_type = gko::int32;
+    PrefixSum()
+        : exec(gko::ReferenceExecutor::create()),
+          vals{3, 5, 6, 7, 1, 5, 9, 7, 2, 0, 5},
+          expected{0, 3, 8, 14, 21, 22, 27, 36, 43, 45, 45}
+    {}
+
+    std::shared_ptr<const gko::ReferenceExecutor> exec;
+    std::vector<index_type> vals;
+    std::vector<index_type> expected;
+};
+
+
+TEST_F(PrefixSum, Works)
+{
+    gko::kernels::reference::prefix_sum(exec, vals.data(), vals.size());
+
+    ASSERT_EQ(vals, expected);
+}
+
+
+}  // namespace

From 4abd1ebb5996a2538f89059060c9832d7ed32ca6 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel <ribizel@kit.edu>
Date: Sun, 12 Jan 2020 19:22:39 +0100
Subject: [PATCH 4/4] improve formatting

Co-Authored-By: Yuhsiang M. Tsai <yhmtsai@gmail.com>
---
 cuda/components/prefix_sum.cu       | 4 +---
 hip/components/prefix_sum.hip.cpp   | 4 +---
 omp/components/prefix_sum.cpp       | 4 +---
 reference/components/prefix_sum.cpp | 4 +---
 4 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu
index 83a01233d6c..ca273917faa 100644
--- a/cuda/components/prefix_sum.cu
+++ b/cuda/components/prefix_sum.cu
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include "core/components/prefix_sum.hpp"
 
 
@@ -60,7 +59,6 @@ void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
                                                 block_sums);
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
 
 // explicitly instantiate for size_type as well, as this is used in the SellP
@@ -71,4 +69,4 @@ template void prefix_sum<size_type>(std::shared_ptr<const CudaExecutor> exec,
 
 }  // namespace cuda
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/hip/components/prefix_sum.hip.cpp b/hip/components/prefix_sum.hip.cpp
index f0283b6c80e..69907ea002f 100644
--- a/hip/components/prefix_sum.hip.cpp
+++ b/hip/components/prefix_sum.hip.cpp
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include "core/components/prefix_sum.hpp"
 
 
@@ -61,7 +60,6 @@ void prefix_sum(std::shared_ptr<const HipExecutor> exec, IndexType *counts,
         counts, block_sums);
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
 
 // explicitly instantiate for size_type as well, as this is used in the SellP
@@ -72,4 +70,4 @@ template void prefix_sum<size_type>(std::shared_ptr<const HipExecutor> exec,
 
 }  // namespace hip
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/omp/components/prefix_sum.cpp b/omp/components/prefix_sum.cpp
index 7f487a552fb..9c8f7428aa6 100644
--- a/omp/components/prefix_sum.cpp
+++ b/omp/components/prefix_sum.cpp
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include "core/components/prefix_sum.hpp"
 
 
@@ -51,7 +50,6 @@ void prefix_sum(std::shared_ptr<const OmpExecutor> exec, IndexType *counts,
     }
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
 
 // explicitly instantiate for size_type as well, as this is used in the SellP
@@ -62,4 +60,4 @@ template void prefix_sum<size_type>(std::shared_ptr<const OmpExecutor> exec,
 
 }  // namespace omp
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko
diff --git a/reference/components/prefix_sum.cpp b/reference/components/prefix_sum.cpp
index b1dadd229d0..f70e934539e 100644
--- a/reference/components/prefix_sum.cpp
+++ b/reference/components/prefix_sum.cpp
@@ -30,7 +30,6 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ******************************<GINKGO LICENSE>*******************************/
 
-
 #include "core/components/prefix_sum.hpp"
 
 
@@ -51,7 +50,6 @@ void prefix_sum(std::shared_ptr<const ReferenceExecutor> exec,
     }
 }
 
-
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
 
 // explicitly instantiate for size_type as well, as this is used in the SellP
@@ -63,4 +61,4 @@ template void prefix_sum<size_type>(
 
 }  // namespace reference
 }  // namespace kernels
-}  // namespace gko
\ No newline at end of file
+}  // namespace gko