Extract prefix sum computations

This PR introduces a new prefix_sum kernel that is used instead of manually computing prefix sums in different places. This also fixes two bugs: * In CUDA/HIP ParILU, the wrong num_blocks is used for launching the prefix sum kernels * In Reference ParILU, the handling of non-existent diagonals during initialization is incorrect. Finally, it removes a few unnecessary calls to Array::clear since they would be cleaned up via RAII anyways. Related PR: #429
ginkgo-project · Jan 13, 2020 · d53f39b · d53f39b
2 parents 1df07c0 + 4abd1eb
commit d53f39b
Show file tree

Hide file tree

Showing 38 changed files with 841 additions and 330 deletions.
diff --git a/core/components/prefix_sum.hpp b/core/components/prefix_sum.hpp
@@ -0,0 +1,92 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#ifndef GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
+#define GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
+
+
+#include <memory>
+
+
+#include <ginkgo/core/base/executor.hpp>
+#include <ginkgo/core/base/types.hpp>
+
+
+namespace gko {
+namespace kernels {
+
+
+#define GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType)                 \
+    void prefix_sum(std::shared_ptr<const DefaultExecutor> exec, \
+                    IndexType *counts, size_type num_entries)
+
+
+#define GKO_DECLARE_ALL_AS_TEMPLATES \
+    template <typename IndexType>    \
+    GKO_DECLARE_PREFIX_SUM_KERNEL(IndexType)
+
+
+namespace omp {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace omp
+
+
+namespace cuda {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace cuda
+
+
+namespace reference {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace reference
+
+
+namespace hip {
+
+GKO_DECLARE_ALL_AS_TEMPLATES;
+
+}  // namespace hip
+
+
+#undef GKO_DECLARE_ALL_AS_TEMPLATES
+
+
+}  // namespace kernels
+}  // namespace gko
+
+#endif  // GKO_CORE_COMPONENTS_PREFIX_SUM_HPP_
diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt
@@ -62,6 +62,7 @@ target_sources(ginkgo_cuda
         base/exception.cpp
         base/executor.cpp
         base/version.cpp
+        components/prefix_sum.cu
         components/zero_array.cu
         factorization/par_ilu_kernels.cu
         matrix/coo_kernels.cu

diff --git a/cuda/components/prefix_sum.cu b/cuda/components/prefix_sum.cu
@@ -0,0 +1,72 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2020, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+#include "core/components/prefix_sum.hpp"
+
+
+#include "cuda/components/prefix_sum.cuh"
+
+
+namespace gko {
+namespace kernels {
+namespace cuda {
+
+
+constexpr int prefix_sum_block_size = 512;
+
+
+template <typename IndexType>
+void prefix_sum(std::shared_ptr<const CudaExecutor> exec, IndexType *counts,
+                size_type num_entries)
+{
+    auto num_blocks = ceildiv(num_entries, prefix_sum_block_size);
+    Array<IndexType> block_sum_array(exec, num_blocks);
+    auto block_sums = block_sum_array.get_data();
+    start_prefix_sum<prefix_sum_block_size>
+        <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
+                                                block_sums);
+    finalize_prefix_sum<prefix_sum_block_size>
+        <<<num_blocks, prefix_sum_block_size>>>(num_entries, counts,
+                                                block_sums);
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_KERNEL);
+
+// explicitly instantiate for size_type as well, as this is used in the SellP
+// format
+template void prefix_sum<size_type>(std::shared_ptr<const CudaExecutor> exec,
+                                    size_type *counts, size_type num_entries);
+
+
+}  // namespace cuda
+}  // namespace kernels
+}  // namespace gko
diff --git a/cuda/factorization/par_ilu_kernels.cu b/cuda/factorization/par_ilu_kernels.cu
@@ -38,9 +38,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/coo.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "cuda/base/math.hpp"
 #include "cuda/base/types.hpp"
-#include "cuda/components/prefix_sum.cuh"
 
 
 namespace gko {
@@ -67,7 +67,6 @@ void initialize_row_ptrs_l_u(
     IndexType *l_row_ptrs, IndexType *u_row_ptrs)
 {
     const size_type num_rows{system_matrix->get_size()[0]};
-    const size_type num_row_ptrs{num_rows + 1};
 
     const dim3 block_size{default_block_size, 1, 1};
     const uint32 number_blocks =
@@ -80,18 +79,8 @@ void initialize_row_ptrs_l_u(
         as_cuda_type(system_matrix->get_const_values()),
         as_cuda_type(l_row_ptrs), as_cuda_type(u_row_ptrs));
 
-    Array<IndexType> block_sum(exec, grid_dim.x);
-    auto block_sum_ptr = block_sum.get_data();
-
-    start_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr));
-    finalize_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(l_row_ptrs), as_cuda_type(block_sum_ptr));
-
-    start_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr));
-    finalize_prefix_sum<default_block_size><<<grid_dim, block_size>>>(
-        num_row_ptrs, as_cuda_type(u_row_ptrs), as_cuda_type(block_sum_ptr));
+    prefix_sum(exec, l_row_ptrs, num_rows + 1);
+    prefix_sum(exec, u_row_ptrs, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu
@@ -46,6 +46,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <ginkgo/core/matrix/sellp.hpp>
 
 
+#include "core/components/prefix_sum.hpp"
 #include "core/matrix/csr_builder.hpp"
 #include "core/matrix/dense_kernels.hpp"
 #include "core/synthesizer/implementation_selection.hpp"
@@ -56,7 +57,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "cuda/base/types.hpp"
 #include "cuda/components/atomic.cuh"
 #include "cuda/components/cooperative_groups.cuh"
-#include "cuda/components/prefix_sum.cuh"
 #include "cuda/components/reduction.cuh"
 #include "cuda/components/segment_scan.cuh"
 #include "cuda/components/uninitialized_array.hpp"
@@ -677,27 +677,14 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
         as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
         as_cuda_type(slice_sets));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(slice_num + 1, default_block_size));
-    grid_dim = ceildiv(slice_num + 1, default_block_size);
-
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_data()));
-
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        slice_num + 1, as_cuda_type(slice_sets),
-        as_cuda_type(add_values.get_const_data()));
+    prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
     kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
         num_rows, slice_size, as_cuda_type(source_values),
         as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs),
         as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
         as_cuda_type(result_col_idxs), as_cuda_type(result_values));
-
-    nnz_per_row.clear();
-    add_values.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -779,11 +766,6 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-
-    block_results.clear();
-    nnz_per_row.clear();
-    max_nnz_per_slice.clear();
-    d_result.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -917,10 +899,6 @@ void calculate_max_nnz_per_row(std::shared_ptr<const CudaExecutor> exec,
 
     exec->get_master()->copy_from(exec.get(), 1, d_result.get_const_data(),
                                   result);
-
-    nnz_per_row.clear();
-    block_results.clear();
-    d_result.clear();
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -952,15 +930,7 @@ void convert_to_hybrid(std::shared_ptr<const CudaExecutor> exec,
         num_rows, max_nnz_per_row, as_cuda_type(source->get_const_row_ptrs()),
         as_cuda_type(coo_offset.get_data()));
 
-    auto add_values =
-        Array<size_type>(exec, ceildiv(num_rows, default_block_size));
-    grid_dim = ceildiv(num_rows, default_block_size);
-    start_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(coo_offset.get_data()),
-        as_cuda_type(add_values.get_data()));
-    finalize_prefix_sum<default_block_size><<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(coo_offset.get_data()),
-        as_cuda_type(add_values.get_const_data()));
+    prefix_sum(exec, coo_offset.get_data(), num_rows);
 
     grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
     kernel::fill_in_hybrid<<<grid_dim, default_block_size>>>(