Merge guarded and improved prefix_sum

This guards prefix_sum kernels against overflows and drastically improves their performance. Related PR: #1303
ginkgo-project · Mar 22, 2023 · 8695b65 · 8695b65
2 parents f428910 + 0460931
commit 8695b65
Show file tree

Hide file tree

Showing 74 changed files with 465 additions and 258 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -393,60 +393,35 @@ build/cuda114/nompi/gcc/cuda/debug/shared:
     EXTRA_CMAKE_FLAGS: "-DCMAKE_CUDA_FLAGS=-diag-suppress=177"
     CUDA_ARCH: 61
 
-# ROCm 4.0 and friends
-build/amd/nompi/gcc/rocm40/debug/shared:
-  extends:
-    - .build_and_test_template
-    - .default_variables
-    - .quick_test_condition
-    - .use_gko-rocm40-openmpi-gnu8-llvm50
-  variables:
-    BUILD_OMP: "ON"
-    BUILD_HIP: "ON"
-    RUN_EXAMPLES: "ON"
-    BUILD_TYPE: "Debug"
-    FAST_TESTS: "ON"
 
-build/amd/openmpi/clang/rocm40/release/static:
+# ROCm 4.5 and friends
+build/amd/nompi/gcc/rocm45/release/shared:
   extends:
     - .build_and_test_template
     - .default_variables
     - .quick_test_condition
-    - .use_gko-rocm40-openmpi-gnu8-llvm50
+    - .use_gko-rocm45-nompi-gnu8-llvm8
   variables:
-    C_COMPILER: "clang"
-    CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
     BUILD_HIP: "ON"
-    BUILD_MPI: "ON"
-    MPI_AS_ROOT: "ON"
+    NONDEFAULT_STREAM: "ON"
+    RUN_EXAMPLES: "ON"
     BUILD_TYPE: "Release"
-    BUILD_SHARED_LIBS: "OFF"
 
-test/amd/openmpi/clang/rocm40/release/static:
+build/amd/nompi/clang/rocm45/release/static:
   extends:
     - .build_and_test_template
     - .default_variables
-    - .quick_test_condition
-    - .use_gko-rocm40-openmpi-gnu8-llvm50
-  variables:
-    USE_NAME: "amd-openmpi-clang-${CI_PIPELINE_ID}"
-  dependencies: null
-  needs: [ "build/amd/openmpi/clang/rocm40/release/static" ]
-
-# ROCm 4.5 and friends
-build/amd/nompi/gcc/rocm45/release/shared:
-  extends:
-    - .build_and_test_template
-    - .default_variables
-    - .quick_test_condition
+    - .full_test_condition
     - .use_gko-rocm45-nompi-gnu8-llvm8
   variables:
+    C_COMPILER: "clang"
+    CXX_COMPILER: "clang++"
     BUILD_OMP: "ON"
     BUILD_HIP: "ON"
-    NONDEFAULT_STREAM: "ON"
     RUN_EXAMPLES: "ON"
     BUILD_TYPE: "Release"
+    BUILD_SHARED_LIBS: "OFF"
 
 build/amd/nompi/clang/rocm45/debug/shared:
   extends:

diff --git a/.gitlab/image.yml b/.gitlab/image.yml
@@ -69,13 +69,6 @@
     - private_ci
     - nvidia-gpu
 
-.use_gko-rocm40-openmpi-gnu8-llvm50:
-  image: ginkgohub/rocm:40-openmpi-gnu8-llvm50
-  tags:
-    - private_ci
-    - nla-gpu
-
-
 .use_gko-rocm45-nompi-gnu8-llvm8:
   image: ginkgohub/rocm:45-mvapich2-gnu8-llvm8
   tags:

diff --git a/README.md b/README.md
@@ -55,7 +55,7 @@ The Ginkgo CUDA module has the following __additional__ requirements:
 
 The Ginkgo HIP module has the following __additional__ requirements:
 
-* _ROCm 4.0+_
+* _ROCm 4.5+_
 *    the HIP, hipBLAS, hipSPARSE, hip/rocRAND and rocThrust packages compiled with either:
     * _AMD_ backend (using the `clang` compiler)
     * _9.2 <= CUDA < 11_ backend

diff --git a/benchmark/blas/blas.cpp b/benchmark/blas/blas.cpp
@@ -100,6 +100,16 @@ std::map<std::string, std::function<std::unique_ptr<BenchmarkOperation>(
              return std::make_unique<AdvancedApplyOperation<Generator>>(
                  exec, Generator{}, dims.n, dims.k, dims.m, dims.stride_A,
                  dims.stride_B, dims.stride_C);
+         }},
+        {"prefix_sum32",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<PrefixSumOperation<gko::int32>>(exec,
+                                                                     dims.n);
+         }},
+        {"prefix_sum64",
+         [](std::shared_ptr<const gko::Executor> exec, dimensions dims) {
+             return std::make_unique<PrefixSumOperation<gko::int64>>(exec,
+                                                                     dims.n);
          }}};
 
 

diff --git a/benchmark/blas/blas_common.hpp b/benchmark/blas/blas_common.hpp
@@ -46,12 +46,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "benchmark/utils/loggers.hpp"
 #include "benchmark/utils/timer.hpp"
 #include "benchmark/utils/types.hpp"
+#include "core/components/prefix_sum_kernels.hpp"
 
 
 // Command-line arguments
 DEFINE_string(
     operations, "copy,axpy,scal",
-    "A comma-separated list of BLAS operations to benchmark.\nCandidates are"
+    "A comma-separated list of operations to benchmark.\nCandidates are\n"
+    "BLAS algorithms:\n"
     "   copy (y = x),\n"
     "   axpy (y = y + a * x),\n"
     "   multiaxpy (like axpy, but a has one entry per column),\n"
@@ -61,6 +63,9 @@ DEFINE_string(
     "   norm (a = sqrt(x' * x)),\n"
     "   mm (C = A * B),\n"
     "   gemm (C = a * A * B + b * C)\n"
+    "Non-numerical algorithms:\n"
+    "   prefix_sum32 (x_i <- sum_{j=0}^{i-1} x_j, 32 bit indices)\n"
+    "   prefix_sum64 (                            64 bit indices)\n"
     "where A has dimensions n x k, B has dimensions k x m,\n"
     "C has dimensions n x m and x and y have dimensions n x r");
 
@@ -354,6 +359,38 @@ class AdvancedApplyOperation : public BenchmarkOperation {
 };
 
 
+// NOTE(review): presumably this macro generates the
+// make_prefix_sum_nonnegative factory used by PrefixSumOperation::run()
+// below -- confirm against the macro definition.
+GKO_REGISTER_OPERATION(prefix_sum_nonnegative,
+                       components::prefix_sum_nonnegative);
+
+
+/**
+ * Benchmark operation that runs the prefix_sum_nonnegative kernel in-place on
+ * an array of n entries that is filled with zeros on construction.
+ *
+ * @tparam IndexType  the element type of the array being scanned
+ */
+template <typename IndexType>
+class PrefixSumOperation : public BenchmarkOperation {
+public:
+    /**
+     * Allocates the benchmark array and fills it with zeros.
+     *
+     * @param exec  the executor the array is created on
+     * @param n  the number of array entries
+     */
+    PrefixSumOperation(std::shared_ptr<const gko::Executor> exec,
+                       gko::size_type n)
+        : array_{exec, n}
+    {
+        array_.fill(0);
+    }
+
+    /** The operation reports zero floating point operations. */
+    gko::size_type get_flops() const override { return 0; }
+
+    /**
+     * Reports two accesses of sizeof(IndexType) bytes per array entry
+     * (presumably one read and one write of the in-place scan).
+     */
+    gko::size_type get_memory() const override
+    {
+        const auto num_elems = array_.get_num_elems();
+        return 2 * sizeof(IndexType) * num_elems;
+    }
+
+    /** Runs the prefix sum kernel on the executor that owns the array. */
+    void run() override
+    {
+        const auto exec = array_.get_executor();
+        exec->run(make_prefix_sum_nonnegative(array_.get_data(),
+                                              array_.get_num_elems()));
+    }
+
+private:
+    gko::array<IndexType> array_;
+};
+
+
+
+
 struct dimensions {
     gko::size_type n;
     gko::size_type k;

diff --git a/benchmark/utils/timer_impl.hpp b/benchmark/utils/timer_impl.hpp
@@ -117,6 +117,7 @@ class Timer {
                 return copy.at(mid);
             }
         }
+        GKO_NOT_IMPLEMENTED;
     }
 
     /**

diff --git a/common/cuda_hip/components/prefix_sum_kernels.hpp.inc b/common/cuda_hip/components/prefix_sum_kernels.hpp.inc
@@ -0,0 +1,86 @@
+/*******************************<GINKGO LICENSE>******************************
+Copyright (c) 2017-2023, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+******************************<GINKGO LICENSE>*******************************/
+
+/**
+ * Binary addition functor for scans over non-negative values that reports
+ * overflow through a sentinel value instead of wrapping around.
+ *
+ * A negative operand is treated as an already overflowed partial sum, so the
+ * sentinel propagates through every later combination.
+ *
+ * @tparam IndexType  the integer type being summed; the negative sentinel
+ *                    relies on it being a signed type.
+ */
+template <typename IndexType>
+struct overflowing_sum {
+    constexpr static IndexType max = std::numeric_limits<IndexType>::max();
+    constexpr static IndexType sentinel = -1;
+
+    /** Returns true iff i is negative, i.e. it marks an overflowed sum. */
+    constexpr static bool did_overflow(IndexType i) { return i < 0; }
+
+    /**
+     * Returns i + j, or the sentinel if either operand already overflowed or
+     * the sum would exceed max.
+     */
+    constexpr IndexType operator()(IndexType i, IndexType j) const
+    {
+        // the checks short-circuit, so max - i is only evaluated for i >= 0
+        // and thus cannot overflow itself
+        return (did_overflow(i) || did_overflow(j) || max - i < j) ? sentinel
+                                                                    : i + j;
+    }
+};
+
+
+/**
+ * Specialization for size_type that uses the largest representable value as
+ * the overflow sentinel instead of a negative one.
+ *
+ * As a consequence, a sum that is exactly equal to max cannot be told apart
+ * from an overflowed sum.
+ */
+template <>
+struct overflowing_sum<size_type> {
+    constexpr static size_type max = std::numeric_limits<size_type>::max();
+    constexpr static size_type sentinel = max;
+
+    /** Returns true iff i equals the sentinel value. */
+    constexpr static bool did_overflow(size_type i) { return i == sentinel; }
+
+    /**
+     * Returns i + j, or the sentinel if either operand already overflowed or
+     * the sum would exceed max.
+     */
+    constexpr size_type operator()(size_type i, size_type j) const
+    {
+        return (did_overflow(i) || did_overflow(j) || max - i < j) ? sentinel
+                                                                    : i + j;
+    }
+};
+
+
+/**
+ * Replaces counts by its exclusive prefix sum in-place and checks the result
+ * for overflow.
+ *
+ * The scan starts from IndexType{} and combines entries with
+ * overflowing_sum<IndexType>, which yields a sentinel instead of wrapping
+ * around. As the scan is exclusive, the last input entry does not contribute
+ * to any output entry.
+ *
+ * @param exec  the executor used to run the scan and to read back the result
+ * @param counts  the array of num_entries values, overwritten by the scan;
+ *                the overflow detection assumes they are non-negative
+ * @param num_entries  the number of entries in counts
+ *
+ * @throws OverflowError  if the last entry of the scanned array is marked as
+ *                        overflowed
+ */
+template <typename IndexType>
+void prefix_sum_nonnegative(std::shared_ptr<const DefaultExecutor> exec,
+                            IndexType* counts, size_type num_entries)
+{
+    thrust::exclusive_scan(thrust_policy(exec), counts, counts + num_entries,
+                           counts, IndexType{}, overflowing_sum<IndexType>{});
+    // an overflowed partial sum turns every later sum into the sentinel, so
+    // only the last entry needs to be copied back and inspected
+    if (num_entries > 0 &&
+        overflowing_sum<IndexType>::did_overflow(
+            exec->copy_val_to_host(counts + num_entries - 1))) {
+        throw OverflowError(__FILE__, __LINE__,
+                            name_demangling::get_type_name(typeid(IndexType)));
+    }
+}
+
+GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_PREFIX_SUM_NONNEGATIVE_KERNEL);
+
+// instantiate for size_type as well, as this is used in the Sellp format
+template void prefix_sum_nonnegative<size_type>(
+    std::shared_ptr<const DefaultExecutor>, size_type*, size_type);
diff --git a/common/cuda_hip/factorization/factorization_kernels.hpp.inc b/common/cuda_hip/factorization/factorization_kernels.hpp.inc
@@ -409,7 +409,7 @@ void add_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec,
         return;
     }
 
-    components::prefix_sum(exec, row_ptrs_add, row_ptrs_size);
+    components::prefix_sum_nonnegative(exec, row_ptrs_add, row_ptrs_size);
     exec->synchronize();
 
     auto total_additions =
@@ -465,8 +465,8 @@ void initialize_row_ptrs_l_u(
             u_row_ptrs);
     }
 
-    components::prefix_sum(exec, l_row_ptrs, num_rows + 1);
-    components::prefix_sum(exec, u_row_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -520,7 +520,7 @@ void initialize_row_ptrs_l(
             as_device_type(system_matrix->get_const_values()), l_row_ptrs);
     }
 
-    components::prefix_sum(exec, l_row_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

diff --git a/common/cuda_hip/preconditioner/isai_kernels.hpp.inc b/common/cuda_hip/preconditioner/isai_kernels.hpp.inc
@@ -461,8 +461,8 @@ void generate_tri_inverse(std::shared_ptr<const DefaultExecutor> exec,
                     excess_nz_ptrs);
         }
     }
-    components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1);
-    components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, excess_rhs_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -490,8 +490,8 @@ void generate_general_inverse(std::shared_ptr<const DefaultExecutor> exec,
                 as_device_type(inverse->get_values()), excess_rhs_ptrs,
                 excess_nz_ptrs, spd);
     }
-    components::prefix_sum(exec, excess_rhs_ptrs, num_rows + 1);
-    components::prefix_sum(exec, excess_nz_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, excess_rhs_ptrs, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, excess_nz_ptrs, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp
@@ -100,8 +100,8 @@ void build_from_mapping(std::shared_ptr<const DefaultExecutor> exec,
             range_starting_index[i] = cur_part != prev_part ? 1 : 0;
         },
         mapping.get_num_elems(), mapping, range_starting_index);
-    components::prefix_sum(exec, range_starting_index.get_data(),
-                           mapping.get_num_elems() + 1);
+    components::prefix_sum_nonnegative(exec, range_starting_index.get_data(),
+                                       mapping.get_num_elems() + 1);
     run_kernel(
         exec,
         [] GKO_KERNEL(auto i, auto size, auto mapping,
@@ -140,7 +140,8 @@ void build_ranges_from_global_size(std::shared_ptr<const DefaultExecutor> exec,
             ranges[i] = size_per_part + (i < rest ? 1 : 0);
         },
         ranges.get_num_elems() - 1, size_per_part, rest, ranges.get_data());
-    components::prefix_sum(exec, ranges.get_data(), ranges.get_num_elems());
+    components::prefix_sum_nonnegative(exec, ranges.get_data(),
+                                       ranges.get_num_elems());
 }
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE);

diff --git a/common/unified/matrix/csr_kernels.cpp b/common/unified/matrix/csr_kernels.cpp
@@ -280,7 +280,7 @@ void build_lookup_offsets(std::shared_ptr<const DefaultExecutor> exec,
             }
         },
         num_rows, row_ptrs, col_idxs, num_rows, allowed, storage_offsets);
-    components::prefix_sum(exec, storage_offsets, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, storage_offsets, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(

diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp
@@ -354,7 +354,7 @@ void compute_slice_sets(std::shared_ptr<const DefaultExecutor> exec,
         gko::dim<2>{num_slices, slice_size}, row_nnz, slice_size, stride_factor,
         num_rows);
     exec->copy(num_slices, slice_lengths, slice_sets);
-    components::prefix_sum(exec, slice_sets, num_slices + 1);
+    components::prefix_sum_nonnegative(exec, slice_sets, num_slices + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(

diff --git a/common/unified/matrix/hybrid_kernels.cpp b/common/unified/matrix/hybrid_kernels.cpp
@@ -59,7 +59,8 @@ void compute_coo_row_ptrs(std::shared_ptr<const DefaultExecutor> exec,
                                                static_cast<int64>(ell_lim));
         },
         row_nnz.get_num_elems(), row_nnz, ell_lim, coo_row_ptrs);
-    components::prefix_sum(exec, coo_row_ptrs, row_nnz.get_num_elems() + 1);
+    components::prefix_sum_nonnegative(exec, coo_row_ptrs,
+                                       row_nnz.get_num_elems() + 1);
 }
 
 

diff --git a/common/unified/matrix/sellp_kernels.cpp b/common/unified/matrix/sellp_kernels.cpp
@@ -77,7 +77,7 @@ void compute_slice_sets(std::shared_ptr<const DefaultExecutor> exec,
         gko::dim<2>{num_slices, slice_size}, row_ptrs, slice_size,
         stride_factor, num_rows);
     exec->copy(num_slices, slice_lengths, slice_sets);
-    components::prefix_sum(exec, slice_sets, num_slices + 1);
+    components::prefix_sum_nonnegative(exec, slice_sets, num_slices + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(

diff --git a/common/unified/matrix/sparsity_csr_kernels.cpp b/common/unified/matrix/sparsity_csr_kernels.cpp
@@ -97,7 +97,7 @@ void diagonal_element_prefix_sum(
         },
         num_rows, matrix->get_const_row_ptrs(), matrix->get_const_col_idxs(),
         prefix_sum);
-    components::prefix_sum(exec, prefix_sum, num_rows + 1);
+    components::prefix_sum_nonnegative(exec, prefix_sum, num_rows + 1);
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(

diff --git a/common/unified/multigrid/pgm_kernels.cpp b/common/unified/multigrid/pgm_kernels.cpp
@@ -111,7 +111,8 @@ void renumber(std::shared_ptr<const DefaultExecutor> exec,
         },
         num, agg.get_const_data(), agg_map.get_data());
 
-    components::prefix_sum(exec, agg_map.get_data(), agg_map.get_num_elems());
+    components::prefix_sum_nonnegative(exec, agg_map.get_data(),
+                                       agg_map.get_num_elems());
 
     run_kernel(
         exec,
-Original file line number
+Diff line change
@@ Expand Up / @@ -117,6 +117,7 @@ class Timer { @@
                     return copy.at(mid);
                 }
             }
+            GKO_NOT_IMPLEMENTED;
         }
         /**
@@ Expand Down @@