From 733a09072de99d55debf0bacd2907d03332dce96 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Tue, 9 Jan 2024 15:14:37 -0700 Subject: [PATCH 1/2] spadd: change arguments to ctor of SPADDHandle add a default value to input_sorted; add a second argument input_merged to indicate unique entries, so that we can easily know whether we can use TPLs on the input matrices --- sparse/src/KokkosKernels_Handle.hpp | 23 ++++++++++++----------- sparse/src/KokkosSparse_spadd_handle.hpp | 9 +++++++-- sparse/unit_test/Test_Sparse_spadd.hpp | 10 ++++++++-- 3 files changed, 27 insertions(+), 15 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 4f33795018..48c70ca156 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -605,18 +605,18 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object - * + * * @param handle_exec_space The execution space instance to execute kernels on. * @param num_streams The number of streams to allocate memory for. * @param gs_algorithm Specifies which algorithm to use: - * + * * KokkosSpace::GS_DEFAULT PointGaussSeidel * KokkosSpace::GS_PERMUTED ?? * KokkosSpace::GS_TEAM ?? * KokkosSpace::GS_CLUSTER ?? * KokkosSpace::GS_TWOSTAGE ?? * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT ?? * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring @@ -649,9 +649,9 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gauss seidel handle object - * + * * @param gs_algorithm Specifies which algorithm to use: - * + * * KokkosSpace::GS_DEFAULT PointGaussSeidel or BlockGaussSeidel, depending on matrix type. * KokkosSpace::GS_PERMUTED Reorders rows/cols into colors to improve locality. Uses RangePolicy over rows. 
* KokkosSpace::GS_TEAM Uses TeamPolicy over batches of rows with ThreadVector within rows. @@ -660,7 +660,7 @@ class KokkosKernelsHandle { * KokkosSpace::GS_TWOSTAGE Uses spmv to parallelize inner sweeps of x. * For more information, see: https://arxiv.org/pdf/2104.01196.pdf. * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT Depends on execution space: * COLORING_SERIAL on Kokkos::Serial; * COLORING_EB on GPUs; @@ -744,16 +744,16 @@ class KokkosKernelsHandle { // clang-format off /** * @brief Create a gs handle object - * + * * @param clusterAlgo Specifies which clustering algorithm to use: - * + * * KokkosSparse::CLUSTER_DEFAULT ?? * KokkosSparse::CLUSTER_MIS2 ?? * KokkosSparse::CLUSTER_BALLOON ?? * KokkosSparse::NUM_CLUSTERING_ALGORITHMS ?? * @param hint_verts_per_cluster Hint how many verticies to use per cluster * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: - * + * * KokkosGraph::COLORING_DEFAULT ?? 
* KokkosGraph::COLORING_SERIAL Serial Greedy Coloring * KokkosGraph::COLORING_VB Vertex Based Coloring @@ -821,10 +821,11 @@ class KokkosKernelsHandle { // ---------------------------------------- // SPADDHandleType *get_spadd_handle() { return this->spaddHandle; } - void create_spadd_handle(bool input_sorted) { + void create_spadd_handle(bool input_sorted = false, + bool input_merged = false) { this->destroy_spadd_handle(); this->is_owner_of_the_spadd_handle = true; - this->spaddHandle = new SPADDHandleType(input_sorted); + this->spaddHandle = new SPADDHandleType(input_sorted, input_merged); } void destroy_spadd_handle() { if (is_owner_of_the_spadd_handle && this->spaddHandle != NULL) { diff --git a/sparse/src/KokkosSparse_spadd_handle.hpp b/sparse/src/KokkosSparse_spadd_handle.hpp index 2902550d6a..007fac33ac 100644 --- a/sparse/src/KokkosSparse_spadd_handle.hpp +++ b/sparse/src/KokkosSparse_spadd_handle.hpp @@ -33,7 +33,9 @@ class SPADDHandle { typedef ExecutionSpace execution_space; private: - bool input_sorted; + // if both are true, the input matrices are strict CRS + bool input_sorted; // column indices in a row are sorted + bool input_merged; // column indices in a row are unique (i.e., merged) size_type result_nnz_size; @@ -79,8 +81,9 @@ class SPADDHandle { /** * \brief Default constructor. 
*/ - SPADDHandle(bool input_is_sorted) + SPADDHandle(bool input_is_sorted, bool input_is_merged = false) : input_sorted(input_is_sorted), + input_merged(input_is_merged), result_nnz_size(0), called_symbolic(false), called_numeric(false) {} @@ -95,6 +98,8 @@ class SPADDHandle { void set_call_numeric(bool call = true) { this->called_numeric = call; } bool is_input_sorted() { return input_sorted; } + bool is_input_merged() { return input_merged; } + bool is_input_strict_crs() { return input_sorted && input_merged; } }; } // namespace KokkosSparse diff --git a/sparse/unit_test/Test_Sparse_spadd.hpp b/sparse/unit_test/Test_Sparse_spadd.hpp index 05ff97bb3a..f48d86a98c 100644 --- a/sparse/unit_test/Test_Sparse_spadd.hpp +++ b/sparse/unit_test/Test_Sparse_spadd.hpp @@ -32,7 +32,11 @@ typedef Kokkos::complex kokkos_complex_double; typedef Kokkos::complex kokkos_complex_float; -// Create a random square matrix for testing mat-mat addition kernels +// Create a random nrows by ncols matrix for testing mat-mat addition kernels. +// minNNZ, maxNNZ: min and max number of nonzeros in any row. +// maxNNZ > ncols will result in duplicated entries in a row, otherwise entries +// in a row are unique. 
+// sortRows: whether to sort columns in a row template crsMat_t randomMatrix(ordinal_type nrows, ordinal_type ncols, ordinal_type minNNZ, ordinal_type maxNNZ, bool sortRows) { @@ -117,7 +121,9 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, srand((numRows << 1) ^ numCols); KernelHandle handle; - handle.create_spadd_handle(sortRows); + // If maxNNZ <= numCols, the generated A, B have unique column indices in each + // row + handle.create_spadd_handle(sortRows, static_cast(maxNNZ) <= numCols); crsMat_t A = randomMatrix(numRows, numCols, minNNZ, maxNNZ, sortRows); crsMat_t B = From c9d03f223a2cebd176ee5bd4d423159a54746b11 Mon Sep 17 00:00:00 2001 From: Junchao Zhang Date: Wed, 25 Oct 2023 11:26:40 -0500 Subject: [PATCH 2/2] spadd: add cuda/rocm TPL support for spadd_symbolic/numeric --- perf_test/sparse/KokkosSparse_spadd.cpp | 10 +- .../KokkosSparse_par_ilut_numeric_impl.hpp | 22 +- .../impl/KokkosSparse_spadd_numeric_impl.hpp | 31 +- .../impl/KokkosSparse_spadd_numeric_spec.hpp | 62 ++-- .../impl/KokkosSparse_spadd_symbolic_impl.hpp | 114 ++++--- .../impl/KokkosSparse_spadd_symbolic_spec.hpp | 52 ++-- sparse/src/KokkosSparse_spadd.hpp | 242 ++++++++++----- sparse/src/KokkosSparse_spadd_handle.hpp | 44 +++ ...kkosSparse_spadd_numeric_tpl_spec_decl.hpp | 282 ++++++++++++++++++ ...kosSparse_spadd_symbolic_tpl_spec_decl.hpp | 238 +++++++++++++++ .../KokkosSparse_spadd_tpl_spec_avail.hpp | 117 +++++++- .../tpls/KokkosSparse_spadd_tpl_spec_decl.hpp | 24 -- sparse/unit_test/Test_Sparse_spadd.hpp | 13 +- 13 files changed, 1002 insertions(+), 249 deletions(-) create mode 100644 sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp create mode 100644 sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp delete mode 100644 sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 3b347eb903..a785ea82f6 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ 
b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -303,8 +303,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { double numericTime = 0; // Do an untimed warm up symbolic, and preallocate space for C entries/values - spadd_symbolic(&kh, A.graph.row_map, A.graph.entries, B.graph.row_map, - B.graph.entries, row_mapC); + spadd_symbolic(exec_space{}, &kh, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); bool use_kk = !params.use_cusparse && !params.use_mkl; @@ -366,7 +366,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { for (int sumRep = 0; sumRep < params.repeat; sumRep++) { timer.reset(); if (use_kk) { - spadd_symbolic(&kh, A.graph.row_map, A.graph.entries, B.graph.row_map, + spadd_symbolic(exec_space{}, &kh, A.numRows(), A.numCols(), + A.graph.row_map, A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); c_nnz = addHandle->get_c_nnz(); } else if (params.use_cusparse) { @@ -434,7 +435,8 @@ void run_experiment(int argc, char** argv, CommonInputParams) { } #endif } else { - spadd_numeric(&kh, A.graph.row_map, A.graph.entries, A.values, + spadd_numeric(exec_space{}, &kh, A.numRows(), A.numCols(), + A.graph.row_map, A.graph.entries, A.values, 1.0, // A, alpha B.graph.row_map, B.graph.entries, B.values, 1.0, // B, beta diff --git a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp index 9375039747..6bdf0eb577 100644 --- a/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_par_ilut_numeric_impl.hpp @@ -699,18 +699,24 @@ struct IlutWrap { multiply_matrices(kh, ih, L_row_map, L_entries, L_values, U_row_map, U_entries, U_values, LU_row_map, LU_entries, LU_values); - auto addHandle = kh.get_spadd_handle(); - KokkosSparse::Experimental::spadd_symbolic( - &kh, A_row_map, A_entries, LU_row_map, LU_entries, R_row_map); + auto addHandle = kh.get_spadd_handle(); + typename KHandle::const_nnz_lno_t m = 
A_row_map.extent(0) - 1, + n = m; // square matrix + // TODO: let compute_residual_norm also take an execution space argument and + // use that for exec! + typename KHandle::HandleExecSpace exec{}; + KokkosSparse::Experimental::spadd_symbolic(exec, &kh, m, n, A_row_map, + A_entries, LU_row_map, + LU_entries, R_row_map); const size_type r_nnz = addHandle->get_c_nnz(); - Kokkos::resize(R_entries, r_nnz); - Kokkos::resize(R_values, r_nnz); + Kokkos::resize(exec, R_entries, r_nnz); + Kokkos::resize(exec, R_values, r_nnz); KokkosSparse::Experimental::spadd_numeric( - &kh, A_row_map, A_entries, A_values, 1., LU_row_map, LU_entries, - LU_values, -1., R_row_map, R_entries, R_values); - + exec, &kh, m, n, A_row_map, A_entries, A_values, 1., LU_row_map, + LU_entries, LU_values, -1., R_row_map, R_entries, R_values); + // TODO: how to make this policy use exec? auto policy = ih.get_default_team_policy(); Kokkos::parallel_reduce( diff --git a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp index 8e70cd3c3b..fa356dc963 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_impl.hpp @@ -174,24 +174,23 @@ struct UnsortedNumericSumFunctor { std::is_same::type, \ typename std::remove_const::type>::value -template +template < + typename execution_space, typename KernelHandle, typename alno_row_view_t, + typename alno_nnz_view_t, typename ascalar_t, typename ascalar_nnz_view_t, + typename blno_row_view_t, typename blno_nnz_view_t, typename bscalar_t, + typename bscalar_nnz_view_t, typename clno_row_view_t, + typename clno_nnz_view_t, typename cscalar_nnz_view_t> void spadd_numeric_impl( - KernelHandle* kernel_handle, const alno_row_view_t a_rowmap, - const alno_nnz_view_t a_entries, const ascalar_nnz_view_t a_values, - const ascalar_t alpha, const blno_row_view_t b_rowmap, - const blno_nnz_view_t b_entries, const bscalar_nnz_view_t b_values, - const bscalar_t beta, const 
clno_row_view_t c_rowmap, - clno_nnz_view_t c_entries, cscalar_nnz_view_t c_values) { + const execution_space& exec, KernelHandle* kernel_handle, + const alno_row_view_t a_rowmap, const alno_nnz_view_t a_entries, + const ascalar_nnz_view_t a_values, const ascalar_t alpha, + const blno_row_view_t b_rowmap, const blno_nnz_view_t b_entries, + const bscalar_nnz_view_t b_values, const bscalar_t beta, + const clno_row_view_t c_rowmap, clno_nnz_view_t c_entries, + cscalar_nnz_view_t c_values) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; // Check that A/B/C data types match KernelHandle types, and that C data types // are nonconst (doesn't matter if A/B types are const) static_assert(SAME_TYPE(ascalar_t, scalar_type), @@ -252,7 +251,7 @@ void spadd_numeric_impl( sortedNumeric(a_rowmap, b_rowmap, c_rowmap, a_entries, b_entries, c_entries, a_values, b_values, c_values, alpha, beta); Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputSorted", - range_type(0, nrows), sortedNumeric); + range_type(exec, 0, nrows), sortedNumeric); } else { // use a_pos and b_pos (set in the handle by symbolic) to quickly compute C // entries and values @@ -265,7 +264,7 @@ void spadd_numeric_impl( c_entries, a_values, b_values, c_values, alpha, beta, addHandle->get_a_pos(), addHandle->get_b_pos()); Kokkos::parallel_for("KokkosSparse::SpAdd:Numeric::InputNotSorted", - range_type(0, nrows), unsortedNumeric); + range_type(exec, 0, nrows), unsortedNumeric); } addHandle->set_call_numeric(); } diff --git a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp index e81649f552..18731348de 100644 --- a/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_numeric_spec.hpp @@ -28,10 +28,10 @@ namespace KokkosSparse { 
namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spadd_numeric_eti_spec_avail { enum : bool { value = false }; }; @@ -44,6 +44,7 @@ struct spadd_numeric_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spadd_numeric_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -87,20 +88,22 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) -template ::value, + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + a_scalar_view_t, b_size_view_t, b_lno_view_t, b_scalar_view_t, + c_size_view_t, c_lno_view_t, c_scalar_view_t>::value, bool eti_spec_avail = spadd_numeric_eti_spec_avail< - KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t, - b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t, - c_lno_view_t, c_scalar_view_t>::value> + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + a_scalar_view_t, b_size_view_t, b_lno_view_t, b_scalar_view_t, + c_size_view_t, c_lno_view_t, c_scalar_view_t>::value> struct SPADD_NUMERIC { - static void spadd_numeric(KernelHandle *handle, + static void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, typename a_scalar_view_t::const_value_type alpha, a_size_view_t row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA, @@ -112,15 +115,17 @@ struct SPADD_NUMERIC { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct SPADD_NUMERIC { - static void spadd_numeric(KernelHandle *handle, +template +struct SPADD_NUMERIC< + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, a_scalar_view_t, + b_size_view_t, b_lno_view_t, b_scalar_view_t, c_size_view_t, c_lno_view_t, + c_scalar_view_t, false, 
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> { + static void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t /* m */, + typename KernelHandle::const_nnz_lno_t /* n */, typename a_scalar_view_t::const_value_type alpha, a_size_view_t row_mapA, a_lno_view_t entriesA, a_scalar_view_t valuesA, @@ -128,8 +133,9 @@ struct SPADD_NUMERIC, \ @@ -178,6 +185,7 @@ struct SPADD_NUMERIC, \ @@ -210,6 +218,6 @@ struct SPADD_NUMERIC >, \ false, true>; -#include +#include #endif diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp index 15132f9da3..80506e3056 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_impl.hpp @@ -371,50 +371,48 @@ struct MergeEntriesFunctor { }; // Run SortedCountEntries: non-GPU, always uses the RangePolicy version. -template +template void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = + const execution_space& exec, const alno_row_view_t_& a_rowmap, + const alno_nnz_view_t_& a_entries, const blno_row_view_t_& b_rowmap, + const blno_nnz_view_t_& b_entries, const clno_row_view_t_& c_rowmap, + typename std::enable_if< + !KokkosKernels::Impl::kk_is_gpu_exec_space()>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using range_type = Kokkos::RangePolicy; - auto nrows = c_rowmap.extent(0) - 1; + using range_type = Kokkos::RangePolicy; + auto nrows = c_rowmap.extent(0) - 1; SortedCountEntriesRange countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); Kokkos::parallel_for( "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - 
range_type(0, nrows), countEntries); + range_type(exec, 0, nrows), countEntries); } // Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending // on average nz per row (a runtime decision) -template +template void runSortedCountEntries( - const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, - const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, - const clno_row_view_t_& c_rowmap, - typename std::enable_if()>::type* = + const execution_space& exec, const alno_row_view_t_& a_rowmap, + const alno_nnz_view_t_& a_entries, const blno_row_view_t_& b_rowmap, + const blno_nnz_view_t_& b_entries, const clno_row_view_t_& c_rowmap, + typename std::enable_if< + KokkosKernels::Impl::kk_is_gpu_exec_space()>::type* = nullptr) { using size_type = typename KernelHandle::size_type; using ordinal_type = typename KernelHandle::nnz_lno_t; - using execution_space = - typename KernelHandle::SPADDHandleType::execution_space; - using RangePol = Kokkos::RangePolicy; - using TeamPol = Kokkos::TeamPolicy; - auto nrows = c_rowmap.extent(0) - 1; + using RangePol = Kokkos::RangePolicy; + using TeamPol = Kokkos::TeamPolicy; + auto nrows = c_rowmap.extent(0) - 1; size_type c_est_nnz = 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; if (c_est_nnz <= 512) { @@ -435,14 +433,14 @@ void runSortedCountEntries( countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); countEntries.sharedPerThread = pot_est_nnz; // compute largest possible team size - TeamPol testPolicy(1, 1, vector_length); + TeamPol testPolicy(exec, 1, 1, vector_length); testPolicy.set_scratch_size( 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); int team_size = testPolicy.team_size_recommended(countEntries, Kokkos::ParallelForTag()); // construct real policy int league_size = (nrows + team_size - 1) / team_size; - TeamPol policy(league_size, team_size, vector_length); + TeamPol policy(exec, league_size, team_size, vector_length); 
policy.set_scratch_size( 0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); countEntries.totalShared = @@ -457,24 +455,23 @@ void runSortedCountEntries( countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); Kokkos::parallel_for( "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - RangePol(0, nrows), countEntries); + RangePol(exec, 0, nrows), countEntries); } } // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. -template +template void spadd_symbolic_impl( - KernelHandle* handle, const alno_row_view_t_ a_rowmap, - const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap, - const blno_nnz_view_t_ b_entries, + const execution_space& exec, KernelHandle* handle, + const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, + const blno_row_view_t_ b_rowmap, const blno_nnz_view_t_ b_entries, clno_row_view_t_ c_rowmap) // c_rowmap must already be allocated (doesn't // need to be initialized) { - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::SPADDHandleType::nnz_lno_view_t ordinal_view_t; @@ -520,17 +517,18 @@ void spadd_symbolic_impl( ordinal_type nrows = a_rowmap.extent(0) - 1; typedef Kokkos::RangePolicy range_type; if (addHandle->is_input_sorted()) { - runSortedCountEntries( - a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + runSortedCountEntries(exec, a_rowmap, a_entries, b_rowmap, + b_entries, c_rowmap); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap); + exec, nrows + 1, c_rowmap); } else { // note: scoping individual parts of the process to free views sooner, // minimizing peak memory usage run the unsorted c_rowmap upper bound // functor (just adds together A and B entry counts row by row) offset_view_t c_rowmap_upperbound( 
- Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "C row counts upper bound"), nrows + 1); size_type c_nnz_upperbound = 0; @@ -540,17 +538,17 @@ void spadd_symbolic_impl( countEntries(nrows, a_rowmap, b_rowmap, c_rowmap_upperbound); Kokkos::parallel_for( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::CountEntries", - range_type(0, nrows), countEntries); + range_type(exec, 0, nrows), countEntries); KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap_upperbound); - Kokkos::deep_copy(c_nnz_upperbound, + exec, nrows + 1, c_rowmap_upperbound); + Kokkos::deep_copy(exec, c_nnz_upperbound, Kokkos::subview(c_rowmap_upperbound, nrows)); } ordinal_view_t c_entries_uncompressed( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "C entries uncompressed"), c_nnz_upperbound); - ordinal_view_t ab_perm(Kokkos::view_alloc(Kokkos::WithoutInitializing, + ordinal_view_t ab_perm(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "A and B permuted entry indices"), c_nnz_upperbound); // compute the unmerged sum @@ -561,17 +559,17 @@ void spadd_symbolic_impl( c_rowmap_upperbound, c_entries_uncompressed, ab_perm); Kokkos::parallel_for( "KokkosSparse::SpAdd:Symbolic::InputNotSorted::UnmergedSum", - range_type(0, nrows), unmergedSum); + range_type(exec, 0, nrows), unmergedSum); // sort the unmerged sum KokkosSparse::sort_crs_matrix( - c_rowmap_upperbound, c_entries_uncompressed, ab_perm); - ordinal_view_t a_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "A entry positions"), - a_entries.extent(0)); - ordinal_view_t b_pos( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "B entry positions"), - b_entries.extent(0)); + exec, c_rowmap_upperbound, c_entries_uncompressed, ab_perm); + ordinal_view_t a_pos(Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, + "A entry positions"), + a_entries.extent(0)); + ordinal_view_t b_pos(Kokkos::view_alloc(exec, 
Kokkos::WithoutInitializing, + "B entry positions"), + b_entries.extent(0)); // merge the entries and compute Apos/Bpos, as well as Crowcounts { MergeEntriesFunctor( - nrows + 1, c_rowmap); + exec, nrows + 1, c_rowmap); } addHandle->set_a_b_pos(a_pos, b_pos); } // provide the number of NNZ in C to user through handle size_type cmax; - Kokkos::deep_copy(cmax, Kokkos::subview(c_rowmap, nrows)); + Kokkos::deep_copy(exec, cmax, Kokkos::subview(c_rowmap, nrows)); addHandle->set_c_nnz(cmax); addHandle->set_call_symbolic(); addHandle->set_call_numeric(false); diff --git a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp index aaab68568a..bdc4ed04bd 100644 --- a/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_spadd_symbolic_spec.hpp @@ -28,8 +28,9 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists -template +template struct spadd_symbolic_eti_spec_avail { enum : bool { value = false }; }; @@ -42,6 +43,7 @@ struct spadd_symbolic_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spadd_symbolic_eti_spec_avail< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -73,31 +75,39 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosBlas::spadd (sparse-sparse matrix addition) -template ::value, + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + b_size_view_t, b_lno_view_t, c_size_view_t>::value, bool eti_spec_avail = spadd_symbolic_eti_spec_avail< - KernelHandle, a_size_view_t, a_lno_view_t, b_size_view_t, - b_lno_view_t, c_size_view_t>::value> + ExecSpace, KernelHandle, a_size_view_t, a_lno_view_t, + b_size_view_t, b_lno_view_t, c_size_view_t>::value> struct SPADD_SYMBOLIC { - static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, - a_lno_view_t 
entriesA, b_size_view_t row_mapB, - b_lno_view_t entriesB, c_size_view_t row_mapC); + static void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + a_size_view_t row_mapA, a_lno_view_t entriesA, + b_size_view_t row_mapB, b_lno_view_t entriesB, + c_size_view_t row_mapC); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct SPADD_SYMBOLIC +struct SPADD_SYMBOLIC { - static void spadd_symbolic(KernelHandle *handle, a_size_view_t row_mapA, - a_lno_view_t entriesA, b_size_view_t row_mapB, - b_lno_view_t entriesB, c_size_view_t row_mapC) { - spadd_symbolic_impl(handle, row_mapA, entriesA, row_mapB, entriesB, + static void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t /* m */, + typename KernelHandle::const_nnz_lno_t /* n */, + a_size_view_t row_mapA, a_lno_view_t entriesA, + b_size_view_t row_mapB, b_lno_view_t entriesB, + c_size_view_t row_mapC) { + spadd_symbolic_impl(exec, handle, row_mapA, entriesA, row_mapB, entriesB, row_mapC); } }; @@ -111,6 +121,7 @@ struct SPADD_SYMBOLIC, \ @@ -135,6 +146,7 @@ struct SPADD_SYMBOLIC, \ @@ -155,6 +167,6 @@ struct SPADD_SYMBOLIC >, \ false, true>; -#include +#include #endif diff --git a/sparse/src/KokkosSparse_spadd.hpp b/sparse/src/KokkosSparse_spadd.hpp index 74efed66bc..4151ea6783 100644 --- a/sparse/src/KokkosSparse_spadd.hpp +++ b/sparse/src/KokkosSparse_spadd.hpp @@ -19,25 +19,27 @@ #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_helpers.hpp" -#include "KokkosSparse_spadd_symbolic_spec.hpp" +#include "KokkosBlas1_scal.hpp" #include "KokkosSparse_spadd_numeric_spec.hpp" +#include "KokkosSparse_spadd_symbolic_spec.hpp" namespace KokkosSparse { namespace Experimental { // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. 
-template void spadd_symbolic( - KernelHandle* handle, const alno_row_view_t_ a_rowmap, + const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, // same type as column indices + typename KernelHandle::const_nnz_lno_t n, const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, const blno_row_view_t_ b_rowmap, const blno_nnz_view_t_ b_entries, clno_row_view_t_ c_rowmap) // c_rowmap must already be allocated (doesn't // need to be initialized) { - typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; typedef typename Kokkos::Device DeviceType; @@ -51,49 +53,69 @@ void spadd_symbolic( ConstKernelHandle; ConstKernelHandle tmp_handle(*handle); - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_rowmap; - KokkosSparse::Impl::SPADD_SYMBOLIC:: - spadd_symbolic(&tmp_handle, - Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), - Internal_a_entries(a_entries.data(), a_entries.extent(0)), - Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), - Internal_b_entries(b_entries.data(), b_entries.extent(0)), - Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + + auto addHandle = handle->get_spadd_handle(); + bool useFallback = !addHandle->is_input_strict_crs(); + if (useFallback) { + 
KokkosSparse::Impl::SPADD_SYMBOLIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_b_rowmap, Internal_b_entries, Internal_c_rowmap, false>:: + spadd_symbolic( + exec, &tmp_handle, m, n, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + } else { + KokkosSparse::Impl::SPADD_SYMBOLIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_b_rowmap, Internal_b_entries, Internal_c_rowmap>:: + spadd_symbolic( + exec, &tmp_handle, m, n, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0))); + } } -template -void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, +void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t m, + typename KernelHandle::const_nnz_lno_t n, + const alno_row_view_t_ a_rowmap, const alno_nnz_view_t_ a_entries, const ascalar_nnz_view_t_ a_values, const ascalar_t_ alpha, const blno_row_view_t_ b_rowmap, @@ -101,7 +123,6 @@ void spadd_numeric(KernelHandle* handle, const alno_row_view_t_ a_rowmap, const bscalar_nnz_view_t_ b_values, const bscalar_t_ beta, const clno_row_view_t_ c_rowmap, clno_nnz_view_t_ c_entries, cscalar_nnz_view_t_ c_values) { - typedef typename KernelHandle::HandleExecSpace ExecSpace; typedef typename KernelHandle::HandleTempMemorySpace MemSpace; typedef typename KernelHandle::HandlePersistentMemorySpace PersistentMemSpace; typedef typename Kokkos::Device DeviceType; @@ -113,116 +134,177 @@ void spadd_numeric(KernelHandle* 
handle, const alno_row_view_t_ a_rowmap, typedef typename KokkosKernels::Experimental::KokkosKernelsHandle< c_size_t, c_lno_t, c_scalar_t, ExecSpace, MemSpace, PersistentMemSpace> ConstKernelHandle; - ConstKernelHandle tmp_handle(*handle); + ConstKernelHandle tmp_handle(*handle); // handle->exec_space is also copied - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_a_values; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_b_values; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_rowmap; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_entries; - typedef Kokkos::View::array_layout, - DeviceType, Kokkos::MemoryTraits > + DeviceType, Kokkos::MemoryTraits> Internal_c_values; - KokkosSparse::Impl::SPADD_NUMERIC:: - spadd_numeric(&tmp_handle, alpha, - Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), - Internal_a_entries(a_entries.data(), a_entries.extent(0)), - Internal_a_values(a_values.data(), a_values.extent(0)), - beta, - Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), - Internal_b_entries(b_entries.data(), b_entries.extent(0)), - Internal_b_values(b_values.data(), b_values.extent(0)), - Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), - 
Internal_c_entries(c_entries.data(), c_entries.extent(0)), - Internal_c_values(c_values.data(), c_values.extent(0))); + + auto addHandle = handle->get_spadd_handle(); + bool useFallback = !addHandle->is_input_strict_crs(); + if (useFallback) { + KokkosSparse::Impl::SPADD_NUMERIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_a_values, Internal_b_rowmap, Internal_b_entries, + Internal_b_values, Internal_c_rowmap, Internal_c_entries, + Internal_c_values, false>:: + spadd_numeric(exec, &tmp_handle, m, n, alpha, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), + beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); + } else { + KokkosSparse::Impl::SPADD_NUMERIC< + ExecSpace, ConstKernelHandle, Internal_a_rowmap, Internal_a_entries, + Internal_a_values, Internal_b_rowmap, Internal_b_entries, + Internal_b_values, Internal_c_rowmap, Internal_c_entries, + Internal_c_values>:: + spadd_numeric(exec, &tmp_handle, m, n, alpha, + Internal_a_rowmap(a_rowmap.data(), a_rowmap.extent(0)), + Internal_a_entries(a_entries.data(), a_entries.extent(0)), + Internal_a_values(a_values.data(), a_values.extent(0)), + beta, + Internal_b_rowmap(b_rowmap.data(), b_rowmap.extent(0)), + Internal_b_entries(b_entries.data(), b_entries.extent(0)), + Internal_b_values(b_values.data(), b_values.extent(0)), + Internal_c_rowmap(c_rowmap.data(), c_rowmap.extent(0)), + Internal_c_entries(c_entries.data(), c_entries.extent(0)), + Internal_c_values(c_values.data(), c_values.extent(0))); + } } } // namespace Experimental // Symbolic: count 
entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. -template -void spadd_symbolic(KernelHandle* handle, const AMatrix& A, const BMatrix& B, - CMatrix& C) { +template +void spadd_symbolic(const ExecSpace &exec, KernelHandle *handle, + const AMatrix &A, const BMatrix &B, CMatrix &C) { using row_map_type = typename CMatrix::row_map_type::non_const_type; using entries_type = typename CMatrix::index_type::non_const_type; using values_type = typename CMatrix::values_type::non_const_type; + auto addHandle = handle->get_spadd_handle(); + // Create the row_map of C, no need to initialize it row_map_type row_mapC( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "row map"), + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "row map"), A.numRows() + 1); - KokkosSparse::Experimental::spadd_symbolic(handle, A.graph.row_map, - A.graph.entries, B.graph.row_map, - B.graph.entries, row_mapC); + + // Shortcuts for special cases as they cause errors in some TPL + // implementations (e.g., cusparse and hipsparse) + if (!A.nnz()) { + Kokkos::deep_copy(exec, row_mapC, B.graph.row_map); + addHandle->set_c_nnz(B.graph.entries.extent(0)); + } else if (!B.nnz()) { + Kokkos::deep_copy(exec, row_mapC, A.graph.row_map); + addHandle->set_c_nnz(A.graph.entries.extent(0)); + } else { + KokkosSparse::Experimental::spadd_symbolic( + exec, handle, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, B.graph.row_map, B.graph.entries, row_mapC); + } // Now create and allocate the entries and values // views so we can build a graph and then matrix C // and subsequently construct C. - auto addHandle = handle->get_spadd_handle(); entries_type entriesC( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries"), + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "entries"), addHandle->get_c_nnz()); // Finally since we already have the number of nnz handy // we can go ahead and allocate C's values and set them. 
- values_type valuesC(Kokkos::view_alloc(Kokkos::WithoutInitializing, "values"), - addHandle->get_c_nnz()); + values_type valuesC( + Kokkos::view_alloc(exec, Kokkos::WithoutInitializing, "values"), + addHandle->get_c_nnz()); C = CMatrix("matrix", A.numRows(), A.numCols(), addHandle->get_c_nnz(), valuesC, row_mapC, entriesC); } -// Symbolic: count entries in each row in C to produce rowmap +// Numeric: fill the column indices and values // kernel handle has information about whether it is sorted add or not. +template +void spadd_numeric(const ExecSpace &exec, KernelHandle *handle, + const AScalar alpha, const AMatrix &A, const BScalar beta, + const BMatrix &B, CMatrix &C) { + if (!A.nnz()) { + Kokkos::deep_copy(exec, C.graph.entries, B.graph.entries); + KokkosBlas::scal(exec, C.values, beta, B.values); + } else if (!B.nnz()) { + Kokkos::deep_copy(exec, C.graph.entries, A.graph.entries); + KokkosBlas::scal(exec, C.values, alpha, A.values); + } else { + KokkosSparse::Experimental::spadd_numeric( + exec, handle, A.numRows(), A.numCols(), A.graph.row_map, + A.graph.entries, A.values, alpha, B.graph.row_map, B.graph.entries, + B.values, beta, C.graph.row_map, C.graph.entries, C.values); + } +} + +// One without an explicit execution space argument +template +void spadd_symbolic(KernelHandle *handle, const AMatrix &A, const BMatrix &B, + CMatrix &C) { + spadd_symbolic(typename AMatrix::execution_space{}, handle, A, B, C); +} + template -void spadd_numeric(KernelHandle* handle, const AScalar alpha, const AMatrix& A, - const BScalar beta, const BMatrix& B, CMatrix& C) { - KokkosSparse::Experimental::spadd_numeric( - handle, A.graph.row_map, A.graph.entries, A.values, alpha, - B.graph.row_map, B.graph.entries, B.values, beta, C.graph.row_map, - C.graph.entries, C.values); +void spadd_numeric(KernelHandle *handle, const AScalar alpha, const AMatrix &A, + const BScalar beta, const BMatrix &B, CMatrix &C) { + spadd_numeric(typename AMatrix::execution_space{}, handle, alpha, A, 
beta, B, + C); } } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_spadd_handle.hpp b/sparse/src/KokkosSparse_spadd_handle.hpp index 007fac33ac..760f912c6d 100644 --- a/sparse/src/KokkosSparse_spadd_handle.hpp +++ b/sparse/src/KokkosSparse_spadd_handle.hpp @@ -32,6 +32,42 @@ class SPADDHandle { typedef typename lno_row_view_t_::non_const_value_type size_type; typedef ExecutionSpace execution_space; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + struct SpaddCusparseData { + size_t nbytes; + void* workspace; + cusparseMatDescr_t descrA, descrB, descrC; + + SpaddCusparseData() + : nbytes(0), + workspace(nullptr), + descrA(nullptr), + descrB(nullptr), + descrC(nullptr) {} + + ~SpaddCusparseData() { + Kokkos::kokkos_free(workspace); + cusparseDestroyMatDescr(descrA); + cusparseDestroyMatDescr(descrB); + cusparseDestroyMatDescr(descrC); + } + }; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + struct SpaddRocsparseData { + rocsparse_mat_descr descrA, descrB, descrC; + + SpaddRocsparseData() : descrA(nullptr), descrB(nullptr), descrC(nullptr) {} + + ~SpaddRocsparseData() { + rocsparse_destroy_mat_descr(descrA); + rocsparse_destroy_mat_descr(descrB); + rocsparse_destroy_mat_descr(descrC); + } + }; +#endif + private: // if both are true, the input matrices are strict CRS bool input_sorted; // column indices in a row are sorted @@ -78,6 +114,14 @@ class SPADDHandle { int get_sort_option() { return this->sort_option; } +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + SpaddCusparseData cusparseData; +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + SpaddRocsparseData rocsparseData; +#endif + /** * \brief Default constructor. 
*/ diff --git a/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp new file mode 100644 index 0000000000..0952654bdf --- /dev/null +++ b/sparse/tpls/KokkosSparse_spadd_numeric_tpl_spec_decl.hpp @@ -0,0 +1,282 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_NUMERIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const 
ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spadd_numeric( \ + const EXEC_SPACE_TYPE &exec, kernelhandle_t *handle, ORDINAL_TYPE m, \ + ORDINAL_TYPE n, const KOKKOS_SCALAR_TYPE alpha, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, scalar_view_t valuesA, \ + const KOKKOS_SCALAR_TYPE beta, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, scalar_view_t valuesB, rowmap_view_t rowmapC, \ + non_const_colidx_view_t colidxC, non_const_scalar_view_t valuesC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_numeric[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto &cuspData = addHandle->cusparseData; \ + auto &cuspHandle = \ + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; \ + cusparsePointerMode_t oldPtrMode; \ + \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetStream(cuspHandle, exec.cuda_stream())); \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseGetPointerMode(cuspHandle, &oldPtrMode)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetPointerMode( \ + cuspHandle, CUSPARSE_POINTER_MODE_HOST)); /* alpha, beta on host*/ \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparse##TOKEN##csrgeam2( \ + cuspHandle, m, n, reinterpret_cast(&alpha), \ + cuspData.descrA, nnzA, \ + reinterpret_cast(valuesA.data()), \ + rowmapA.data(), colidxA.data(), \ + reinterpret_cast(&beta), cuspData.descrB, \ + nnzB, 
reinterpret_cast(valuesB.data()), \ + rowmapB.data(), colidxB.data(), cuspData.descrC, \ + reinterpret_cast(valuesC.data()), \ + const_cast(rowmapC.data()), colidxC.data(), \ + cuspData.workspace)); \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetPointerMode(cuspHandle, oldPtrMode)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(cuspHandle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + S, float, float, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + D, double, double, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + C, Kokkos::complex, cuComplex, int, int, Kokkos::LayoutLeft, \ + Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE( \ + Z, Kokkos::complex, cuDoubleComplex, int, int, \ + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_NUMERIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + 
Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + Kokkos::View, \ + Kokkos::MemoryTraits>, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + using non_const_scalar_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits>; \ + static void spadd_numeric( \ + const EXEC_SPACE_TYPE &exec, kernelhandle_t *handle, ORDINAL_TYPE m, \ + ORDINAL_TYPE n, const KOKKOS_SCALAR_TYPE alpha, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, scalar_view_t valuesA, \ + const KOKKOS_SCALAR_TYPE beta, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, scalar_view_t valuesB, rowmap_view_t rowmapC, \ + non_const_colidx_view_t colidxC, non_const_scalar_view_t valuesC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_numeric[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto &rocData = addHandle->rocsparseData; \ + auto &rocspHandle = KokkosKernels::Impl::RocsparseSingleton::singleton() \ + .rocsparseHandle; \ + rocsparse_pointer_mode oldPtrMode; \ + \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_get_pointer_mode(rocspHandle, &oldPtrMode)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_set_pointer_mode( \ + rocspHandle, rocsparse_pointer_mode_host)); /* alpha, 
beta on host*/ \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_##TOKEN##csrgeam( \ + rocspHandle, m, n, \ + reinterpret_cast(&alpha), rocData.descrA, \ + nnzA, reinterpret_cast(valuesA.data()), \ + rowmapA.data(), colidxA.data(), \ + reinterpret_cast(&beta), rocData.descrB, \ + nnzB, reinterpret_cast(valuesB.data()), \ + rowmapB.data(), colidxB.data(), rocData.descrC, \ + reinterpret_cast(valuesC.data()), \ + const_cast(rowmapC.data()), colidxC.data())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_pointer_mode(rocspHandle, oldPtrMode)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + s, float, float, int, int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + d, double, double, int, int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + c, Kokkos::complex, rocsparse_float_complex, int, int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE( \ + z, Kokkos::complex, rocsparse_double_complex, int, int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_DECL_ROCSPARSE_EXT(false) +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp new file mode 100644 index 0000000000..fe6b51207f --- /dev/null +++ 
b/sparse/tpls/KokkosSparse_spadd_symbolic_tpl_spec_decl.hpp @@ -0,0 +1,238 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_HPP_ +#define KOKKOSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Impl { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + TOKEN, KOKKOS_SCALAR_TYPE, TPL_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_SYMBOLIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + static void spadd_symbolic(const 
EXEC_SPACE_TYPE& exec, \ + kernelhandle_t* handle, const ORDINAL_TYPE m, \ + const ORDINAL_TYPE n, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, \ + non_const_rowmap_view_t rowmapC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_symbolic[TPL_CUSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto& cuspData = addHandle->cusparseData; \ + auto& cuspHandle = \ + KokkosKernels::Impl::CusparseSingleton::singleton().cusparseHandle; \ + \ + /* Not easy to init 'one' for cuda complex, so we don't init it. Anyway, \ + * the uninit'ed var won't affect C's pattern. \ + */ \ + TPL_SCALAR_TYPE one; \ + size_t nbytes; \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + OFFSET_TYPE nnzC = 0; \ + \ + KOKKOS_CUSPARSE_SAFE_CALL( \ + cusparseSetStream(cuspHandle, exec.cuda_stream())); \ + \ + /* https://docs.nvidia.com/cuda/cusparse/index.html#cusparsecreatematdescr \ + It sets the fields MatrixType and IndexBase to the default values \ + CUSPARSE_MATRIX_TYPE_GENERAL and CUSPARSE_INDEX_BASE_ZERO, \ + respectively, while leaving other fields uninitialized. 
*/ \ + \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrA)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrB)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&cuspData.descrC)); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparse##TOKEN##csrgeam2_bufferSizeExt( \ + cuspHandle, m, n, &one, cuspData.descrA, nnzA, NULL, rowmapA.data(), \ + colidxA.data(), &one, cuspData.descrB, nnzB, NULL, rowmapB.data(), \ + colidxB.data(), cuspData.descrC, NULL, rowmapC.data(), NULL, \ + &nbytes)); \ + cuspData.nbytes = nbytes; \ + cuspData.workspace = Kokkos::kokkos_malloc(nbytes); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgeam2Nnz( \ + cuspHandle, m, n, cuspData.descrA, nnzA, rowmapA.data(), \ + colidxA.data(), cuspData.descrB, nnzB, rowmapB.data(), \ + colidxB.data(), cuspData.descrC, rowmapC.data(), &nnzC, \ + cuspData.workspace)); \ + addHandle->set_c_nnz(nnzC); \ + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetStream(cuspHandle, NULL)); \ + \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + S, float, float, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + D, double, double, int, int, Kokkos::LayoutLeft, Kokkos::Cuda, \ + Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + C, Kokkos::complex, cuComplex, int, int, Kokkos::LayoutLeft, \ + Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE( \ + Z, Kokkos::complex, cuDoubleComplex, int, int, \ + Kokkos::LayoutLeft, Kokkos::Cuda, Kokkos::CudaSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_CUSPARSE_EXT(false) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE + +#define 
KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + KOKKOS_SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, ETI_SPEC_AVAIL) \ + template <> \ + struct SPADD_SYMBOLIC< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + using kernelhandle_t = KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const KOKKOS_SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>; \ + using rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using non_const_rowmap_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + using colidx_view_t = \ + Kokkos::View, \ + Kokkos::MemoryTraits >; \ + static void spadd_symbolic(const EXEC_SPACE_TYPE& exec, \ + kernelhandle_t* handle, const ORDINAL_TYPE m, \ + const ORDINAL_TYPE n, rowmap_view_t rowmapA, \ + colidx_view_t colidxA, rowmap_view_t rowmapB, \ + colidx_view_t colidxB, \ + non_const_rowmap_view_t rowmapC) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosSparse::spadd_symbolic[TPL_ROCSPARSE," + \ + Kokkos::ArithTraits::name() + "]"); \ + \ + auto addHandle = handle->get_spadd_handle(); \ + auto& rocData = addHandle->rocsparseData; \ + auto& rocspHandle = KokkosKernels::Impl::RocsparseSingleton::singleton() \ + .rocsparseHandle; \ + OFFSET_TYPE nnzA = colidxA.extent(0); \ + OFFSET_TYPE nnzB = colidxB.extent(0); \ + OFFSET_TYPE nnzC = 0; \ + \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, exec.hip_stream())); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrA)); \ + 
KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrB)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_create_mat_descr(&rocData.descrC)); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL(rocsparse_csrgeam_nnz( \ + rocspHandle, m, n, rocData.descrA, nnzA, rowmapA.data(), \ + colidxA.data(), rocData.descrB, nnzB, rowmapB.data(), \ + colidxB.data(), rocData.descrC, rowmapC.data(), &nnzC)); \ + addHandle->set_c_nnz(nnzC); \ + KOKKOS_ROCSPARSE_SAFE_CALL_IMPL( \ + rocsparse_set_stream(rocspHandle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT( \ + ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + float, rocsparse_int, rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + double, rocsparse_int, rocsparse_int, Kokkos::LayoutLeft, Kokkos::HIP, \ + Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + Kokkos::complex, rocsparse_int, rocsparse_int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE( \ + Kokkos::complex, rocsparse_int, rocsparse_int, \ + Kokkos::LayoutLeft, Kokkos::HIP, Kokkos::HIPSpace, ETI_SPEC_AVAIL) + +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT(true) +KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_DECL_ROCSPARSE_EXT(false) +#endif + +} // namespace Impl +} // namespace KokkosSparse + +#endif diff --git a/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp b/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp index b654c4331c..6d4db8731f 100644 --- a/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp +++ b/sparse/tpls/KokkosSparse_spadd_tpl_spec_avail.hpp @@ -21,20 +21,125 @@ namespace KokkosSparse { namespace Impl { // Specialization struct which defines whether a specialization exists // -template +template struct 
spadd_symbolic_tpl_spec_avail { enum : bool { value = false }; }; -template +template struct spadd_numeric_tpl_spec_avail { enum : bool { value = false }; }; +#define KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_symbolic_tpl_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#define KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spadd_numeric_tpl_spec_avail< \ + EXEC_SPACE_TYPE, \ + KokkosKernels::Experimental::KokkosKernelsHandle< \ + const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits > > { \ + enum : bool { value = true }; \ + }; + +#define KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL( \ + ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL(float, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL(double, 
ORDINAL_TYPE, \ + OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_SYMBOLIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL(float, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL(double, ORDINAL_TYPE, OFFSET_TYPE, \ + LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) \ + KOKKOSSPARSE_SPADD_NUMERIC_TPL_SPEC_AVAIL( \ + Kokkos::complex, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, \ + EXEC_SPACE_TYPE, MEM_SPACE_TYPE) + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL(int, int, Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE +KOKKOSSPARSE_SPADD_TPL_SPEC_AVAIL(rocsparse_int, rocsparse_int, + Kokkos::LayoutLeft, Kokkos::HIP, + Kokkos::HIPSpace) +#endif + } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp b/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp deleted file mode 100644 index 8f5ad83ed7..0000000000 --- a/sparse/tpls/KokkosSparse_spadd_tpl_spec_decl.hpp +++ /dev/null @@ -1,24 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ -#define KOKKOSPARSE_SPADD_TPL_SPEC_DECL_HPP_ - -namespace KokkosSparse { -namespace Impl {} -} // namespace KokkosSparse - -#endif diff --git a/sparse/unit_test/Test_Sparse_spadd.hpp b/sparse/unit_test/Test_Sparse_spadd.hpp index f48d86a98c..3156801dbd 100644 --- a/sparse/unit_test/Test_Sparse_spadd.hpp +++ b/sparse/unit_test/Test_Sparse_spadd.hpp @@ -135,9 +135,10 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, // initialized Kokkos::deep_copy(c_row_map, (size_type)5); auto addHandle = handle.get_spadd_handle(); - KokkosSparse::Experimental::spadd_symbolic(&handle, A.graph.row_map, - A.graph.entries, B.graph.row_map, - B.graph.entries, c_row_map); + typename Device::execution_space exec{}; + KokkosSparse::Experimental::spadd_symbolic( + exec, &handle, numRows, numCols, A.graph.row_map, A.graph.entries, + B.graph.row_map, B.graph.entries, c_row_map); size_type c_nnz = addHandle->get_c_nnz(); // Fill values, entries with incorrect incorret values_type c_values( @@ -146,9 +147,9 @@ void test_spadd(lno_t numRows, lno_t numCols, size_type minNNZ, entries_type c_entries("C entries", c_nnz); Kokkos::deep_copy(c_entries, (lno_t)5); KokkosSparse::Experimental::spadd_numeric( - &handle, A.graph.row_map, A.graph.entries, A.values, KAT::one(), - B.graph.row_map, B.graph.entries, B.values, KAT::one(), c_row_map, - c_entries, c_values); + exec, &handle, numRows, numCols, A.graph.row_map, A.graph.entries, + A.values, KAT::one(), B.graph.row_map, B.graph.entries, B.values, + KAT::one(), c_row_map, c_entries, c_values); // done with handle // create C using CRS arrays crsMat_t C("C", numRows, numCols, c_nnz, c_values, c_row_map, c_entries);