diff --git a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp index 0cfb82080984..850c121922df 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_decl.hpp @@ -647,6 +647,8 @@ class RILUK: bool isKokkosKernelsStream_; int num_streams_; std::vector exec_space_instances_; + bool hasStreamReordered_; + std::vector perm_v_; }; // NOTE (mfh 11 Feb 2015) This used to exist in order to deal with diff --git a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp index 1e90f9bdf7f3..4a5ba4bc692e 100644 --- a/packages/ifpack2/src/Ifpack2_RILUK_def.hpp +++ b/packages/ifpack2/src/Ifpack2_RILUK_def.hpp @@ -91,7 +91,8 @@ RILUK::RILUK (const Teuchos::RCP& Matrix_in) Rthresh_ (Teuchos::ScalarTraits::one ()), isKokkosKernelsSpiluk_(false), isKokkosKernelsStream_(false), - num_streams_(0) + num_streams_(0), + hasStreamReordered_(false) { allocateSolvers(); } @@ -116,7 +117,8 @@ RILUK::RILUK (const Teuchos::RCP& Matrix_in) Rthresh_ (Teuchos::ScalarTraits::one ()), isKokkosKernelsSpiluk_(false), isKokkosKernelsStream_(false), - num_streams_(0) + num_streams_(0), + hasStreamReordered_(false) { allocateSolvers(); } @@ -412,7 +414,7 @@ setParameters (const Teuchos::ParameterList& params) getParamTryingTypes (nstreams, params, paramName, prefix); } - + // Forward to trisolvers. L_solver_->setParameters(params); U_solver_->setParameters(params); @@ -427,6 +429,9 @@ setParameters (const Teuchos::ParameterList& params) if (num_streams_ >= 1) { this->isKokkosKernelsStream_ = true; + // Will we do reordering in streams? + if (params.isParameter("fact: kspiluk reordering in streams")) + hasStreamReordered_ = params.get ("fact: kspiluk reordering in streams"); } else { this->isKokkosKernelsStream_ = false; @@ -524,7 +529,7 @@ void RILUK::initialize () "matrix until the matrix is fill complete. If your matrix is a " "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and " "range Maps, if appropriate) before calling this method."); - + Teuchos::Time timer ("RILUK::initialize"); double startTime = timer.wallTime(); { // Start timing @@ -592,8 +597,10 @@ void RILUK::initialize () } else { auto lclMtx = A_local_crs->getLocalMatrixDevice(); - KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks); - + if (!hasStreamReordered_) + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks); + else + perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true); for(int i = 0; i < num_streams_; i++) { Teuchos::RCP A_local_diagblks_RowMap = rcp (new crs_map_type(A_local_diagblks[i].numRows(), A_local_diagblks[i].numRows(), @@ -654,6 +661,7 @@ void RILUK::initialize () #if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || !defined(KOKKOS_ENABLE_CUDA) || (CUDA_VERSION < 11030) L_solver_->compute ();//NOTE: It makes sense to do compute here because only the nonzero pattern is involved in trisolve compute #endif + if (!isKokkosKernelsStream_) { U_solver_->setMatrix (U_); } @@ -1050,7 +1058,11 @@ void RILUK::compute () A_local_values_ = lclMtx.values; } else { - KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks); + if (!hasStreamReordered_) + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks); + else + perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true); + A_local_diagblks_rowmap_v_ = std::vector(num_streams_); A_local_diagblks_entries_v_ = std::vector(num_streams_); A_local_diagblks_values_v_ = std::vector(num_streams_); @@ -1198,77 +1210,187 @@ apply (const Tpetra::MultiVectorgetLocalViewDevice(Tpetra::Access::ReadOnly); + auto ReorderedX_lcl = ReorderedX_j->getLocalViewDevice(Tpetra::Access::ReadWrite); + local_ordinal_type stream_begin = 0; + local_ordinal_type stream_end; + for(int i = 0; i < num_streams_; i++) { + auto perm_i = perm_v_[i]; + stream_end = stream_begin + perm_i.extent(0); + auto X_lcl_sub = Kokkos::subview (X_lcl, Kokkos::make_pair(stream_begin, stream_end), 0); + auto ReorderedX_lcl_sub = Kokkos::subview (ReorderedX_lcl, Kokkos::make_pair(stream_begin, stream_end), 0); + Kokkos::parallel_for( Kokkos::RangePolicy(0, static_cast(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) { + ReorderedX_lcl_sub(perm_i(ii)) = X_lcl_sub(ii); + }); + stream_begin = stream_end; + } + } + + if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y. #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030) - //NOTE (Nov-15-2022): - //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) - //since cusparseSpSV_solve() does not support in-place computation - MV Y_tmp (Y.getMap (), Y.getNumVectors ()); + //NOTE (Nov-15-2022): + //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) + //since cusparseSpSV_solve() does not support in-place computation + MV Y_tmp (ReorderedY.getMap (), ReorderedY.getNumVectors ()); + + // Start by solving L Y_tmp = X for Y_tmp. + L_solver_->apply (ReorderedX, Y_tmp, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D Y = Y. The operation lets us do this in place in Y, so we can + // write "solve D Y = Y for Y." + Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); + } - // Start by solving L Y_tmp = X for Y_tmp. - L_solver_->apply (X, Y_tmp, mode); + U_solver_->apply (Y_tmp, ReorderedY, mode); // Solve U Y = Y_tmp. +#else + // Start by solving L Y = X for Y. + L_solver_->apply (ReorderedX, ReorderedY, mode); - if (!this->isKokkosKernelsSpiluk_) { - // Solve D Y = Y. The operation lets us do this in place in Y, so we can - // write "solve D Y = Y for Y." - Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); - } + if (!this->isKokkosKernelsSpiluk_) { + // Solve D Y = Y. The operation lets us do this in place in Y, so we can + // write "solve D Y = Y for Y." + Y.elementWiseMultiply (one, *D_, Y, zero); + } - U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp. + U_solver_->apply (ReorderedY, ReorderedY, mode); // Solve U Y = Y. +#endif + } + else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030) + //NOTE (Nov-15-2022): + //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) + //since cusparseSpSV_solve() does not support in-place computation + MV Y_tmp (ReorderedY.getMap (), ReorderedY.getNumVectors ()); + + // Start by solving U^P Y_tmp = X for Y_tmp. + U_solver_->apply (ReorderedX, Y_tmp, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D^P Y = Y. + // + // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we + // need to do an elementwise multiply with the conjugate of + // D_, not just with D_ itself. + Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); + } + + L_solver_->apply (Y_tmp, ReorderedY, mode); // Solve L^P Y = Y_tmp. #else - // Start by solving L Y = X for Y. - L_solver_->apply (X, Y, mode); - - if (!this->isKokkosKernelsSpiluk_) { - // Solve D Y = Y. The operation lets us do this in place in Y, so we can - // write "solve D Y = Y for Y." - Y.elementWiseMultiply (one, *D_, Y, zero); + // Start by solving U^P Y = X for Y. + U_solver_->apply (ReorderedX, ReorderedY, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D^P Y = Y. + // + // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we + // need to do an elementwise multiply with the conjugate of + // D_, not just with D_ itself. + Y.elementWiseMultiply (one, *D_, Y, zero); + } + + L_solver_->apply (ReorderedY, ReorderedY, mode); // Solve L^P Y = Y. +#endif } - U_solver_->apply (Y, Y, mode); // Solve U Y = Y. -#endif + for (size_t j = 0; j < Y.getNumVectors(); j++) { + auto Y_j = Y.getVectorNonConst(j); + auto ReorderedY_j = ReorderedY.getVector(j); + auto Y_lcl = Y_j->getLocalViewDevice(Tpetra::Access::ReadWrite); + auto ReorderedY_lcl = ReorderedY_j->getLocalViewDevice(Tpetra::Access::ReadOnly); + local_ordinal_type stream_begin = 0; + local_ordinal_type stream_end; + for(int i = 0; i < num_streams_; i++) { + auto perm_i = perm_v_[i]; + stream_end = stream_begin + perm_i.extent(0); + auto Y_lcl_sub = Kokkos::subview (Y_lcl, Kokkos::make_pair(stream_begin, stream_end), 0); + auto ReorderedY_lcl_sub = Kokkos::subview (ReorderedY_lcl, Kokkos::make_pair(stream_begin, stream_end), 0); + Kokkos::parallel_for( Kokkos::RangePolicy(0, static_cast(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) { + Y_lcl_sub(ii) = ReorderedY_lcl_sub(perm_i(ii)); + }); + stream_begin = stream_end; + } + } } - else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). + else { + if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y. #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030) - //NOTE (Nov-15-2022): - //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) - //since cusparseSpSV_solve() does not support in-place computation - MV Y_tmp (Y.getMap (), Y.getNumVectors ()); + //NOTE (Nov-15-2022): + //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) + //since cusparseSpSV_solve() does not support in-place computation + MV Y_tmp (Y.getMap (), Y.getNumVectors ()); + + // Start by solving L Y_tmp = X for Y_tmp. + L_solver_->apply (X, Y_tmp, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D Y = Y. The operation lets us do this in place in Y, so we can + // write "solve D Y = Y for Y." + Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); + } - // Start by solving U^P Y_tmp = X for Y_tmp. - U_solver_->apply (X, Y_tmp, mode); + U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp. +#else + // Start by solving L Y = X for Y. + L_solver_->apply (X, Y, mode); - if (!this->isKokkosKernelsSpiluk_) { - // Solve D^P Y = Y. - // - // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we - // need to do an elementwise multiply with the conjugate of - // D_, not just with D_ itself. - Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); - } + if (!this->isKokkosKernelsSpiluk_) { + // Solve D Y = Y. The operation lets us do this in place in Y, so we can + // write "solve D Y = Y for Y." + Y.elementWiseMultiply (one, *D_, Y, zero); + } - L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp. + U_solver_->apply (Y, Y, mode); // Solve U Y = Y. +#endif + } + else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T). +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030) + //NOTE (Nov-15-2022): + //This is a workaround for Cuda >= 11.3 (using cusparseSpSV) + //since cusparseSpSV_solve() does not support in-place computation + MV Y_tmp (Y.getMap (), Y.getNumVectors ()); + + // Start by solving U^P Y_tmp = X for Y_tmp. + U_solver_->apply (X, Y_tmp, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D^P Y = Y. + // + // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we + // need to do an elementwise multiply with the conjugate of + // D_, not just with D_ itself. + Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero); + } + + L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp. #else - // Start by solving U^P Y = X for Y. - U_solver_->apply (X, Y, mode); - - if (!this->isKokkosKernelsSpiluk_) { - // Solve D^P Y = Y. - // - // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we - // need to do an elementwise multiply with the conjugate of - // D_, not just with D_ itself. - Y.elementWiseMultiply (one, *D_, Y, zero); - } - - L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y. + // Start by solving U^P Y = X for Y. + U_solver_->apply (X, Y, mode); + + if (!this->isKokkosKernelsSpiluk_) { + // Solve D^P Y = Y. + // + // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we + // need to do an elementwise multiply with the conjugate of + // D_, not just with D_ itself. + Y.elementWiseMultiply (one, *D_, Y, zero); + } + + L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y. #endif + } } } else { // alpha != 1 or beta != 0 diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp index f3fbec18369f..2b89c1a2f74e 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_Utils.hpp @@ -25,6 +25,7 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_BsrMatrix.hpp" #include "Kokkos_Bitset.hpp" +#include "KokkosGraph_RCM.hpp" #ifdef KOKKOSKERNELS_HAVE_PARALLEL_GNUSORT #include @@ -2415,15 +2416,23 @@ void kk_extract_subblock_crsmatrix_sequential( * @tparam crsMat_t The type of the CRS matrix. * @param A [in] The square CrsMatrix. It is expected that column indices are * in ascending order + * @param UseRCMReordering [in] Boolean indicating whether applying (true) RCM + * reordering to diagonal blocks or not (false) (default: false) * @param DiagBlk_v [out] The vector of the extracted the CRS diagonal blocks * (1 <= the number of diagonal blocks <= A_nrows) + * @return a vector of lists of vertices in RCM order (a list per a diagonal + * block) if UseRCMReordering is true, or an empty vector if UseRCMReordering is + * false * * Usage Example: - * kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b); + * perm = kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_out, + * UseRCMReordering); */ template -void kk_extract_diagonal_blocks_crsmatrix_sequential( - const crsMat_t &A, std::vector &DiagBlk_v) { +std::vector +kk_extract_diagonal_blocks_crsmatrix_sequential( + const crsMat_t &A, std::vector &DiagBlk_v, + bool UseRCMReordering = false) { using row_map_type = typename crsMat_t::row_map_type; using entries_type = typename crsMat_t::index_type; using values_type = typename crsMat_t::values_type; @@ -2437,6 +2446,7 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( using ordinal_type = typename crsMat_t::non_const_ordinal_type; using size_type = typename crsMat_t::non_const_size_type; + using value_type = typename crsMat_t::non_const_value_type; using offset_view1d_type = Kokkos::View; @@ -2463,8 +2473,12 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( throw std::runtime_error(os.str()); } + std::vector perm_v; + std::vector perm_h_v; + if (n_blocks == 1) { // One block case: simply shallow copy A to DiagBlk_v[0] + // Note: always not applying RCM reordering, for now DiagBlk_v[0] = crsMat_t(A); } else { // n_blocks > 1 @@ -2487,12 +2501,10 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( ? (A_nrows / n_blocks) : (A_nrows / n_blocks + 1); - std::vector row_map_v(n_blocks); - std::vector entries_v(n_blocks); - std::vector values_v(n_blocks); - std::vector row_map_h_v(n_blocks); - std::vector entries_h_v(n_blocks); - std::vector values_h_v(n_blocks); + if (UseRCMReordering) { + perm_v.resize(n_blocks); + perm_h_v.resize(n_blocks); + } ordinal_type blk_row_start = 0; // first row index of i-th diagonal block ordinal_type blk_col_start = 0; // first col index of i-th diagonal block @@ -2509,37 +2521,110 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( // First round: count i-th non-zeros or size of entries_v[i] and find // the first and last column indices at each row size_type blk_nnz = 0; - offset_view1d_type first("first", blk_nrows); // first position per row - offset_view1d_type last("last", blk_nrows); // last position per row + offset_view1d_type first( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "first"), + blk_nrows); // first position per row + offset_view1d_type last( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "last"), + blk_nrows); // last position per row kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential( A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows, blk_ncols, blk_nnz, first, last); // Second round: extract - row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1); - entries_v[i] = out_entries_type("entries_v", blk_nnz); - values_v[i] = out_values_type("values_v", blk_nnz); - row_map_h_v[i] = - out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1); - entries_h_v[i] = out_entries_hostmirror_type("entries_h_v", blk_nnz); - values_h_v[i] = out_values_hostmirror_type("values_h_v", blk_nnz); + out_row_map_type row_map( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map"), + blk_nrows + 1); + out_entries_type entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries"), + blk_nnz); + out_values_type values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values"), blk_nnz); + out_row_map_hostmirror_type row_map_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_h"), + blk_nrows + 1); + out_entries_hostmirror_type entries_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_h"), + blk_nnz); + out_values_hostmirror_type values_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_h"), + blk_nnz); kk_extract_subblock_crsmatrix_sequential( A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, first, - last, row_map_h_v[i], entries_h_v[i], values_h_v[i]); + last, row_map_h, entries_h, values_h); + + if (!UseRCMReordering) { + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + Kokkos::deep_copy(values, values_h); + } else { + perm_h_v[i] = KokkosGraph::Experimental::graph_rcm< + Kokkos::DefaultHostExecutionSpace>(row_map_h, entries_h); + perm_v[i] = out_entries_type( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "perm_v"), + perm_h_v[i].extent(0)); + + out_row_map_hostmirror_type row_map_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_perm_h"), + blk_nrows + 1); + out_entries_hostmirror_type entries_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_perm_h"), + blk_nnz); + out_values_hostmirror_type values_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_perm_h"), + blk_nnz); + + out_entries_hostmirror_type reverseperm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "reverseperm_h"), + blk_nrows); + for (ordinal_type ii = 0; ii < blk_nrows; ii++) + reverseperm_h(perm_h_v[i](ii)) = ii; + + std::map colIdx_Value_rcm; + + // Loop through each row of the reordered matrix + size_type cnt = 0; + for (ordinal_type ii = 0; ii < blk_nrows; ii++) { + colIdx_Value_rcm.clear(); + // ii: reordered index + ordinal_type origRow = reverseperm_h( + ii); // get the original row idx of the reordered row idx, ii + for (size_type j = row_map_h(origRow); j < row_map_h(origRow + 1); + j++) { + ordinal_type origEi = entries_h(j); + value_type origV = values_h(j); + ordinal_type Ei = + perm_h_v[i](origEi); // get the reordered col idx of the + // original col idx, origEi + colIdx_Value_rcm[Ei] = origV; + } + row_map_perm_h(ii) = cnt; + for (typename std::map::iterator it = + colIdx_Value_rcm.begin(); + it != colIdx_Value_rcm.end(); ++it) { + entries_perm_h(cnt) = it->first; + values_perm_h(cnt) = it->second; + cnt++; + } + } + row_map_perm_h(blk_nrows) = cnt; - Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); - Kokkos::deep_copy(entries_v[i], entries_h_v[i]); - Kokkos::deep_copy(values_v[i], values_h_v[i]); + Kokkos::deep_copy(row_map, row_map_perm_h); + Kokkos::deep_copy(entries, entries_perm_h); + Kokkos::deep_copy(values, values_perm_h); + Kokkos::deep_copy(perm_v[i], perm_h_v[i]); + } DiagBlk_v[i] = crsMat_t("CrsMatrix", blk_nrows, blk_ncols, blk_nnz, - values_v[i], row_map_v[i], entries_v[i]); + values, row_map, entries); blk_row_start += blk_nrows; } // for (ordinal_type i = 0; i < n_blocks; i++) } // A_nrows >= 1 } // n_blocks > 1 + return perm_v; } } // namespace Impl diff --git a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp index 327780dec32d..28674ad353fe 100644 --- a/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp +++ b/packages/kokkos-kernels/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -15,6 +15,8 @@ //@HEADER #include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosBlas1_nrm2.hpp" #include "KokkosKernels_TestUtils.hpp" namespace Test { @@ -31,6 +33,7 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { crsMat_t A; std::vector DiagBlks(nblocks); + std::vector DiagBlks_rcm(nblocks); if (nrows != 0) { // Generate test matrix @@ -84,6 +87,10 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, DiagBlks); + auto perm = + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential( + A, DiagBlks_rcm, true); + // Checking lno_t numRows = 0; lno_t numCols = 0; @@ -125,6 +132,40 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { col_start += DiagBlks[i].numCols(); } EXPECT_TRUE(flag); + + // Checking RCM + if (!perm.empty()) { + scalar_t one = scalar_t(1.0); + scalar_t zero = scalar_t(0.0); + scalar_t mone = scalar_t(-1.0); + for (int i = 0; i < nblocks; i++) { + ValuesType In("In", DiagBlks[i].numRows()); + ValuesType Out("Out", DiagBlks[i].numRows()); + + ValuesType_hm h_Out = Kokkos::create_mirror_view(Out); + ValuesType_hm h_Out_tmp = Kokkos::create_mirror(Out); + + Kokkos::deep_copy(In, one); + + auto h_perm = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), perm[i]); + + KokkosSparse::spmv("N", one, DiagBlks_rcm[i], In, zero, Out); + + Kokkos::deep_copy(h_Out_tmp, Out); + for (lno_t ii = 0; ii < static_cast(DiagBlks[i].numRows()); + ii++) { + lno_t rcm_ii = h_perm(ii); + h_Out(ii) = h_Out_tmp(rcm_ii); + } + Kokkos::deep_copy(Out, h_Out); + + KokkosSparse::spmv("N", one, DiagBlks[i], In, mone, Out); + + double nrm_val = KokkosBlas::nrm2(Out); + EXPECT_LE(nrm_val, 1e-9); + } + } } } } // namespace Test @@ -136,9 +177,9 @@ void test_extract_diagonal_blocks() { Test::run_test_extract_diagonal_blocks( 0, s); Test::run_test_extract_diagonal_blocks( - 12, s); + 153, s); Test::run_test_extract_diagonal_blocks( - 123, s); + 1553, s); } }