diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index f3fbec1836..2b89c1a2f7 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -25,6 +25,7 @@ #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_BsrMatrix.hpp" #include "Kokkos_Bitset.hpp" +#include "KokkosGraph_RCM.hpp" #ifdef KOKKOSKERNELS_HAVE_PARALLEL_GNUSORT #include @@ -2415,15 +2416,23 @@ void kk_extract_subblock_crsmatrix_sequential( * @tparam crsMat_t The type of the CRS matrix. * @param A [in] The square CrsMatrix. It is expected that column indices are * in ascending order + * @param UseRCMReordering [in] Boolean indicating whether to apply RCM + * reordering to the diagonal blocks (true) or not (false) (default: false) * @param DiagBlk_v [out] The vector of the extracted the CRS diagonal blocks * (1 <= the number of diagonal blocks <= A_nrows) + * @return a vector of lists of vertices in RCM order (one list per diagonal + * block) if UseRCMReordering is true, or an empty vector if UseRCMReordering is + * false or there is only one diagonal block * * Usage Example: - * kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_in_b); + * perm = kk_extract_diagonal_blocks_crsmatrix_sequential(A_in, diagBlk_out, + * UseRCMReordering); */ template -void kk_extract_diagonal_blocks_crsmatrix_sequential( - const crsMat_t &A, std::vector &DiagBlk_v) { +std::vector +kk_extract_diagonal_blocks_crsmatrix_sequential( + const crsMat_t &A, std::vector &DiagBlk_v, + bool UseRCMReordering = false) { using row_map_type = typename crsMat_t::row_map_type; using entries_type = typename crsMat_t::index_type; using values_type = typename crsMat_t::values_type; @@ -2437,6 +2446,7 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( using ordinal_type = typename crsMat_t::non_const_ordinal_type; using size_type = typename crsMat_t::non_const_size_type; + using value_type = typename crsMat_t::non_const_value_type; using offset_view1d_type = Kokkos::View; @@ -2463,8 +2473,12 @@ 
void kk_extract_diagonal_blocks_crsmatrix_sequential( throw std::runtime_error(os.str()); } + std::vector perm_v; + std::vector perm_h_v; + if (n_blocks == 1) { // One block case: simply shallow copy A to DiagBlk_v[0] + // Note: RCM reordering is never applied in this case, for now DiagBlk_v[0] = crsMat_t(A); } else { // n_blocks > 1 @@ -2487,12 +2501,10 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( ? (A_nrows / n_blocks) : (A_nrows / n_blocks + 1); - std::vector row_map_v(n_blocks); - std::vector entries_v(n_blocks); - std::vector values_v(n_blocks); - std::vector row_map_h_v(n_blocks); - std::vector entries_h_v(n_blocks); - std::vector values_h_v(n_blocks); + if (UseRCMReordering) { + perm_v.resize(n_blocks); + perm_h_v.resize(n_blocks); + } ordinal_type blk_row_start = 0; // first row index of i-th diagonal block ordinal_type blk_col_start = 0; // first col index of i-th diagonal block @@ -2509,37 +2521,110 @@ void kk_extract_diagonal_blocks_crsmatrix_sequential( // First round: count i-th non-zeros or size of entries_v[i] and find // the first and last column indices at each row size_type blk_nnz = 0; - offset_view1d_type first("first", blk_nrows); // first position per row - offset_view1d_type last("last", blk_nrows); // last position per row + offset_view1d_type first( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "first"), + blk_nrows); // first position per row + offset_view1d_type last( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "last"), + blk_nrows); // last position per row kk_find_nnz_first_last_indices_subblock_crsmatrix_sequential( A_row_map_h, A_entries_h, blk_row_start, blk_col_start, blk_nrows, blk_ncols, blk_nnz, first, last); // Second round: extract - row_map_v[i] = out_row_map_type("row_map_v", blk_nrows + 1); - entries_v[i] = out_entries_type("entries_v", blk_nnz); - values_v[i] = out_values_type("values_v", blk_nnz); - row_map_h_v[i] = - out_row_map_hostmirror_type("row_map_h_v", blk_nrows + 1); - entries_h_v[i] = 
out_entries_hostmirror_type("entries_h_v", blk_nnz); - values_h_v[i] = out_values_hostmirror_type("values_h_v", blk_nnz); + out_row_map_type row_map( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map"), + blk_nrows + 1); + out_entries_type entries( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries"), + blk_nnz); + out_values_type values( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values"), blk_nnz); + out_row_map_hostmirror_type row_map_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_h"), + blk_nrows + 1); + out_entries_hostmirror_type entries_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_h"), + blk_nnz); + out_values_hostmirror_type values_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_h"), + blk_nnz); kk_extract_subblock_crsmatrix_sequential( A_entries_h, A_values_h, blk_col_start, blk_nrows, blk_nnz, first, - last, row_map_h_v[i], entries_h_v[i], values_h_v[i]); + last, row_map_h, entries_h, values_h); + + if (!UseRCMReordering) { + Kokkos::deep_copy(row_map, row_map_h); + Kokkos::deep_copy(entries, entries_h); + Kokkos::deep_copy(values, values_h); + } else { + perm_h_v[i] = KokkosGraph::Experimental::graph_rcm< + Kokkos::DefaultHostExecutionSpace>(row_map_h, entries_h); + perm_v[i] = out_entries_type( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "perm_v"), + perm_h_v[i].extent(0)); + + out_row_map_hostmirror_type row_map_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "row_map_perm_h"), + blk_nrows + 1); + out_entries_hostmirror_type entries_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entries_perm_h"), + blk_nnz); + out_values_hostmirror_type values_perm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "values_perm_h"), + blk_nnz); + + out_entries_hostmirror_type reverseperm_h( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "reverseperm_h"), + blk_nrows); + for (ordinal_type ii = 0; ii < blk_nrows; ii++) + reverseperm_h(perm_h_v[i](ii)) = ii; + + std::map 
colIdx_Value_rcm; + + // Loop through each row of the reordered matrix + size_type cnt = 0; + for (ordinal_type ii = 0; ii < blk_nrows; ii++) { + colIdx_Value_rcm.clear(); + // ii: reordered index + ordinal_type origRow = reverseperm_h( + ii); // get the original row idx of the reordered row idx, ii + for (size_type j = row_map_h(origRow); j < row_map_h(origRow + 1); + j++) { + ordinal_type origEi = entries_h(j); + value_type origV = values_h(j); + ordinal_type Ei = + perm_h_v[i](origEi); // get the reordered col idx of the + // original col idx, origEi + colIdx_Value_rcm[Ei] = origV; + } + row_map_perm_h(ii) = cnt; + for (typename std::map::iterator it = + colIdx_Value_rcm.begin(); + it != colIdx_Value_rcm.end(); ++it) { + entries_perm_h(cnt) = it->first; + values_perm_h(cnt) = it->second; + cnt++; + } + } + row_map_perm_h(blk_nrows) = cnt; - Kokkos::deep_copy(row_map_v[i], row_map_h_v[i]); - Kokkos::deep_copy(entries_v[i], entries_h_v[i]); - Kokkos::deep_copy(values_v[i], values_h_v[i]); + Kokkos::deep_copy(row_map, row_map_perm_h); + Kokkos::deep_copy(entries, entries_perm_h); + Kokkos::deep_copy(values, values_perm_h); + Kokkos::deep_copy(perm_v[i], perm_h_v[i]); + } DiagBlk_v[i] = crsMat_t("CrsMatrix", blk_nrows, blk_ncols, blk_nnz, - values_v[i], row_map_v[i], entries_v[i]); + values, row_map, entries); blk_row_start += blk_nrows; } // for (ordinal_type i = 0; i < n_blocks; i++) } // A_nrows >= 1 } // n_blocks > 1 + return perm_v; } } // namespace Impl diff --git a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp index 327780dec3..28674ad353 100644 --- a/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp +++ b/sparse/unit_test/Test_Sparse_extractCrsDiagonalBlocks.hpp @@ -15,6 +15,8 @@ //@HEADER #include "KokkosSparse_Utils.hpp" +#include "KokkosSparse_spmv.hpp" +#include "KokkosBlas1_nrm2.hpp" #include "KokkosKernels_TestUtils.hpp" namespace Test { @@ -31,6 +33,7 @@ void 
run_test_extract_diagonal_blocks(int nrows, int nblocks) { crsMat_t A; std::vector DiagBlks(nblocks); + std::vector DiagBlks_rcm(nblocks); if (nrows != 0) { // Generate test matrix @@ -84,6 +87,10 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(A, DiagBlks); + auto perm = + KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential( + A, DiagBlks_rcm, true); + // Checking lno_t numRows = 0; lno_t numCols = 0; @@ -125,6 +132,40 @@ void run_test_extract_diagonal_blocks(int nrows, int nblocks) { col_start += DiagBlks[i].numCols(); } EXPECT_TRUE(flag); + + // Checking RCM + if (!perm.empty()) { + scalar_t one = scalar_t(1.0); + scalar_t zero = scalar_t(0.0); + scalar_t mone = scalar_t(-1.0); + for (int i = 0; i < nblocks; i++) { + ValuesType In("In", DiagBlks[i].numRows()); + ValuesType Out("Out", DiagBlks[i].numRows()); + + ValuesType_hm h_Out = Kokkos::create_mirror_view(Out); + ValuesType_hm h_Out_tmp = Kokkos::create_mirror(Out); + + Kokkos::deep_copy(In, one); + + auto h_perm = + Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), perm[i]); + + KokkosSparse::spmv("N", one, DiagBlks_rcm[i], In, zero, Out); + + Kokkos::deep_copy(h_Out_tmp, Out); + for (lno_t ii = 0; ii < static_cast(DiagBlks[i].numRows()); + ii++) { + lno_t rcm_ii = h_perm(ii); + h_Out(ii) = h_Out_tmp(rcm_ii); + } + Kokkos::deep_copy(Out, h_Out); + + KokkosSparse::spmv("N", one, DiagBlks[i], In, mone, Out); + + double nrm_val = KokkosBlas::nrm2(Out); + EXPECT_LE(nrm_val, 1e-9); + } + } } } } // namespace Test @@ -136,9 +177,9 @@ void test_extract_diagonal_blocks() { Test::run_test_extract_diagonal_blocks( 0, s); Test::run_test_extract_diagonal_blocks( - 12, s); + 153, s); Test::run_test_extract_diagonal_blocks( - 123, s); + 1553, s); } }