From 8fc5d878f0f938544cf140323ef6a96168290902 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 21 Dec 2021 08:30:44 -0700 Subject: [PATCH 01/15] Add BsrMatrix spmv and a naive BlockCrsMatrix spmv. --- perf_test/sparse/CMakeLists.txt | 132 +- .../sparse/KokkosSparse_spmv_blockcrs.cpp | 562 ++++++++ perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 573 ++++++++ src/CMakeLists.txt | 21 + ...e_spmv_blockcrsmatrix_eti_spec_inst.cpp.in | 56 + ...pmv_mv_blockcrsmatrix_eti_spec_inst.cpp.in | 56 + ...rse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in | 56 + ..._spmv_blockcrsmatrix_eti_spec_avail.hpp.in | 56 + ...e_spmv_blockcrsmatrix_eti_spec_decl.hpp.in | 56 + ...parse_spmv_bsrmatrix_eti_spec_avail.hpp.in | 6 +- ...mv_mv_blockcrsmatrix_eti_spec_avail.hpp.in | 56 + ...pmv_mv_blockcrsmatrix_eti_spec_decl.hpp.in | 56 + ...se_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in | 56 + ...rse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in | 56 + ...rse_spmv_blockcrsmatrix_tpl_spec_avail.hpp | 70 + ...rse_spmv_blockcrsmatrix_tpl_spec_decl.hpp} | 18 +- ...osSparse_spmv_bsrmatrix_tpl_spec_avail.hpp | 327 +++++ ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 865 ++++++++++++ src/sparse/KokkosSparse_BsrMatrix.hpp | 86 +- src/sparse/KokkosSparse_spmv.hpp | 737 +++++++++-- .../KokkosSparse_spmv_blockcrsmatrix_impl.hpp | 48 + .../KokkosSparse_spmv_blockcrsmatrix_spec.hpp | 421 ++++++ .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 1161 ++++++++++++++++- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 221 +++- unit_test/sparse/Test_Sparse.hpp | 2 + .../sparse/Test_Sparse_spmv_blockcrs.hpp | 750 +++++++++++ unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 855 ++++++++++++ 27 files changed, 7137 insertions(+), 222 deletions(-) create mode 100644 perf_test/sparse/KokkosSparse_spmv_blockcrs.cpp create mode 100644 perf_test/sparse/KokkosSparse_spmv_bsr.cpp create mode 100644 src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_blockcrsmatrix_eti_spec_inst.cpp.in create mode 100644 src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_inst.cpp.in create mode 100644 src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spmv_blockcrsmatrix_eti_spec_avail.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spmv_blockcrsmatrix_eti_spec_decl.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_avail.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_decl.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in create mode 100644 src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in create mode 100644 src/impl/tpls/KokkosSparse_spmv_blockcrsmatrix_tpl_spec_avail.hpp rename src/{sparse/impl/KokkosSparse_BsrMatrix_impl.hpp => impl/tpls/KokkosSparse_spmv_blockcrsmatrix_tpl_spec_decl.hpp} (82%) create mode 100644 src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp create mode 100644 src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp create mode 100644 src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp create mode 100644 src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp create mode 100644 unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp create mode 100644 unit_test/sparse/Test_Sparse_spmv_bsr.hpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index c515ef2986..1566a84775 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -2,98 +2,108 @@ KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_pcg - SOURCES KokkosSparse_pcg.cpp - ) + sparse_pcg + SOURCES KokkosSparse_pcg.cpp +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_block_pcg - SOURCES KokkosSparse_block_pcg.cpp - ) + sparse_block_pcg + SOURCES KokkosSparse_block_pcg.cpp +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_spgemm - SOURCES KokkosSparse_spgemm.cpp - ) + sparse_spgemm + SOURCES KokkosSparse_spgemm.cpp +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_spgemm_jacobi - SOURCES KokkosSparse_spgemm_jacobi.cpp - ) - + sparse_spgemm_jacobi + SOURCES KokkosSparse_spgemm_jacobi.cpp +) + KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/spmv) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_spadd - SOURCES KokkosSparse_spadd.cpp - ) + sparse_spadd + SOURCES KokkosSparse_spadd.cpp +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_spmv_struct - SOURCES KokkosSparse_spmv_struct.cpp - ) + sparse_spmv_struct + SOURCES KokkosSparse_spmv_struct.cpp +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_spmv_struct_tuning - SOURCES KokkosSparse_spmv_struct_tuning.cpp - ) + sparse_spmv_struct_tuning + SOURCES KokkosSparse_spmv_struct_tuning.cpp +) set(utilities_list) -IF(KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) -LIST(APPEND utilities_list ../PerfTestUtilities.cpp) -ENDIF() +IF (KokkosKernels_ENABLE_TESTS_AND_PERFSUITE) + LIST(APPEND utilities_list ../PerfTestUtilities.cpp) +ENDIF () KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_spmv - SOURCES KokkosSparse_spmv.cpp KokkosSparse_spmv_test.cpp spmv/OpenMPSmartStatic_SPMV.cpp - ${utilities_list} - ) + sparse_spmv + SOURCES KokkosSparse_spmv.cpp KokkosSparse_spmv_test.cpp spmv/OpenMPSmartStatic_SPMV.cpp + ${utilities_list} +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_kk_spmv - SOURCES KokkosSparse_kk_spmv.cpp - ) - -IF(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) - KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_spmv_merge - SOURCES KokkosSparse_spmv_merge.cpp + sparse_kk_spmv + SOURCES KokkosSparse_kk_spmv.cpp +) + +IF (KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_spmv_merge + SOURCES KokkosSparse_spmv_merge.cpp ) -ENDIF() +ENDIF () KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_sptrsv - SOURCES KokkosSparse_sptrsv.cpp - ) + sparse_spmv_blockcrs + SOURCES KokkosSparse_spmv_blockcrs.cpp +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_sptrsv_cholmod - SOURCES KokkosSparse_sptrsv_cholmod.cpp - ) + sparse_spmv_bsr + SOURCES KokkosSparse_spmv_bsr.cpp +) -IF(NOT ${KOKKOS_HAS_TRILINOS}) -# Disable this perf test with Trilinos builds to workaround -# -Werror issues error: declaration of xyz with C language linkage KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_sptrsv_superlu - SOURCES KokkosSparse_sptrsv_superlu.cpp - ) -ENDIF() + sparse_sptrsv + SOURCES KokkosSparse_sptrsv.cpp +) + +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_sptrsv_cholmod + SOURCES KokkosSparse_sptrsv_cholmod.cpp +) + +IF (NOT ${KOKKOS_HAS_TRILINOS}) + # Disable this perf test with Trilinos builds to workaround + # -Werror issues error: declaration of xyz with C language linkage + KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_sptrsv_superlu + SOURCES KokkosSparse_sptrsv_superlu.cpp + ) +ENDIF () KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_sptrsv_supernode - SOURCES KokkosSparse_sptrsv_supernode.cpp - ) + sparse_sptrsv_supernode + SOURCES KokkosSparse_sptrsv_supernode.cpp +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_gs - SOURCES KokkosSparse_gs.cpp - TESTONLYLIBS kokkoskernelsperf_gtest - ) + sparse_gs + SOURCES KokkosSparse_gs.cpp + TESTONLYLIBS kokkoskernelsperf_gtest +) KOKKOSKERNELS_ADD_EXECUTABLE( - sparse_spiluk - SOURCES KokkosSparse_spiluk.cpp - ) + sparse_spiluk + SOURCES KokkosSparse_spiluk.cpp +) diff --git a/perf_test/sparse/KokkosSparse_spmv_blockcrs.cpp b/perf_test/sparse/KokkosSparse_spmv_blockcrs.cpp new file mode 100644 index 0000000000..9ccd63f58f --- /dev/null +++ b/perf_test/sparse/KokkosSparse_spmv_blockcrs.cpp @@ -0,0 +1,562 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include "KokkosKernels_default_types.hpp" +#include + +namespace details { + +enum class Implementation : int { KokkosKernels = 0, Cuda = 1, MKL = 2 }; + +/// +/// Define default types +/// +typedef double Scalar; +typedef int Ordinal; +/// +////////////////////////// + +/// Random generator +template +inline scalar_t random() { + auto const max = static_cast(RAND_MAX) + static_cast(1); + return static_cast(std::rand()) / max; +} + +template +inline void set_random_value(scalar_t &v) { + v = random(); +} + +template +inline void set_random_value(Kokkos::complex &v) { + Scalar vre = random(); + Scalar vim = random(); + v = Kokkos::complex(vre, vim); +} + +template +inline void set_random_value(std::complex &v) { + scalar_t vre = random(); + scalar_t vim = random(); + v = std::complex(vre, vim); +} + +template +void make_block_entries( + const KokkosSparse::CrsMatrix &mat_b1, + int blockSize, std::vector &mat_rowmap, + std::vector &mat_colidx, std::vector &mat_val) { + Ordinal nRow = blockSize * mat_b1.numRows(); + Ordinal nCol = blockSize * mat_b1.numCols(); + size_t nnz = static_cast(blockSize) * static_cast(blockSize) * + mat_b1.nnz(); + + mat_val.resize(nnz); + for (size_t ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); + + // + // Create graph for CrsMatrix + // + + mat_rowmap.assign(nRow + 1, 0); + mat_colidx.assign(nnz, 0); + + for (Ordinal ir = 0; ir < mat_b1.numRows(); ++ir) { + const auto jbeg = mat_b1.graph.row_map(ir); + const auto jend = mat_b1.graph.row_map(ir + 1); + for (Ordinal ib = 0; ib < blockSize; ++ib) { + const Ordinal my_row = ir * blockSize + ib; + mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; + for (Ordinal ijk = jbeg; ijk < jend; ++ijk) { + const auto col0 = mat_b1.graph.entries(ijk); + for (Ordinal jb = 0; jb < blockSize; ++jb) { + mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + col0 * blockSize + jb; + } + } + } + } // for (Ordinal ir = 0; ir < mat_b1.numRows(); ++ir) +} + +template +int test_blockcrs_matrix_single_vec( + const char fOp[], + KokkosSparse::CrsMatrix + mat_b1, + int test, const char *filename, int rows_per_thread, int team_size, + int vector_length, int schedule, int loop, const scalar_t alpha, + const scalar_t beta, const int bMax) { + typedef typename KokkosSparse::CrsMatrix< + scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> + crsMat_type; + + typedef typename crsMat_type::values_type::non_const_type scalar_view_t; + typedef scalar_view_t x_vector_type; + typedef scalar_view_t y_vector_type; + + srand(17312837); + + int num_errors = 0; + const auto bMax_o = static_cast(bMax); + for (Ordinal blockSize = 1; blockSize <= bMax_o; ++blockSize) { + Ordinal nRow = blockSize * mat_b1.numRows(); + Ordinal nCol = nRow; + std::vector mat_rowmap; + std::vector mat_colidx; + std::vector mat_val; + + // Create the entries + make_block_entries(mat_b1, blockSize, mat_rowmap, mat_colidx, + mat_val); + + // Create the CrsMatrix for the reference computation + crsMat_type Acrs("new_crs_matr", nRow, nCol, mat_val.size(), &mat_val[0], + &mat_rowmap[0], &mat_colidx[0]); + + x_vector_type xref("new_right_hand_side", nRow); + auto h_xref = Kokkos::create_mirror_view(xref); + for (Ordinal ir = 0; ir < nRow; ++ir) { + set_random_value(h_xref(ir)); + } + Kokkos::deep_copy(xref, h_xref); + + y_vector_type y0("y_init", nRow); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (Ordinal ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir)); + Kokkos::deep_copy(y0, h_y0); + + y_vector_type ycrs("crs_product_result", nRow); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + + // Time a series of multiplications with the CrsMatrix + double time_crs = 0.0; + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); + Kokkos::deep_copy(ycrs, h_ycrs); + Kokkos::Timer timer; + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + time_crs += timer.seconds(); + } + + // Create the output vector + y_vector_type yblockcrs("product_result", nRow); + auto h_yblockcrs = Kokkos::create_mirror_view(yblockcrs); + + double time_blockcrs = 0.0; + // Create the BlockCrsMatrix + KokkosSparse::Experimental::BlockCrsMatrix< + scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> + Ablockcrs(Acrs, blockSize); + + switch (static_cast(test)) { + default: + case Implementation::KokkosKernels: { + // Time a series of multiplications with the BlockCrsMatrix + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal ir = 0; ir < nRow; ++ir) h_yblockcrs(ir) = h_y0(ir); + Kokkos::deep_copy(yblockcrs, h_yblockcrs); + Kokkos::Timer timer; + KokkosSparse::spmv(fOp, alpha, Ablockcrs, xref, beta, yblockcrs); + time_blockcrs += timer.seconds(); + } + } break; +#ifdef HAVE_CUSPARSE + case Implementation::Cuda: { + // Time a series of multiplications with the BlockCrsMatrix + KokkosKernels::Experimental::Controls controls; + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal ir = 0; ir < nRow; ++ir) h_yblockcrs(ir) = h_y0(ir); + Kokkos::deep_copy(yblockcrs, h_yblockcrs); + Kokkos::Timer timer; + KokkosSparse::Impl::spmv_block_impl_cusparse( + controls, fOp, alpha, Ablockcrs, xref, beta, yblockcrs); + time_blockcrs += timer.seconds(); + } + break; +#endif +#ifdef HAVE_MKL + case Implementation::MKL: { + // Time a series of multiplications with the BlockCrsMatrix + KokkosKernels::Experimental::Controls controls; + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal ir = 0; ir < nRow; ++ir) h_yblockcrs(ir) = h_y0(ir); + Kokkos::deep_copy(yblockcrs, h_yblockcrs); + Kokkos::Timer timer; + KokkosSparse::Impl::spmv_block_mkl(controls, fOp, alpha, Ablockcrs, + xref, beta, yblockcrs); + time_blockcrs += timer.seconds(); + } + } break; +#endif + } + + // Check that the numerical result is matching + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_yblockcrs, yblockcrs); + double error = 0.0, maxNorm = 0.0; + for (int ir = 0; ir < h_ycrs.extent(0); ++ir) { + maxNorm = std::max( + maxNorm, std::abs(static_cast(h_ycrs(ir)))); + error = std::max(error, std::abs(static_cast( + h_ycrs(ir) - h_yblockcrs(ir)))); + } + + double tol = + (mat_val.size() / nRow) * std::numeric_limits::epsilon(); + if (error > tol * maxNorm) { + num_errors += 1; + std::cout << static_cast(test) << " "; + std::cout << fOp << ", " << blockSize << " : " + << " error " << error << " maxNorm " << maxNorm << " tol " + << tol << " tol * maxNorm " << tol * maxNorm << "\n"; + } + + //-- Print the number of Gflops for both products + if (blockSize == 1) { + printf( + "Op, blockSize: AvgGFlop(CrsMatrix) AvgGFlop(BlockCrsMatrix) \n"); + } + double num_flops = mat_val.size() * 2 * loop; + double crs_flop = (num_flops / time_crs) * 1.0e-09; + double blockcrs_flop = (num_flops / time_blockcrs) * 1.0e-09; + std::cout << fOp << ", " << blockSize << " : "; + if (crs_flop < blockcrs_flop) { + std::cout << crs_flop << " <" << blockcrs_flop << ">"; + } else { + std::cout << "<" << crs_flop << "> " << blockcrs_flop; + } + std::cout << std::endl; + + } // for (Ordinal blockSize = 1; blockSize < bMax; ++blockSize) + + return int(num_errors); + } + + template + int test_blockcrs_matrix_vec( + const char fOp[], + KokkosSparse::CrsMatrix + mat_b1, + int nvec, int test, const char *filename, int rows_per_thread, + int team_size, int vector_length, int schedule, int loop, + const scalar_t alpha, const scalar_t beta, const int bMax) { + typedef typename KokkosSparse::CrsMatrix< + scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> + crsMat_type; + + typedef Kokkos::View + block_vector_t; + + srand(17312837); + + int num_errors = 0; + const auto bMax_o = static_cast(bMax); + for (Ordinal blockSize = 1; blockSize <= bMax_o; ++blockSize) { + Ordinal nRow = blockSize * mat_b1.numRows(); + Ordinal nCol = nRow; + std::vector mat_rowmap; + std::vector mat_colidx; + std::vector mat_val; + + make_block_entries(mat_b1, blockSize, mat_rowmap, mat_colidx, + mat_val); + + // Create the CrsMatrix for the reference computation + crsMat_type Acrs("new_crs_matr", nRow, nCol, mat_val.size(), &mat_val[0], + &mat_rowmap[0], &mat_colidx[0]); + + block_vector_t xref("new_right_hand_side", nRow, nvec); + auto h_xref = Kokkos::create_mirror_view(xref); + for (Ordinal jc = 0; jc < nvec; ++jc) { + for (Ordinal ir = 0; ir < nRow; ++ir) { + set_random_value(h_xref(ir, jc)); + } + } + Kokkos::deep_copy(xref, h_xref); + + block_vector_t y0("y_init", nRow, nvec); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (Ordinal jc = 0; jc < nvec; ++jc) + for (Ordinal ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir, jc)); + Kokkos::deep_copy(y0, h_y0); + + block_vector_t ycrs("crs_product_result", nRow, nvec); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + + // Time a series of multiplications with the CrsMatrix format + double time_crs = 0.0; + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal jc = 0; jc < nvec; ++jc) + for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(ycrs, h_ycrs); + Kokkos::Timer timer; + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + time_crs += timer.seconds(); + } + + // Create the BlockCrsMatrix variable + KokkosSparse::Experimental::BlockCrsMatrix< + scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> + Ablockcrs(Acrs, blockSize); + + block_vector_t yblockcrs("blockcrs_product_result", nRow, nvec); + auto h_yblockcrs = Kokkos::create_mirror_view(yblockcrs); + + // Time a series of multiplications with the BlockCrsMatrix + double time_blockcrs = 0.0; + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal jc = 0; jc < nvec; ++jc) + for (Ordinal ir = 0; ir < nRow; ++ir) + h_yblockcrs(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(yblockcrs, h_yblockcrs); + Kokkos::Timer timer; + KokkosSparse::spmv(fOp, alpha, Ablockcrs, xref, beta, yblockcrs); + time_blockcrs += timer.seconds(); + } + + // Check that the result is matching + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_yblockcrs, yblockcrs); + double tol = + (mat_val.size() / nRow) * std::numeric_limits::epsilon(); + for (int jc = 0; jc < nvec; ++jc) { + double error = 0.0, maxNorm = 0.0; + for (int ir = 0; ir < h_ycrs.extent(0); ++ir) { + maxNorm = std::max( + maxNorm, std::abs(static_cast(h_ycrs(ir, jc)))); + error = std::max(error, + std::abs(static_cast( + h_ycrs(ir, jc) - h_yblockcrs(ir, jc)))); + } + if (error > tol * maxNorm) { + num_errors += 1; + std::cout << fOp << ", " << blockSize << " : rhs " << jc << " error " + << error << " maxNorm " << maxNorm << " tol " << tol + << " tol * maxNorm " << tol * maxNorm << "\n"; + } + } + + // Print the number of Gflops + if (blockSize == 1) { + printf( + "Op, blockSize: AvgGFlop(CrsMatrix) AvgGFlop(BlockCrsMatrix) \n"); + } + double num_flops = mat_val.size() * 2 * loop * nvec; + double crs_flop = (num_flops / time_crs) * 1.0e-09; + double blockcrs_flop = (num_flops / time_blockcrs) * 1.0e-09; + std::cout << fOp << ", " << blockSize << " "; + if (crs_flop < blockcrs_flop) { + // std::cout << crs_flop << " <" << blockcrs_flop << ">"; + std::cout << crs_flop << " " << blockcrs_flop << " "; + } else { + // std::cout << "<" << crs_flop << "> " << blockcrs_flop; + std::cout << " " << crs_flop << " " << blockcrs_flop; + } + std::cout << std::endl; + } + + return int(num_errors); + } + + void print_help() { + printf("BlockCrsMatrix SPMV benchmark code \n"); + printf("Options:\n"); + printf( + " -bs : Maximum blocksize for the sparse matrix (default " + "= " + "16). \n"); + printf(" -h : Help. \n"); + printf( + " -l [LOOP] : How many spmv to run to aggregate average time " + "(default = 512). \n"); + printf( + " -nx : Number of points in the x-direction (default = " + "32).\n"); + printf( + " The matrix will be of dimension nx (nx - 1) (nx + " + "1).\n"); + printf( + " -nv : Number of vectors to multiply with (default = 1). " + "\n"); + printf(" --op : Use different operation \n"); + printf(" Options: \n"); + printf( + " N = normal (default) y <- alpha A x + beta y\n"); + printf( + " C = conjugate y <- alpha conj(A) x + beta " + "y\n"); + printf( + " T = transpose y <- alpha A^T x + beta " + "y\n"); + printf( + " H = hermitian y <- alpha A^H x + beta " + "y\n"); + } +} + +int main(int argc, char **argv) { + int loop = 512; + int bMax = 16; + int nvec = 1; + int nx = 32; + + char fOp[] = "N"; + + char *filename = nullptr; + int rows_per_thread = -1; + int vector_length = -1; + int team_size = -1; + int test = static_cast(details::Implementation::KokkosKernels); + int schedule = 0; + + for (int i = 0; i < argc; i++) { + if ((strcmp(argv[i], "-bs") == 0)) { + int tmp = atoi(argv[++i]); + bMax = (tmp > 0) ? tmp : bMax; + continue; + } + + if ((strcmp(argv[i], "--tpl") == 0)) { + i++; + if ((strcmp(argv[i], "cuda") == 0)) + test = static_cast(details::Implementation::Cuda); + if ((strcmp(argv[i], "mkl") == 0)) + test = static_cast(details::Implementation::MKL); + continue; + } + + if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) { + details::print_help(); + return 0; + } + + if ((strcmp(argv[i], "-l") == 0)) { + int tmp = atoi(argv[++i]); + loop = (tmp > 0) ? tmp : loop; + continue; + } + + if ((strcmp(argv[i], "-nx") == 0)) { + int tmp = atoi(argv[++i]); + nx = (tmp > 0) ? tmp : nx; + continue; + } + + if ((strcmp(argv[i], "-nv") == 0)) { + int tmp = atoi(argv[++i]); + nvec = (tmp > 0) ? tmp : nvec; + continue; + } + + if ((strcmp(argv[i], "--op") == 0)) { + i++; + if ((strcmp(argv[i], "N") == 0)) strcpy(fOp, "N"); + if ((strcmp(argv[i], "C") == 0)) strcpy(fOp, "C"); + if ((strcmp(argv[i], "T") == 0)) strcpy(fOp, "T"); + if ((strcmp(argv[i], "H") == 0)) strcpy(fOp, "H"); + continue; + } + } + + Kokkos::initialize(argc, argv); + { + // The mat_structure view is used to generate a matrix using + // finite difference (FD) or finite element (FE) discretization + // on a cartesian grid. + Kokkos::View mat_structure( + "Matrix Structure", 3); + mat_structure(0, 0) = nx; // Request 8 grid point in 'x' direction + mat_structure(0, 1) = 0; // Add BC to the left + mat_structure(0, 2) = 0; // Add BC to the right + mat_structure(1, 0) = nx - 1; // Request 7 grid point in 'y' direction + mat_structure(1, 1) = 0; // Add BC to the bottom + mat_structure(1, 2) = 0; // Add BC to the top + mat_structure(2, 0) = nx + 1; // Request 9 grid point in 'z' direction + mat_structure(2, 1) = 0; // Add BC to the bottom + mat_structure(2, 2) = 0; // Add BC to the top + + typedef typename KokkosSparse::CrsMatrix + h_crsMat_type; + + h_crsMat_type mat_b1 = + Test::generate_structured_matrix3D("FD", mat_structure); + + int total_errors = 0; + + if (nvec == 1) + total_errors = details::test_blockcrs_matrix_single_vec( + fOp, mat_b1, test, filename, rows_per_thread, team_size, + vector_length, schedule, loop, details::Scalar(3.1), + details::Scalar(-2.4), bMax); + else + total_errors = details::test_blockcrs_matrix_vec( + fOp, mat_b1, nvec, test, filename, rows_per_thread, team_size, + vector_length, schedule, loop, details::Scalar(3.1), + details::Scalar(-2.4), bMax); + + if (total_errors != 0) { + printf("Kokkos::BlockCrsMatrix SpMV Test: Failed\n"); + } + } + Kokkos::finalize(); +} diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp new file mode 100644 index 0000000000..d65468ce96 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp @@ -0,0 +1,573 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace details { + +enum class Implementation : int { KokkosKernels = 0, Cuda = 1, MKL = 2 }; + +/// +/// Define default types +/// +typedef double Scalar; +typedef int Ordinal; +/// +////////////////////////// + +/// Random generator +template +inline scalar_t random() { + auto const max = static_cast(RAND_MAX) + static_cast(1); + return static_cast(std::rand()) / max; +} + +template +inline void set_random_value(scalar_t &v) { + v = random(); +} + +template +inline void set_random_value(Kokkos::complex &v) { + Scalar vre = random(); + Scalar vim = random(); + v = Kokkos::complex(vre, vim); +} + +template +inline void set_random_value(std::complex &v) { + scalar_t vre = random(); + scalar_t vim = random(); + v = std::complex(vre, vim); +} + +template +void make_block_entries( + const KokkosSparse::CrsMatrix &mat_b1, + int blockSize, std::vector &mat_rowmap, + std::vector &mat_colidx, std::vector &mat_val) { + Ordinal nRow = blockSize * mat_b1.numRows(); + Ordinal nCol = blockSize * mat_b1.numCols(); + size_t nnz = static_cast(blockSize) * static_cast(blockSize) * + mat_b1.nnz(); + + mat_val.resize(nnz); + for (size_t ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); + + // + // Create graph for CrsMatrix + // + + mat_rowmap.assign(nRow + 1, 0); + mat_colidx.assign(nnz, 0); + + for (Ordinal ir = 0; ir < mat_b1.numRows(); ++ir) { + const auto jbeg = mat_b1.graph.row_map(ir); + const auto jend = mat_b1.graph.row_map(ir + 1); + for (Ordinal ib = 0; ib < blockSize; ++ib) { + const Ordinal my_row = ir * blockSize + ib; + mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; + for (Ordinal ijk = jbeg; ijk < jend; ++ijk) { + const auto col0 = mat_b1.graph.entries(ijk); + for (Ordinal jb = 0; jb < blockSize; ++jb) { + mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + col0 * blockSize + jb; + } + } + } + } // for (Ordinal ir = 0; ir < mat_b1.numRows(); ++ir) +} + +template +int test_bsr_matrix_single_vec( + const char fOp[], + KokkosSparse::CrsMatrix + mat_b1, + int test, const char *filename, int rows_per_thread, int team_size, + int vector_length, int schedule, int loop, const scalar_t alpha, + const scalar_t beta, const int bMax) { + typedef + typename KokkosSparse::CrsMatrix + crsMat_type; + + typedef Kokkos::View + x_vector_type; + typedef Kokkos::View + y_vector_type; + + srand(17312837); + + int num_errors = 0; + const auto bMax_o = static_cast(bMax); + for (Ordinal blockSize = 1; blockSize <= bMax_o; ++blockSize) { + Ordinal nRow = blockSize * mat_b1.numRows(); + Ordinal nCol = nRow; + std::vector mat_rowmap; + std::vector mat_colidx; + std::vector mat_val; + + // Create the entries + make_block_entries(mat_b1, blockSize, mat_rowmap, mat_colidx, + mat_val); + + // Create the CrsMatrix for the reference computation + crsMat_type Acrs("new_crs_matr", nRow, nCol, mat_val.size(), &mat_val[0], + &mat_rowmap[0], &mat_colidx[0]); + + x_vector_type xref("new_right_hand_side", nRow); + auto h_xref = Kokkos::create_mirror_view(xref); + for (Ordinal ir = 0; ir < nRow; ++ir) { + set_random_value(h_xref(ir)); + } + Kokkos::deep_copy(xref, h_xref); + + y_vector_type y0("y_init", nRow); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (Ordinal ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir)); + Kokkos::deep_copy(y0, h_y0); + + y_vector_type ycrs("crs_product_result", nRow); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + + // Time a series of multiplications with the CrsMatrix + double time_crs = 0.0; + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); + Kokkos::deep_copy(ycrs, h_ycrs); + Kokkos::Timer timer; + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + time_crs += timer.seconds(); + } + + // Create the output vector + y_vector_type ybsr("product_result", nRow); + auto h_ybsr = Kokkos::create_mirror_view(ybsr); + + double time_bsr = 0.0; + // Create the BsrMatrix + KokkosSparse::Experimental::BsrMatrix< + scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, int> + Absr(Acrs, blockSize); + + KokkosKernels::Experimental::Controls controls; + switch (static_cast(test)) { + case Implementation::KokkosKernels: { + controls.setParameter("algorithm", "native"); + } break; + default: break; + } + + // Time a series of multiplications with the BsrMatrix + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); + Kokkos::deep_copy(ybsr, h_ybsr); + Kokkos::Timer timer; + KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + time_bsr += timer.seconds(); + } + + // Check that the numerical result is matching + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_ybsr, ybsr); + double error = 0.0, maxNorm = 0.0; + for (int ir = 0; ir < h_ycrs.extent(0); ++ir) { + maxNorm = std::max( + maxNorm, std::abs(static_cast(h_ycrs(ir)))); + error = std::max( + error, + std::abs(static_cast(h_ycrs(ir) - h_ybsr(ir)))); + } + + double tol = + (mat_val.size() / nRow) * std::numeric_limits::epsilon(); + if (error > tol * maxNorm) { + num_errors += 1; + std::cout << static_cast(test) << " "; + std::cout << fOp << ", " << blockSize << " : " + << " error " << error << " maxNorm " << maxNorm << " tol " + << tol << " tol * maxNorm " << tol * maxNorm << "\n"; + } + + //-- Print the number of Gflops for both products + if (blockSize == 1) { + printf("Op, blockSize: AvgGFlop(CrsMatrix) AvgGFlop(BsrMatrix) \n"); + } + double num_flops = mat_val.size() * 2 * loop; + double crs_flop = (num_flops / time_crs) * 1.0e-09; + double bsr_flop = (num_flops / time_bsr) * 1.0e-09; + std::cout << fOp << ", " << blockSize << " : "; + if (crs_flop < bsr_flop) { + std::cout << crs_flop << " <" << bsr_flop << ">"; + } else { + std::cout << "<" << crs_flop << "> " << bsr_flop; + } + std::cout << std::endl; + + } // for (Ordinal blockSize = 1; blockSize < bMax; ++blockSize) + + return int(num_errors); +} + +template +int test_bsr_matrix_vec( + const char fOp[], + KokkosSparse::CrsMatrix + mat_b1, + int nvec, int test, const char *filename, int rows_per_thread, + int team_size, int vector_length, int schedule, int loop, + const scalar_t alpha, const scalar_t beta, const int bMax) { + typedef + typename KokkosSparse::CrsMatrix + crsMat_type; + + typedef Kokkos::View + block_vector_t; + + srand(17312837); + + int num_errors = 0; + const auto bMax_o = static_cast(bMax); + for (Ordinal blockSize = 1; blockSize <= bMax_o; ++blockSize) { + Ordinal nRow = blockSize * mat_b1.numRows(); + Ordinal nCol = nRow; + std::vector mat_rowmap; + std::vector mat_colidx; + std::vector mat_val; + + make_block_entries(mat_b1, blockSize, mat_rowmap, mat_colidx, + mat_val); + + // Create the CrsMatrix for the reference computation + crsMat_type Acrs("new_crs_matr", nRow, nCol, mat_val.size(), &mat_val[0], + &mat_rowmap[0], &mat_colidx[0]); + + block_vector_t xref("new_right_hand_side", nRow, nvec); + auto h_xref = Kokkos::create_mirror_view(xref); + for (Ordinal jc = 0; jc < nvec; ++jc) { + for (Ordinal ir = 0; ir < nRow; ++ir) { + set_random_value(h_xref(ir, jc)); + } + } + Kokkos::deep_copy(xref, h_xref); + + block_vector_t y0("y_init", nRow, nvec); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (Ordinal jc = 0; jc < nvec; ++jc) + for (Ordinal ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir, jc)); + Kokkos::deep_copy(y0, h_y0); + + block_vector_t ycrs("crs_product_result", nRow, nvec); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + + // Time a series of multiplications with the CrsMatrix format + double time_crs = 0.0; + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal jc = 0; jc < nvec; ++jc) + for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(ycrs, h_ycrs); + Kokkos::Timer timer; + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + time_crs += timer.seconds(); + } + + // Create the BsrMatrix variable + KokkosSparse::Experimental::BsrMatrix< + scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, int> + Absr(Acrs, blockSize); + + block_vector_t ybsr("bsr_product_result", nRow, nvec); + auto h_ybsr = Kokkos::create_mirror_view(ybsr); + + // Time a series of multiplications with the BsrMatrix + double time_bsr = 0.0; + KokkosKernels::Experimental::Controls controls; + switch (static_cast(test)) { + case Implementation::KokkosKernels: { + controls.setParameter("algorithm", "native"); + } break; + default: break; + } + + // Time a series of multiplications with the BsrMatrix + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal jc = 0; jc < nvec; ++jc) + for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(ybsr, h_ybsr); + Kokkos::Timer timer; + KokkosSparse::spmv(controls, fOp, alpha, Absr, xref, beta, ybsr); + time_bsr += timer.seconds(); + } + + // Check that the result is matching + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_ybsr, ybsr); + double tol = + (mat_val.size() / nRow) * std::numeric_limits::epsilon(); + for (int jc = 0; jc < nvec; ++jc) { + double error = 0.0, maxNorm = 0.0; + for (int ir = 0; ir < h_ycrs.extent(0); ++ir) { + maxNorm = std::max( + maxNorm, std::abs(static_cast(h_ycrs(ir, jc)))); + error = std::max(error, std::abs(static_cast( + h_ycrs(ir, jc) - h_ybsr(ir, jc)))); + } + if (error > tol * maxNorm) { + num_errors += 1; + std::cout << fOp << ", " << blockSize << " : rhs " << jc << " error " + << error << " maxNorm " << maxNorm << " tol " << tol + << " tol * maxNorm " << tol * maxNorm << "\n"; + } + } + + // Print the number of Gflops + if (blockSize == 1) { + printf("Op, blockSize: AvgGFlop(CrsMatrix) "); + switch (static_cast(test)) { + default: + case Implementation::KokkosKernels: + printf(" AvgGFlop(BsrMatrix - KokkosKernels) \n"); + break; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + case Implementation::Cuda: + printf(" AvgGFlop(BsrMatrix - CUSPARSE) \n"); + break; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + case Implementation::MKL: + printf(" AvgGFlop(BsrMatrix - MKL) \n"); + break; +#endif + } + } + double num_flops = mat_val.size() * 2 * loop * nvec; + double crs_flop = (num_flops / time_crs) * 1.0e-09; + double bsr_flop = (num_flops / time_bsr) * 1.0e-09; + std::cout << fOp << ", " << blockSize << " "; + if (crs_flop < bsr_flop) { + // std::cout << crs_flop << " <" << bsr_flop << ">"; + std::cout << crs_flop << " " << bsr_flop << " "; + } else { + // std::cout << "<" << crs_flop << "> " << bsr_flop; + std::cout << " " << crs_flop << " " << bsr_flop; + } + std::cout << std::endl; + } + + return int(num_errors); +} + +void print_help() { + printf("BsrMatrix SPMV benchmark code \n"); + printf("Options:\n"); + printf( + " -bs : Maximum blocksize for the sparse matrix (default " + "= " + "16). \n"); + printf(" -h : Help. \n"); + printf( + " -l [LOOP] : How many spmv to run to aggregate average time " + "(default = 512). \n"); + printf( + " -nx : Number of points in the x-direction (default = " + "32).\n"); + printf( + " The matrix will be of dimension nx (nx - 1) (nx + " + "1).\n"); + printf( + " -nv : Number of vectors to multiply with (default = 1). " + "\n"); + printf(" --op : Use different operation \n"); + printf(" Options: \n"); + printf(" N = normal (default) y <- alpha A x + beta y\n"); + printf( + " C = conjugate y <- alpha conj(A) x + beta " + "y\n"); + printf( + " T = transpose y <- alpha A^T x + beta " + "y\n"); + printf( + " H = hermitian y <- alpha A^H x + beta " + "y\n"); +} +} // namespace details + +int main(int argc, char **argv) { + int loop = 512; + int bMax = 16; + int nvec = 1; + int nx = 32; + + char fOp[] = "N"; + + char *filename = nullptr; + int rows_per_thread = -1; + int vector_length = -1; + int team_size = -1; + int test = static_cast(details::Implementation::KokkosKernels); + int schedule = 0; + + for (int i = 0; i < argc; i++) { + std::cout << " i " << i << " argv " << argv[i] << "\n"; + + if ((strcmp(argv[i], "-bs") == 0)) { + int tmp = atoi(argv[++i]); + bMax = (tmp > 0) ? tmp : bMax; + continue; + } + + if ((strcmp(argv[i], "--tpl") == 0)) { + std::cout << argv[i] << "\n"; + i++; + std::cout << argv[i] << "\n"; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if ((strcmp(argv[i], "cuda") == 0)) + test = static_cast(details::Implementation::Cuda); +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if ((strcmp(argv[i], "mkl") == 0)) + test = static_cast(details::Implementation::MKL); +#endif + std::cout << test << "\n"; + continue; + } + + if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) { + details::print_help(); + return 0; + } + + if ((strcmp(argv[i], "-l") == 0)) { + int tmp = atoi(argv[++i]); + loop = (tmp > 0) ? tmp : loop; + continue; + } + + if ((strcmp(argv[i], "-nx") == 0)) { + int tmp = atoi(argv[++i]); + nx = (tmp > 0) ? tmp : nx; + continue; + } + + if ((strcmp(argv[i], "-nv") == 0)) { + int tmp = atoi(argv[++i]); + nvec = (tmp > 0) ? tmp : nvec; + continue; + } + + if ((strcmp(argv[i], "--op") == 0)) { + i++; + if ((strcmp(argv[i], "N") == 0)) strcpy(fOp, "N"); + if ((strcmp(argv[i], "C") == 0)) strcpy(fOp, "C"); + if ((strcmp(argv[i], "T") == 0)) strcpy(fOp, "T"); + if ((strcmp(argv[i], "H") == 0)) strcpy(fOp, "H"); + continue; + } + } + + Kokkos::initialize(argc, argv); + + { + // The mat_structure view is used to generate a matrix using + // finite difference (FD) or finite element (FE) discretization + // on a cartesian grid. + Kokkos::View mat_structure( + "Matrix Structure", 3); + mat_structure(0, 0) = nx; // Request 8 grid point in 'x' direction + mat_structure(0, 1) = 0; // Add BC to the left + mat_structure(0, 2) = 0; // Add BC to the right + mat_structure(1, 0) = nx - 1; // Request 7 grid point in 'y' direction + mat_structure(1, 1) = 0; // Add BC to the bottom + mat_structure(1, 2) = 0; // Add BC to the top + mat_structure(2, 0) = nx + 1; // Request 9 grid point in 'z' direction + mat_structure(2, 1) = 0; // Add BC to the bottom + mat_structure(2, 2) = 0; // Add BC to the top + + typedef typename KokkosSparse::CrsMatrix + h_crsMat_type; + + h_crsMat_type mat_b1 = + Test::generate_structured_matrix3D("FD", mat_structure); + + int total_errors = 0; + + if (nvec == 1) + total_errors = details::test_bsr_matrix_single_vec( + fOp, mat_b1, test, filename, rows_per_thread, team_size, + vector_length, schedule, loop, details::Scalar(3.1), + details::Scalar(-2.4), bMax); + else + total_errors = details::test_bsr_matrix_vec( + fOp, mat_b1, nvec, test, filename, rows_per_thread, team_size, + vector_length, schedule, loop, details::Scalar(3.1), + details::Scalar(-2.4), bMax); + + if (total_errors != 0) { + printf("Kokkos::BsrMatrix SpMV Test: Failed\n"); + } + } + + Kokkos::finalize(); +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index ae830c9a89..3648939585 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -304,6 +304,20 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv_struct spmv TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_blockcrsmatrix spmv + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + +KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv_blockcrsmatrix spmv + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_bsrmatrix spmv COMPONENTS sparse HEADER_LIST ETI_HEADERS @@ -311,6 +325,13 @@ KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_bsrmatrix spmv TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES ) +KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv_mv_bsrmatrix spmv + COMPONENTS sparse + HEADER_LIST ETI_HEADERS + SOURCE_LIST SOURCES + TYPE_LISTS FLOATS ORDINALS OFFSETS LAYOUTS DEVICES +) + KOKKOSKERNELS_GENERATE_ETI(Sparse_spmv spmv COMPONENTS sparse HEADER_LIST ETI_HEADERS diff --git a/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_blockcrsmatrix_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_blockcrsmatrix_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..1bb85d6067 --- /dev/null +++ b/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_blockcrsmatrix_eti_spec_inst.cpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosSparse_spmv_blockcrsmatrix_spec.hpp" + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_BLOCKCRSMATRIX_ETI_INST_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse diff --git a/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..ae672bc04a --- /dev/null +++ b/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_inst.cpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosSparse_spmv_blockcrsmatrix_spec.hpp" + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_INST_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse diff --git a/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in b/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in new file mode 100644 index 0000000000..5f2c437627 --- /dev/null +++ b/src/impl/generated_specializations_cpp/spmv/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_inst.cpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#define KOKKOSKERNELS_IMPL_COMPILE_LIBRARY true +#include "KokkosKernels_config.h" +#include "KokkosSparse_spmv_bsrmatrix_spec.hpp" + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_MV_BSRMATRIX_ETI_INST_BLOCK@ +/// // clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spmv_blockcrsmatrix_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_blockcrsmatrix_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..1ce97a5795 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_blockcrsmatrix_eti_spec_avail.hpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_ETI_SPEC_AVAIL_HPP_ + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_BLOCKCRSMATRIX_ETI_AVAIL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spmv_blockcrsmatrix_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_blockcrsmatrix_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..9ad333ccfd --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_blockcrsmatrix_eti_spec_decl.hpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_ETI_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_BLOCKCRSMATRIX_ETI_DECL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in index 13210dc835..be5a45d793 100644 --- a/src/impl/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_bsrmatrix_eti_spec_avail.hpp.in @@ -1,5 +1,5 @@ -#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ -#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ /* //@HEADER // ************************************************************************ @@ -52,4 +52,4 @@ namespace Impl { } // namespace Impl } // namespace Experimental } // namespace KokkosSparse -#endif \ No newline at end of file +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..85b72e3b7b --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_avail.hpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSSPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_SPEC_AVAIL_HPP_ + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_AVAIL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..c0b77c54f2 --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_blockcrsmatrix_eti_spec_decl.hpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSSPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_DECL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in new file mode 100644 index 0000000000..b3e9fb662a --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_avail.hpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL_HPP_ + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_MV_BSRMATRIX_ETI_AVAIL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse +#endif diff --git a/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in new file mode 100644 index 0000000000..c49e565f7b --- /dev/null +++ b/src/impl/generated_specializations_hpp/KokkosSparse_spmv_mv_bsrmatrix_eti_spec_decl.hpp.in @@ -0,0 +1,56 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL_HPP_ + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// clang-format off +@SPARSE_SPMV_MV_BSRMATRIX_ETI_DECL_BLOCK@ +// clang-format on +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse +#endif \ No newline at end of file diff --git a/src/impl/tpls/KokkosSparse_spmv_blockcrsmatrix_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_blockcrsmatrix_tpl_spec_avail.hpp new file mode 100644 index 0000000000..e7ac862f22 --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spmv_blockcrsmatrix_tpl_spec_avail.hpp @@ -0,0 +1,70 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPMV_BLOCKCRSMATRIX_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSPARSE_SPMV_BLOCKCRSMATRIX_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { + +// Specialization struct which defines whether a specialization exists +template +struct spmv_blockcrsmatrix_tpl_spec_avail { + enum : bool { value = false }; +}; + +// Specialization struct which defines whether a specialization exists +template +struct spmv_mv_blockcrsmatrix_tpl_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#endif // KOKKOSPARSE_SPMV_BLOCKCRSMATRIX_TPL_SPEC_AVAIL_HPP_ diff --git a/src/sparse/impl/KokkosSparse_BsrMatrix_impl.hpp b/src/impl/tpls/KokkosSparse_spmv_blockcrsmatrix_tpl_spec_decl.hpp similarity index 82% rename from src/sparse/impl/KokkosSparse_BsrMatrix_impl.hpp rename to src/impl/tpls/KokkosSparse_spmv_blockcrsmatrix_tpl_spec_decl.hpp index 77f88a99df..d5e9aad5be 100644 --- a/src/sparse/impl/KokkosSparse_BsrMatrix_impl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_blockcrsmatrix_tpl_spec_decl.hpp @@ -36,23 +36,13 @@ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // -// Questions? Contact Luc Berger-Vergiat (lberge@sandia.gov) +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) // // ************************************************************************ //@HEADER */ -#ifndef KOKKOS_SPARSE_BSRMATRIX_IMPL_HPP_ -#define KOKKOS_SPARSE_BSRMATRIX_IMPL_HPP_ +#ifndef KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_TPL_SPEC_DECL_HPP +#define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_TPL_SPEC_DECL_HPP -#include -#include -#include -#include - -#include "Kokkos_Core.hpp" -#include "Kokkos_StaticCrsGraph.hpp" -#include "Kokkos_ArithTraits.hpp" -#include "KokkosSparse_CrsMatrix.hpp" - -#endif // KOKKOS_SPARSE_BSRMATRIX_IMPL_HPP_ +#endif // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_TPL_SPEC_DECL_HPP diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp new file mode 100644 index 0000000000..cd8287b38e --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -0,0 +1,327 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ +#define KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +// Specialization struct which defines whether a specialization exists +template +struct spmv_bsrmatrix_tpl_spec_avail { + enum : bool { value = false }; +}; + +// cuSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +// These versions of cuSPARSE require the ordinal and offset types to be the +// same. For KokkosKernels, this means int/int only. + +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ + SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ + XL, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + YL, Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +#if (9000 <= CUDA_VERSION) + +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, + int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, + int, Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, + int, Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, + int, Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) + +#endif // CUDA/CUSPARSE >= 9.0? +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int, const SCALAR*, \ + Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(float, Kokkos::Serial) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(double, Kokkos::Serial) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, + Kokkos::Serial) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, + Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(float, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(double, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, + Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, + Kokkos::OpenMP) +#endif + +#endif + +// Specialization struct which defines whether a specialization exists +template ::type>::value> +struct spmv_mv_bsrmatrix_tpl_spec_avail { + enum : bool { value = false }; +}; + +// cuSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +// These versions of cuSPARSE require the ordinal and offset types to be the +// same. For KokkosKernels, this means int/int only. + +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ + SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ + XL, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + YL, Kokkos::Device, \ + Kokkos::MemoryTraits, true> { \ + enum : bool { value = true }; \ + }; + +#if (9000 <= CUDA_VERSION) + +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutLeft, + Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, + int, int, + Kokkos::LayoutRight, + Kokkos::LayoutRight, + Kokkos::CudaUVMSpace) + +#endif // CUDA/CUSPARSE >= 9.0? +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(SCALAR, EXECSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const int, Kokkos::Device, \ + Kokkos::MemoryTraits, const int, const SCALAR*, \ + Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, true> { \ + enum : bool { value = true }; \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(float, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(double, Kokkos::Serial) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, + Kokkos::Serial) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, + Kokkos::Serial) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(float, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(double, Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, + Kokkos::OpenMP) +KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_MKL(Kokkos::complex, + Kokkos::OpenMP) +#endif + +#endif + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#endif // KOKKOSPARSE_SPMV_BSRMATRIX_TPL_SPEC_AVAIL_HPP_ diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp new file mode 100644 index 0000000000..35c78b4b8d --- /dev/null +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -0,0 +1,865 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP + +#include "KokkosKernels_Controls.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { + +#if (__INTEL_MKL__ > 2017) +// MKL 2018 and above: use new interface: sparse_matrix_t and mkl_sparse_?_mv() + +namespace BSR { +inline void mkl_safe_call(int errcode) { + if (errcode != SPARSE_STATUS_SUCCESS) + throw std::runtime_error("MKL returned non-success error code"); +} + +inline sparse_operation_t mode_kk_to_mkl(char mode_kk) { + switch (toupper(mode_kk)) { + case 'N': return SPARSE_OPERATION_NON_TRANSPOSE; + case 'T': return SPARSE_OPERATION_TRANSPOSE; + case 'H': return SPARSE_OPERATION_CONJUGATE_TRANSPOSE; + default:; + } + throw std::invalid_argument( + "Invalid mode for MKL (should be one of N, T, H)"); +} +} // namespace BSR + +using BSR::mkl_safe_call; +using BSR::mode_kk_to_mkl; + +inline matrix_descr getDescription() { + matrix_descr A_descr; + A_descr.type = SPARSE_MATRIX_TYPE_GENERAL; + A_descr.mode = SPARSE_FILL_MODE_FULL; + A_descr.diag = SPARSE_DIAG_NON_UNIT; + return A_descr; +} + +inline void spmv_block_impl_mkl(sparse_operation_t op, float alpha, float beta, + int m, int n, int b, const int* Arowptrs, + const int* Aentries, const float* Avalues, + const float* x, float* y) { + sparse_matrix_t A_mkl; + mkl_safe_call(mkl_sparse_s_create_bsr( + &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + + matrix_descr A_descr = getDescription(); + mkl_safe_call(mkl_sparse_s_mv(op, alpha, A_mkl, A_descr, x, beta, y)); +} + +inline void spmv_block_impl_mkl(sparse_operation_t op, double alpha, + double beta, int m, int n, int b, + const int* Arowptrs, const int* Aentries, + const double* Avalues, const double* x, + double* y) { + sparse_matrix_t A_mkl; + mkl_safe_call(mkl_sparse_d_create_bsr( + &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + + matrix_descr A_descr = getDescription(); + mkl_safe_call(mkl_sparse_d_mv(op, alpha, A_mkl, A_descr, x, beta, y)); +} + +inline void spmv_block_impl_mkl(sparse_operation_t op, + Kokkos::complex alpha, + Kokkos::complex beta, int m, int n, + int b, const int* Arowptrs, const int* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, + Kokkos::complex* y) { + sparse_matrix_t A_mkl; + mkl_safe_call(mkl_sparse_c_create_bsr( + &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*)Avalues)); + + MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex8& beta_mkl = reinterpret_cast(beta); + matrix_descr A_descr = getDescription(); + mkl_safe_call(mkl_sparse_c_mv(op, alpha_mkl, A_mkl, A_descr, + reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); +} + +inline void spmv_block_impl_mkl(sparse_operation_t op, + Kokkos::complex alpha, + Kokkos::complex beta, int m, int n, + int b, const int* Arowptrs, const int* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, + Kokkos::complex* y) { + sparse_matrix_t A_mkl; + mkl_safe_call(mkl_sparse_z_create_bsr( + &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*)Avalues)); + + matrix_descr A_descr = getDescription(); + MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex16& beta_mkl = reinterpret_cast(beta); + mkl_safe_call(mkl_sparse_z_mv(op, alpha_mkl, A_mkl, A_descr, + reinterpret_cast(x), + beta_mkl, reinterpret_cast(y))); +} + +inline void spm_mv_block_impl_mkl(sparse_operation_t op, float alpha, + float beta, int m, int n, int b, + const int* Arowptrs, const int* Aentries, + const float* Avalues, const float* x, + int colx, int ldx, float* y, int ldy) { + sparse_matrix_t A_mkl; + mkl_safe_call(mkl_sparse_s_create_bsr( + &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + + matrix_descr A_descr = getDescription(); + mkl_safe_call(mkl_sparse_s_mm(op, alpha, A_mkl, A_descr, + SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y, + ldy)); +} + +inline void spm_mv_block_impl_mkl(sparse_operation_t op, double alpha, + double beta, int m, int n, int b, + const int* Arowptrs, const int* Aentries, + const double* Avalues, const double* x, + int colx, int ldx, double* y, int ldy) { + sparse_matrix_t A_mkl; + mkl_safe_call(mkl_sparse_d_create_bsr( + &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), const_cast(Avalues))); + + matrix_descr A_descr = getDescription(); + mkl_safe_call(mkl_sparse_d_mm(op, alpha, A_mkl, A_descr, + SPARSE_LAYOUT_ROW_MAJOR, x, colx, ldx, beta, y, + ldy)); +} + +inline void spm_mv_block_impl_mkl(sparse_operation_t op, + Kokkos::complex alpha, + Kokkos::complex beta, int m, int n, + int b, const int* Arowptrs, + const int* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, int colx, + int ldx, Kokkos::complex* y, int ldy) { + sparse_matrix_t A_mkl; + mkl_safe_call(mkl_sparse_c_create_bsr( + &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex8*)Avalues)); + + MKL_Complex8& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex8& beta_mkl = reinterpret_cast(beta); + matrix_descr A_descr = getDescription(); + mkl_safe_call( + mkl_sparse_c_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, + reinterpret_cast(x), colx, ldx, + beta_mkl, reinterpret_cast(y), ldy)); +} + +inline void spm_mv_block_impl_mkl( + sparse_operation_t op, Kokkos::complex alpha, + Kokkos::complex beta, int m, int n, int b, const int* Arowptrs, + const int* Aentries, const Kokkos::complex* Avalues, + const Kokkos::complex* x, int colx, int ldx, + Kokkos::complex* y, int ldy) { + sparse_matrix_t A_mkl; + mkl_safe_call(mkl_sparse_z_create_bsr( + &A_mkl, SPARSE_INDEX_BASE_ZERO, SPARSE_LAYOUT_ROW_MAJOR, m, n, b, + const_cast(Arowptrs), const_cast(Arowptrs + 1), + const_cast(Aentries), (MKL_Complex16*)Avalues)); + + matrix_descr A_descr = getDescription(); + MKL_Complex16& alpha_mkl = reinterpret_cast(alpha); + MKL_Complex16& beta_mkl = reinterpret_cast(beta); + mkl_safe_call( + mkl_sparse_z_mm(op, alpha_mkl, A_mkl, A_descr, SPARSE_LAYOUT_ROW_MAJOR, + reinterpret_cast(x), colx, ldx, + beta_mkl, reinterpret_cast(y), ldy)); +} + +#endif + +#if (__INTEL_MKL__ == 2017) + +inline void spmv_block_impl_mkl(char mode, float alpha, float beta, int m, + int n, int b, const int* Arowptrs, + const int* Aentries, const float* Avalues, + const float* x, float* y) { + mkl_sbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, + Arowptrs + 1, x, &beta, y); +} + +inline void spmv_block_impl_mkl(char mode, double alpha, double beta, int m, + int n, int b, const int* Arowptrs, + const int* Aentries, const double* Avalues, + const double* x, double* y) { + mkl_dbsrmv(&mode, &m, &n, &b, &alpha, "G**C", Avalues, Aentries, Arowptrs, + Arowptrs + 1, x, &beta, y); +} + +inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, + Kokkos::complex beta, int m, int n, + int b, const int* Arowptrs, const int* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, + Kokkos::complex* y) { + const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); + const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex8* Avalues_mkl = + reinterpret_cast(Avalues); + const MKL_Complex8* x_mkl = reinterpret_cast(x); + MKL_Complex8* y_mkl = reinterpret_cast(y); + mkl_cbsrmv(&mode, &m, &n, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, + Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); +} + +inline void spmv_block_impl_mkl(char mode, Kokkos::complex alpha, + Kokkos::complex beta, int m, int n, + int b, const int* Arowptrs, const int* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, + Kokkos::complex* y) { + const MKL_Complex16* alpha_mkl = + reinterpret_cast(&alpha); + const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex16* Avalues_mkl = + reinterpret_cast(Avalues); + const MKL_Complex16* x_mkl = reinterpret_cast(x); + MKL_Complex16* y_mkl = reinterpret_cast(y); + mkl_zbsrmv(&mode, &m, &n, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, + Arowptrs, Arowptrs + 1, x_mkl, beta_mkl, y_mkl); +} + +inline void spm_mv_block_impl_mkl(char mode, float alpha, float beta, int m, + int n, int b, const int* Arowptrs, + const int* Aentries, const float* Avalues, + const float* x, int colx, int ldx, float* y, + int ldy) { + mkl_sbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, + Arowptrs, Arowptrs + 1, x, &beta, y); +} + +inline void spm_mv_block_impl_mkl(char mode, double alpha, double beta, int m, + int n, int b, const int* Arowptrs, + const int* Aentries, const double* Avalues, + const double* x, int colx, int ldx, double* y, + int ldy) { + mkl_dbsrmm(&mode, &m, &n, &colx, &b, &alpha, "G**C", Avalues, Aentries, + Arowptrs, Arowptrs + 1, x, ldx, &beta, y, ldy); +} + +inline void spm_mv_block_impl_mkl(char mode, Kokkos::complex alpha, + Kokkos::complex beta, int m, int n, + int b, const int* Arowptrs, + const int* Aentries, + const Kokkos::complex* Avalues, + const Kokkos::complex* x, int colx, + int ldx, Kokkos::complex* y, int ldy) { + const MKL_Complex8* alpha_mkl = reinterpret_cast(&alpha); + const MKL_Complex8* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex8* Avalues_mkl = + reinterpret_cast(Avalues); + const MKL_Complex8* x_mkl = reinterpret_cast(x); + MKL_Complex8* y_mkl = reinterpret_cast(y); + mkl_cbsrmv(&mode, &m, &n, &colx, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, + Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); +} + +inline void spm_mv_block_impl_mkl( + char mode, Kokkos::complex alpha, Kokkos::complex beta, + int m, int n, int b, const int* Arowptrs, const int* Aentries, + const Kokkos::complex* Avalues, const Kokkos::complex* x, + int colx, int ldx, Kokkos::complex* y, int ldy) { + const MKL_Complex16* alpha_mkl = + reinterpret_cast(&alpha); + const MKL_Complex16* beta_mkl = reinterpret_cast(&beta); + const MKL_Complex16* Avalues_mkl = + reinterpret_cast(Avalues); + const MKL_Complex16* x_mkl = reinterpret_cast(x); + MKL_Complex16* y_mkl = reinterpret_cast(y); + mkl_zbsrmv(&mode, &m, &n, &colx, &b, alpha_mkl, "G**C", Avalues_mkl, Aentries, + Arowptrs, Arowptrs + 1, x_mkl, ldx, beta_mkl, y_mkl, ldy); +} + +#endif + +#define KOKKOSSPARSE_SPMV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + SCALAR const, int const, Kokkos::Device, \ + Kokkos::MemoryTraits, int const, SCALAR const*, \ + Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using AMatrix = \ + BsrMatrix, int const>; \ + using XVector = Kokkos::View< \ + SCALAR const*, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv_bsrmatrix( \ + const KokkosKernels::Experimental::Controls& controls, \ + const char mode[], const YScalar& alpha, const AMatrix& A, \ + const XVector& X, const YScalar& beta, const YVector& Y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ + A.numCols(), A.blockDim(), A.graph.row_map.data(), \ + A.graph.entries.data(), A.values.data(), x.data(), \ + y.data()); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSSPARSE_SPMV_MKL(float, Kokkos::Serial, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(double, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSSPARSE_SPMV_MKL(float, Kokkos::OpenMP, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(double, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#endif + +#undef KOKKOSSPARSE_SPMV_MKL + +#define KOKKOSSPARSE_SPMV_MV_MKL(SCALAR, EXECSPACE, COMPILE_LIBRARY) \ + template <> \ + struct SPMV_MV_BSRMATRIX< \ + SCALAR const, int const, Kokkos::Device, \ + Kokkos::MemoryTraits, int const, SCALAR const**, \ + Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR**, Kokkos::LayoutLeft, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using AMatrix = \ + BsrMatrix, int const>; \ + using XVector = Kokkos::View< \ + SCALAR const**, Kokkos::LayoutLeft, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = Kokkos::View>; \ + using coefficient_type = typename YVector::non_const_value_type; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + static void spmv_mv_bsrmatrix( \ + const KokkosKernels::Experimental::Controls& controls, \ + const char mode[], const YScalar& alpha, const AMatrix& A, \ + const XVector& X, const YScalar& beta, const YVector& Y) { \ + std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + int colx = static_cast(x.extent(1)); \ + int ldx = static_cast(x.stride_1()); \ + int ldy = static_cast(y.stride_1()); \ + spm_mv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ + A.numCols(), A.blockDim(), A.graph.row_map.data(), \ + A.graph.entries.data(), A.values.data(), x.data(), \ + colx, ldx, y.data(), ldy); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#ifdef KOKKOS_ENABLE_SERIAL +KOKKOSSPARSE_SPMV_MV_MKL(float, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_MKL(double, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +KOKKOSSPARSE_SPMV_MV_MKL(float, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_MKL(double, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#endif + +#undef KOKKOSSPARSE_SPMV_MV_MKL + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#endif + +// cuSPARSE +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#include "cusparse.h" +#include "KokkosKernels_SparseUtils_cusparse.hpp" + +// +// From https://docs.nvidia.com/cuda/cusparse/index.html#bsrmv +// Several comments on bsrmv(): +// - Only blockDim > 1 is supported +// - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported +// - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported. +// +namespace KokkosSparse { +namespace Experimental { +namespace Impl { + +template +void spmv_block_impl_cusparse( + const KokkosKernels::Experimental::Controls& controls, const char mode[], + typename YVector::non_const_value_type const& alpha, const AMatrix& A, + const XVector& x, typename YVector::non_const_value_type const& beta, + const YVector& y) { + using offset_type = typename AMatrix::non_const_size_type; + using entry_type = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + + /* initialize cusparse library */ + cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + + /* Set the operation mode */ + cusparseOperation_t myCusparseOperation; + switch (toupper(mode[0])) { + case 'N': myCusparseOperation = CUSPARSE_OPERATION_NON_TRANSPOSE; break; + default: { + std::cerr << "Mode " << mode << " invalid for cusparse[*]bsrmv.\n"; + throw std::invalid_argument("Invalid mode"); + } break; + } + +#if (9000 <= CUDA_VERSION) + + /* create and set the matrix descriptor */ + cusparseMatDescr_t descrA = 0; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; + + /* perform the actual SpMV operation */ + if ((std::is_same::value) && + (std::is_same::value)) { + if (std::is_same::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), descrA, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if (std::is_same::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), descrA, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if (std::is_same>::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), descrA, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else if (std::is_same>::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmv( + cusparseHandle, dirA, myCusparseOperation, A.numRows(), A.numCols(), + A.nnz(), reinterpret_cast(&alpha), descrA, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), + reinterpret_cast(&beta), + reinterpret_cast(y.data()))); + } else { + throw std::logic_error( + "Trying to call cusparse[*]bsrmv with a scalar type not " + "float/double, " + "nor complex of either!"); + } + } else { + throw std::logic_error( + "With cuSPARSE pre-10.0, offset and entry types must be int. " + "Something wrong with TPL avail logic."); + } + + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); +#endif // CUDA_VERSION +} + +// Reference +// https://docs.nvidia.com/cuda/cusparse/index.html#bsrmm +// Several comments on bsrmm(): +// - Only blockDim > 1 is supported +// - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported +// - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported. +// +template +void spm_mv_block_impl_cusparse( + const KokkosKernels::Experimental::Controls& controls, const char mode[], + typename YVector::non_const_value_type const& alpha, const AMatrix& A, + const XVector& x, typename YVector::non_const_value_type const& beta, + const YVector& y) { + using offset_type = typename AMatrix::non_const_size_type; + using entry_type = typename AMatrix::non_const_ordinal_type; + using value_type = typename AMatrix::non_const_value_type; + + /* initialize cusparse library */ + cusparseHandle_t cusparseHandle = controls.getCusparseHandle(); + + /* Set the operation mode */ + cusparseOperation_t myCusparseOperation; + switch (toupper(mode[0])) { + case 'N': myCusparseOperation = CUSPARSE_OPERATION_NON_TRANSPOSE; break; + default: { + std::cerr << "Mode " << mode << " invalid for cusparse[*]bsrmv.\n"; + throw std::invalid_argument("Invalid mode"); + } break; + } + + int colx = static_cast(x.extent(1)); + int ldx = static_cast(x.stride_1()); + int ldy = static_cast(y.stride_1()); + +#if (9000 <= CUDA_VERSION) + + /* create and set the matrix descriptor */ + cusparseMatDescr_t descrA = 0; + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&descrA)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatType(descrA, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetMatIndexBase(descrA, CUSPARSE_INDEX_BASE_ZERO)); + cusparseDirection_t dirA = CUSPARSE_DIRECTION_ROW; + + /* perform the actual SpMV operation */ + if ((std::is_same::value) && + (std::is_same::value)) { + if (std::is_same::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), descrA, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if (std::is_same::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), descrA, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if (std::is_same>::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), descrA, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else if (std::is_same>::value) { + KOKKOS_CUSPARSE_SAFE_CALL(cusparseZbsrmm( + cusparseHandle, dirA, myCusparseOperation, + CUSPARSE_OPERATION_NON_TRANSPOSE, A.numRows(), colx, A.numCols(), + A.nnz(), reinterpret_cast(&alpha), descrA, + reinterpret_cast(A.values.data()), + A.graph.row_map.data(), A.graph.entries.data(), A.blockDim(), + reinterpret_cast(x.data()), ldx, + reinterpret_cast(&beta), + reinterpret_cast(y.data()), ldy)); + } else { + throw std::logic_error( + "Trying to call cusparse[*]bsrmm with a scalar type not " + "float/double, " + "nor complex of either!"); + } + } else { + throw std::logic_error( + "With cuSPARSE pre-10.0, offset and entry types must be int. " + "Something wrong with TPL avail logic."); + } + + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroyMatDescr(descrA)); +#endif // CUDA_VERSION +} + +#define KOKKOSSPARSE_SPMV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_BSRMATRIX< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const, SCALAR const*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, SCALAR*, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = BsrMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const*, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_bsrmatrix(const Controls& controls, const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spmv_block_impl_cusparse(controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#if (9000 <= CUDA_VERSION) +KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutRight, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(double, int, int, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(float, int, int, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#endif + +#undef KOKKOSSPARSE_SPMV_CUSPARSE + +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ + COMPILE_LIBRARY) \ + template <> \ + struct SPMV_MV_BSRMATRIX< \ + SCALAR const, ORDINAL const, Kokkos::Device, \ + Kokkos::MemoryTraits, OFFSET const, SCALAR const**, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ + using device_type = Kokkos::Device; \ + using memory_trait_type = Kokkos::MemoryTraits; \ + using AMatrix = BsrMatrix; \ + using XVector = Kokkos::View< \ + SCALAR const**, LAYOUT, device_type, \ + Kokkos::MemoryTraits>; \ + using YVector = \ + Kokkos::View; \ + using Controls = KokkosKernels::Experimental::Controls; \ + \ + using coefficient_type = typename YVector::non_const_value_type; \ + \ + static void spmv_mv_bsrmatrix(const Controls& controls, const char mode[], \ + const coefficient_type& alpha, \ + const AMatrix& A, const XVector& x, \ + const coefficient_type& beta, \ + const YVector& y) { \ + std::string label = "KokkosSparse::spmv[TPL_CUSPARSE,BSRMATRIX" + \ + Kokkos::ArithTraits::name() + "]"; \ + Kokkos::Profiling::pushRegion(label); \ + spm_mv_block_impl_cusparse(controls, mode, alpha, A, x, beta, y); \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + +#if (9000 <= CUDA_VERSION) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, + Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::CudaSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, + Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, + Kokkos::LayoutRight, Kokkos::CudaUVMSpace, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +#endif + +#undef KOKKOSSPARSE_SPMV_MV_CUSPARSE + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#endif + +#endif // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP diff --git a/src/sparse/KokkosSparse_BsrMatrix.hpp b/src/sparse/KokkosSparse_BsrMatrix.hpp index 17273884bc..78ea66f48b 100644 --- a/src/sparse/KokkosSparse_BsrMatrix.hpp +++ b/src/sparse/KokkosSparse_BsrMatrix.hpp @@ -52,7 +52,15 @@ #ifndef KOKKOS_SPARSE_BSRMATRIX_HPP_ #define KOKKOS_SPARSE_BSRMATRIX_HPP_ -#include "KokkosSparse_BsrMatrix_impl.hpp" +#include +#include +#include +#include + +#include "Kokkos_Core.hpp" +#include "Kokkos_StaticCrsGraph.hpp" +#include "Kokkos_ArithTraits.hpp" +#include "KokkosSparse_CrsMatrix.hpp" namespace KokkosSparse { @@ -65,7 +73,7 @@ struct BsrRowView { //! The type of the column indices in the row. typedef typename MatrixType::ordinal_type ordinal_type; //! The type for returned block of values. - typedef Kokkos::View block_values_type; @@ -170,9 +178,8 @@ struct BsrRowView { /// block-row KOKKOS_INLINE_FUNCTION block_values_type block(const ordinal_type& K) const { - return block_values_type( - &(values_[K * blockDim_ * blockDim_]), - Kokkos::LayoutStride(blockDim_, blockDim_, blockDim_, 1)); + return block_values_type(&(values_[K * blockDim_ * blockDim_]), + Kokkos::LayoutRight(blockDim_, blockDim_)); } /// \brief Return offset into colidx_ for the requested block idx @@ -201,7 +208,7 @@ struct BsrRowViewConst { //! The type of the column indices in the row. typedef const typename MatrixType::non_const_ordinal_type ordinal_type; //! The type for returned block of values. - typedef Kokkos::View block_values_type; @@ -295,14 +302,20 @@ struct BsrRowViewConst { return values_[K * blockDim_ * blockDim_ + i * blockDim_ + j]; } + /// \brief Return the block column index for a specified block K + /// + /// \param K [in] must be the LOCAL block index within this block-row + /// \return Block column index for "uncompressed" block row + KOKKOS_INLINE_FUNCTION + ordinal_type block_colidx(const ordinal_type K) const { return colidx_[K]; } + /// \brief Return unmanaged 2D strided View wrapping local block K from this /// block-row \param K [in] must be the LOCAL block index within this /// block-row KOKKOS_INLINE_FUNCTION block_values_type block(const ordinal_type& K) const { - return block_values_type( - &(values_[K * blockDim_ * blockDim_]), - Kokkos::LayoutStride(blockDim_, blockDim_, blockDim_, 1)); + return block_values_type(&(values_[K * blockDim_ * blockDim_]), + Kokkos::LayoutRight(blockDim_, blockDim_)); } /// \brief Return offset into colidx_ for the requested block idx @@ -428,7 +441,7 @@ class BsrMatrix { /// /// mfh: numCols and nnz should be properties of the graph, not the matrix. /// Then BsrMatrix needs methods to get these from the graph. - BsrMatrix() : numCols_(0), blockDim_(0) {} + BsrMatrix() : graph(), values(), dev_config(), numCols_(0), blockDim_(1) {} //! Copy constructor (shallow copy). template ( rows.extent(0) - static_cast(1)) @@ -642,7 +677,13 @@ class BsrMatrix { BsrMatrix(const std::string& /*label*/, const OrdinalType& ncols, const values_type& vals, const staticcrsgraph_type& graph_, const OrdinalType& blockDimIn) - : graph(graph_), values(vals), numCols_(ncols), blockDim_(blockDimIn) {} + : graph(graph_), values(vals), numCols_(ncols), blockDim_(blockDimIn) { + if (blockDim_ < 1) { + std::ostringstream os; + os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } /// \brief Constructor that accepts a CrsMatrix and block dimension, /// assuming the provided CrsMatrix has appropriate block structure. @@ -658,6 +699,11 @@ class BsrMatrix { typedef typename crs_graph_type::row_map_type crs_graph_row_map_type; blockDim_ = blockDimIn; + if (blockDim_ < 1) { + std::ostringstream os; + os << "KokkosSparse::BsrMatrix: Inappropriate block size: " << blockDim_; + Kokkos::Impl::throw_runtime_exception(os.str()); + } assert( (crs_mtx.numCols() % blockDim_ == 0) && @@ -987,8 +1033,8 @@ class BsrMatrix { } private: - ordinal_type numCols_; - ordinal_type blockDim_; // TODO Assuming square blocks for now + ordinal_type numCols_ = 0; + ordinal_type blockDim_ = 1; // TODO Assuming square blocks for now }; //---------------------------------------------------------------------------- diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index ab5196ac55..9837c9fbe3 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -49,6 +49,7 @@ #include "KokkosKernels_Controls.hpp" #include "KokkosSparse_spmv_spec.hpp" #include "KokkosSparse_spmv_struct_spec.hpp" +#include "KokkosSparse_spmv_blockcrsmatrix_spec.hpp" #include "KokkosSparse_spmv_bsrmatrix_spec.hpp" #include #include "KokkosSparse_BsrMatrix.hpp" @@ -66,11 +67,13 @@ struct RANK_TWO {}; } // namespace template + class YVector, + typename std::enable_if< + KokkosSparse::is_crs_matrix::value>::type* = nullptr> void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y, const RANK_ONE) { - // Make sure that both x and y have the same rank. + // Make sure that x and y have the same rank. static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), "KokkosSparse::spmv: Vector ranks do not match."); @@ -226,6 +229,295 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], } } +template < + class AlphaType, class AMatrix, class XVector, class BetaType, + class YVector, + typename std::enable_if::value>::type* = nullptr> +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE) { + // Make sure that x and y have the same rank. + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); + // Make sure that x (and therefore y) is rank 1. + static_assert(static_cast(XVector::rank) == 1, + "KokkosSparse::spmv: Both Vector inputs must have rank 1 " + "in order to call this specialization of spmv."); + // Make sure that y is non-const. + static_assert(std::is_same::value, + "KokkosSparse::spmv: Output Vector must be non-const."); + // + if (A.blockDim() == 1) { + KokkosSparse::CrsMatrix< + typename AMatrix::value_type, typename AMatrix::ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::size_type> + Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); + KokkosSparse::spmv(controls, mode, alpha, Acrs, x, beta, y, RANK_ONE()); + return; + } + // Check compatibility of dimensions at run time. + if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { + if ((x.extent(1) != y.extent(1)) || + (static_cast(A.numCols() * A.blockDim()) != + static_cast(x.extent(0))) || + (static_cast(A.numRows() * A.blockDim()) != + static_cast(y.extent(0)))) { + std::ostringstream os; + os << "KokkosSparse::spmv (BlockCrsMatrix): Dimensions do not match: " + << ", A: " << A.numRows() * A.blockDim() << " x " + << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); + + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } else { + if ((x.extent(1) != y.extent(1)) || + (static_cast(A.numCols() * A.blockDim()) != + static_cast(y.extent(0))) || + (static_cast(A.numRows() * A.blockDim()) != + static_cast(x.extent(0)))) { + std::ostringstream os; + os << "KokkosSparse::spmv (BlockCrsMatrix): Dimensions do not match " + "(transpose): " + << ", A: " << A.numRows() * A.blockDim() << " x " + << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); + + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } + // + typedef KokkosSparse::Experimental::BlockCrsMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type> + AMatrix_Internal; + + typedef Kokkos::View< + typename XVector::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits > + XVector_Internal; + + typedef Kokkos::View< + typename YVector::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits > + YVector_Internal; + + AMatrix_Internal A_i(A); + XVector_Internal x_i(x); + YVector_Internal y_i(y); + + return Experimental::Impl::SPMV_BLOCKCRSMATRIX< + typename AMatrix_Internal::value_type, + typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::size_type, + typename XVector_Internal::value_type*, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type*, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits>::spmv_blockcrsmatrix(controls, + mode, + alpha, A_i, + x_i, beta, + y_i); +} + +template ::value>::type* = nullptr> +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_ONE) { + // Make sure that x and y have the same rank. + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); + // Make sure that x (and therefore y) is rank 1. + static_assert(static_cast(XVector::rank) == 1, + "KokkosSparse::spmv: Both Vector inputs must have rank 1 " + "in order to call this specialization of spmv."); + // Make sure that y is non-const. + static_assert(std::is_same::value, + "KokkosSparse::spmv: Output Vector must be non-const."); + + // + if (A.blockDim() == 1) { + KokkosSparse::CrsMatrix< + typename AMatrix::value_type, typename AMatrix::ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::size_type> + Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); + KokkosSparse::spmv(controls, mode, alpha, Acrs, x, beta, y, RANK_ONE()); + return; + } + // Check compatibility of dimensions at run time. + if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { + if ((x.extent(1) != y.extent(1)) || + (static_cast(A.numCols() * A.blockDim()) != + static_cast(x.extent(0))) || + (static_cast(A.numRows() * A.blockDim()) != + static_cast(y.extent(0)))) { + std::ostringstream os; + os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match: " + << ", A: " << A.numRows() * A.blockDim() << " x " + << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); + + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } else { + if ((x.extent(1) != y.extent(1)) || + (static_cast(A.numCols() * A.blockDim()) != + static_cast(y.extent(0))) || + (static_cast(A.numRows() * A.blockDim()) != + static_cast(x.extent(0)))) { + std::ostringstream os; + os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match " + "(transpose): " + << ", A: " << A.numRows() * A.blockDim() << " x " + << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); + + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } + // + typedef KokkosSparse::Experimental::BsrMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type> + AMatrix_Internal; + + typedef Kokkos::View< + typename XVector::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits > + XVector_Internal; + + typedef Kokkos::View< + typename YVector::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits > + YVector_Internal; + + AMatrix_Internal A_i(A); + XVector_Internal x_i(x); + YVector_Internal y_i(y); + + if (alpha == Kokkos::ArithTraits::zero() || A_i.numRows() == 0 || + A_i.numCols() == 0 || A_i.nnz() == 0) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y_i, beta, y_i); + return; + } + + // + // Whether to call KokkosKernel's native implementation, even if a TPL impl is + // available + bool useFallback = controls.isParameter("algorithm") && + controls.getParameter("algorithm") == "native"; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE does not support the modes (C), (T), (H) + if (std::is_same::value || + std::is_same::value) { +#if defined(CUSPARSE_VERSION) + useFallback = useFallback || (mode[0] != NoTranspose[0]); +#endif + } +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (std::is_same::value) { + useFallback = useFallback || (mode[0] == Conjugate[0]); + } +#endif + + if (useFallback) { + // Explicitly call the non-TPL SPMV_BSRMATRIX implementation + std::string label = + "KokkosSparse::spmv[NATIVE,BSMATRIX," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + Experimental::Impl::SPMV_BSRMATRIX< + typename AMatrix_Internal::const_value_type, + typename AMatrix_Internal::const_ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::const_size_type, + typename XVector_Internal::const_value_type*, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type*, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits, + false>::spmv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); + Kokkos::Profiling::popRegion(); + } else { +#define __SPMV_TYPES__ \ + typename AMatrix_Internal::const_value_type, \ + typename AMatrix_Internal::const_ordinal_type, \ + typename AMatrix_Internal::device_type, \ + typename AMatrix_Internal::memory_traits, \ + typename AMatrix_Internal::const_size_type, \ + typename XVector_Internal::const_value_type*, \ + typename XVector_Internal::array_layout, \ + typename XVector_Internal::device_type, \ + typename XVector_Internal::memory_traits, \ + typename YVector_Internal::value_type*, \ + typename YVector_Internal::array_layout, \ + typename YVector_Internal::device_type, \ + typename YVector_Internal::memory_traits + + constexpr bool tpl_spec_avail = + KokkosSparse::Experimental::Impl::spmv_bsrmatrix_tpl_spec_avail< + __SPMV_TYPES__>::value; + + constexpr bool eti_spec_avail = + tpl_spec_avail + ? KOKKOSKERNELS_IMPL_COMPILE_LIBRARY /* force FALSE in app/test */ + : KokkosSparse::Experimental::Impl::spmv_bsrmatrix_eti_spec_avail< + __SPMV_TYPES__>::value; + + Experimental::Impl::SPMV_BSRMATRIX<__SPMV_TYPES__, tpl_spec_avail, + eti_spec_avail>::spmv_bsrmatrix(controls, + mode, + alpha, + A_i, x_i, + beta, + y_i); + +#undef __SPMV_TYPES__ + } +} + template struct SPMV2D1D { @@ -307,18 +599,24 @@ struct SPMV2D1D + class YVector, + typename std::enable_if< + KokkosSparse::is_crs_matrix::value>::type* = nullptr> void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y, const RANK_TWO) { - // Make sure that both x and y have the same rank. + // Make sure that x and y have the same rank. static_assert( static_cast(XVector::rank) == static_cast(YVector::rank), - "KokkosBlas::spmv: Vector ranks do not match."); + "KokkosSparse::spmv: Vector ranks do not match."); + // Make sure that x (and therefore y) is rank 2. + static_assert(static_cast(XVector::rank) == 2, + "KokkosSparse::spmv: Both Vector inputs must have rank 2 " + "in order to call this specialization of spmv."); // Make sure that y is non-const. static_assert(std::is_same::value, - "KokkosBlas::spmv: Output Vector must be non-const."); + "KokkosSparse::spmv: Output Vector must be non-const."); // Check compatibility of dimensions at run time. if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { @@ -412,6 +710,317 @@ void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[], } } +template ::value>::type* = nullptr> +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO) { + // Make sure that x and y have the same rank. + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); + // Make sure that x (and therefore y) is rank 2. + static_assert(static_cast(XVector::rank) == 2, + "KokkosSparse::spmv: Both Vector inputs must have rank 2 " + "in order to call this specialization of spmv."); + // Make sure that y is non-const. + static_assert(std::is_same::value, + "KokkosSparse::spmv: Output Vector must be non-const."); + + // + if (A.blockDim() == 1) { + KokkosSparse::CrsMatrix< + typename AMatrix::value_type, typename AMatrix::ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::size_type> + Acrs("bsr_to_crs", A.numCols(), A.values, A.graph); + KokkosSparse::spmv(controls, mode, alpha, Acrs, x, beta, y, RANK_TWO()); + return; + } + // Check compatibility of dimensions at run time. + if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { + if ((x.extent(1) != y.extent(1)) || + (static_cast(A.numCols() * A.blockDim()) != + static_cast(x.extent(0))) || + (static_cast(A.numRows() * A.blockDim()) != + static_cast(y.extent(0)))) { + std::ostringstream os; + os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match: " + << ", A: " << A.numRows() * A.blockDim() << " x " + << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); + + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } else { + if ((x.extent(1) != y.extent(1)) || + (static_cast(A.numCols() * A.blockDim()) != + static_cast(y.extent(0))) || + (static_cast(A.numRows() * A.blockDim()) != + static_cast(x.extent(0)))) { + std::ostringstream os; + os << "KokkosSparse::spmv (BsrMatrix): Dimensions do not match " + "(transpose): " + << ", A: " << A.numRows() * A.blockDim() << " x " + << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); + + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } + // + typedef KokkosSparse::Experimental::BsrMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type> + AMatrix_Internal; + AMatrix_Internal A_i(A); + + typedef Kokkos::View< + typename XVector::const_value_type**, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits > + XVector_Internal; + XVector_Internal x_i(x); + + typedef Kokkos::View< + typename YVector::non_const_value_type**, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits > + YVector_Internal; + YVector_Internal y_i(y); + // + if (alpha == Kokkos::ArithTraits::zero() || A_i.numRows() == 0 || + A_i.numCols() == 0 || A_i.nnz() == 0) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y_i, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y_i, beta, y_i); + return; + } + // + // Call single-vector version if appropriate + // + if (x.extent(1) == 1) { + typedef Kokkos::View< + typename XVector::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits > + XVector_SubInternal; + typedef Kokkos::View< + typename YVector::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits > + YVector_SubInternal; + + XVector_SubInternal x_0 = Kokkos::subview(x_i, Kokkos::ALL(), 0); + YVector_SubInternal y_0 = Kokkos::subview(y_i, Kokkos::ALL(), 0); + + return spmv(controls, mode, alpha, A_i, x_0, beta, y_0, RANK_ONE()); + } + // + // Whether to call KokkosKernel's native implementation, even if a TPL impl is + // available + bool useFallback = controls.isParameter("algorithm") && + controls.getParameter("algorithm") == "native"; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + // cuSPARSE does not support the modes (C), (T), (H) + if (std::is_same::value || + std::is_same::value) { +#if defined(CUSPARSE_VERSION) + useFallback = useFallback || (mode[0] != NoTranspose[0]); +#endif + } +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if (std::is_same::value) { + useFallback = useFallback || (mode[0] == Conjugate[0]); + } +#endif + + if (useFallback) { + // Explicitly call the non-TPL SPMV_BSRMATRIX implementation + std::string label = + "KokkosSparse::spmv[NATIVE,BSMATRIX," + + Kokkos::ArithTraits< + typename AMatrix_Internal::non_const_value_type>::name() + + "]"; + Kokkos::Profiling::pushRegion(label); + Experimental::Impl::SPMV_MV_BSRMATRIX< + typename AMatrix_Internal::const_value_type, + typename AMatrix_Internal::const_ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::const_size_type, + typename XVector_Internal::const_value_type**, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type**, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits, + false>::spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); + Kokkos::Profiling::popRegion(); + } else { + Experimental::Impl::SPMV_MV_BSRMATRIX< + typename AMatrix_Internal::const_value_type, + typename AMatrix_Internal::const_ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::const_size_type, + typename XVector_Internal::const_value_type**, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type**, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits>::spmv_mv_bsrmatrix(controls, + mode, + alpha, A_i, + x_i, beta, + y_i); + } +} + +template < + class AlphaType, class AMatrix, class XVector, class BetaType, + class YVector, + typename std::enable_if::value>::type* = nullptr> +void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], + const AlphaType& alpha, const AMatrix& A, const XVector& x, + const BetaType& beta, const YVector& y, const RANK_TWO) { + // Make sure that x and y have the same rank. + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); + // Make sure that x (and therefore y) is rank 2. + static_assert(static_cast(XVector::rank) == 2, + "KokkosSparse::spmv: Both Vector inputs must have rank 2 " + "in order to call this specialization of spmv."); + // Make sure that y is non-const. + static_assert(std::is_same::value, + "KokkosSparse::spmv: Output Vector must be non-const."); + + // + if (A.blockDim() == 1) { + KokkosSparse::CrsMatrix< + typename AMatrix::value_type, typename AMatrix::ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::size_type> + Acrs("blockcrs_to_crs", A.numCols(), A.values, A.graph); + KokkosSparse::spmv(controls, mode, alpha, Acrs, x, beta, y, RANK_TWO()); + return; + } + // Check compatibility of dimensions at run time. + if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { + if ((x.extent(1) != y.extent(1)) || + (static_cast(A.numCols() * A.blockDim()) != + static_cast(x.extent(0))) || + (static_cast(A.numRows() * A.blockDim()) != + static_cast(y.extent(0)))) { + std::ostringstream os; + os << "KokkosSparse::spmv (BlockCrsMatrix): Dimensions do not match: " + << ", A: " << A.numRows() * A.blockDim() << " x " + << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); + + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } else { + if ((x.extent(1) != y.extent(1)) || + (static_cast(A.numCols() * A.blockDim()) != + static_cast(y.extent(0))) || + (static_cast(A.numRows() * A.blockDim()) != + static_cast(x.extent(0)))) { + std::ostringstream os; + os << "KokkosSparse::spmv (BlockCrsMatrix): Dimensions do not match " + "(transpose): " + << ", A: " << A.numRows() * A.blockDim() << " x " + << A.numCols() * A.blockDim() << ", x: " << x.extent(0) << " x " + << x.extent(1) << ", y: " << y.extent(0) << " x " << y.extent(1); + + Kokkos::Impl::throw_runtime_exception(os.str()); + } + } + // + typedef KokkosSparse::Experimental::BlockCrsMatrix< + typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, + typename AMatrix::device_type, Kokkos::MemoryTraits, + typename AMatrix::const_size_type> + AMatrix_Internal; + AMatrix_Internal A_i(A); + + typedef Kokkos::View< + typename XVector::const_value_type**, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits > + XVector_Internal; + XVector_Internal x_i(x); + + typedef Kokkos::View< + typename YVector::non_const_value_type**, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits > + YVector_Internal; + YVector_Internal y_i(y); + // + // + // Call single-vector version if appropriate + // + if (x.extent(1) == 1) { + typedef Kokkos::View< + typename XVector::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename XVector::device_type, + Kokkos::MemoryTraits > + XVector_SubInternal; + typedef Kokkos::View< + typename YVector::non_const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename YVector::device_type, Kokkos::MemoryTraits > + YVector_SubInternal; + + XVector_SubInternal x_0 = Kokkos::subview(x_i, Kokkos::ALL(), 0); + YVector_SubInternal y_0 = Kokkos::subview(y_i, Kokkos::ALL(), 0); + + return spmv(controls, mode, alpha, A_i, x_0, beta, y_0, RANK_ONE()); + } + // + return Experimental::Impl::SPMV_MV_BLOCKCRSMATRIX< + typename AMatrix_Internal::value_type, + typename AMatrix_Internal::ordinal_type, + typename AMatrix_Internal::device_type, + typename AMatrix_Internal::memory_traits, + typename AMatrix_Internal::size_type, + typename XVector_Internal::value_type**, + typename XVector_Internal::array_layout, + typename XVector_Internal::device_type, + typename XVector_Internal::memory_traits, + typename YVector_Internal::value_type**, + typename YVector_Internal::array_layout, + typename YVector_Internal::device_type, + typename YVector_Internal::memory_traits>:: + spmv_mv_blockcrsmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); +} + /// \brief Public interface to local sparse matrix-vector multiply. /// /// Compute y = beta*y + alpha*Op(A)*x, where x and y are either both @@ -432,12 +1041,32 @@ void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char mode[], /// multivector (rank-2 Kokkos::View). It must have the same number /// of columns as x. template ::value>::type* = nullptr> + class YVector> void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], const AlphaType& alpha, const AMatrix& A, const XVector& x, const BetaType& beta, const YVector& y) { + // Make sure that both x and y have the same rank. + static_assert( + static_cast(XVector::rank) == static_cast(YVector::rank), + "KokkosSparse::spmv: Vector ranks do not match."); + // Make sure that y is non-const. + static_assert(std::is_same::value, + "KokkosSparse::spmv: Output Vector must be non-const."); + + // + if (alpha == Kokkos::ArithTraits::zero() || A.numRows() == 0 || + A.numCols() == 0 || A.nnz() == 0) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + return; + } + // using RANK_SPECIALISE = typename std::conditional(XVector::rank) == 2, RANK_TWO, RANK_ONE>::type; @@ -468,94 +1097,28 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], /// matrix; KokkosSparse::BsrMatrix instance. \param x [in] A multivector /// (rank-2 Kokkos::View). \param beta [in] Scalar multiplier for the /// (multivector y. \param y [in/out] multivector (rank-2 Kokkos::View). -template ::value>::type* = nullptr> -void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], - const AlphaType& alpha, const AMatrix& A, const XVector& x, - const BetaType& beta, const YVector& y) { - if (mode[0] != NoTranspose[0]) { - Kokkos::Impl::throw_runtime_exception( - "BsrMatrix SpMV only supports mode=N"); - } - - static_assert(XVector::rank == 2, - "KokkosSparse::spmv on a BsrMatrix requires X with rank 2"); - static_assert(YVector::rank == 2, - "KokkosSparse::spmv on a BsrMatrix requires Y with rank 2"); - - static_assert(KokkosKernels::Impl::kk_is_gpu_mem_space< - typename XVector::memory_space>(), - "KokkosSparse::spmv on a BsrMatrix requires X be in a " - "CUDA-accessible space"); - static_assert(KokkosKernels::Impl::kk_is_gpu_mem_space< - typename YVector::memory_space>(), - "KokkosSparse::spmv on a BsrMatrix requires Y be in a " - "CUDA-accessible space"); - static_assert(KokkosKernels::Impl::kk_is_gpu_mem_space< - typename AMatrix::memory_space>(), - "KokkosSparse::spmv on a BsrMatrix requires A be in a " - "CUDA-accessible space"); - - typedef KokkosSparse::Experimental::BsrMatrix< - typename AMatrix::const_value_type, typename AMatrix::const_ordinal_type, - typename AMatrix::device_type, Kokkos::MemoryTraits, - typename AMatrix::const_size_type> - AMatrix_Internal; - - typedef Kokkos::View< - typename XVector::const_value_type**, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename XVector::device_type, - Kokkos::MemoryTraits > - XVector_Internal; - - typedef Kokkos::View< - typename YVector::non_const_value_type**, - typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, - typename YVector::device_type, Kokkos::MemoryTraits > - YVector_Internal; - - AMatrix_Internal A_i(A); - XVector_Internal x_i(x); - YVector_Internal y_i(y); - - return Experimental::Impl::SPMV_BSRMATRIX< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type**, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type**, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_bsrmatrix(controls, mode, - alpha, A_i, x_i, - beta, y_i); -} /* Catch-all spmv interface that throws a compile-time error if KokkosSparse::spmv is call on a non-BsrMatrix or non-CrsMatrix. */ -template ::value && - !KokkosSparse::is_crs_matrix::value>::type* = nullptr> +template < + class AlphaType, class AMatrix, class XVector, class BetaType, + class YVector, + typename std::enable_if< + !KokkosSparse::Experimental::is_block_crs_matrix::value && + !KokkosSparse::Experimental::is_bsr_matrix::value && + !KokkosSparse::is_crs_matrix::value>::type* = nullptr> void spmv(KokkosKernels::Experimental::Controls /*controls*/, const char[] /*mode*/, const AlphaType& /*alpha*/, const AMatrix& /*A*/, const XVector& /*x*/, const BetaType& /*beta*/, const YVector& /*y*/) { // have to arrange this so that the compiler can't tell this is false until // instantiation - static_assert(KokkosSparse::is_crs_matrix::value || - KokkosSparse::Experimental::is_bsr_matrix::value, - "SpMV: AMatrix must be CrsMatrix or BsrMatrix"); + static_assert( + KokkosSparse::is_crs_matrix::value || + KokkosSparse::Experimental::is_bsr_matrix::value || + KokkosSparse::Experimental::is_block_crs_matrix::value, + "SpMV: AMatrix must be CrsMatrix, BsrMatrix, or BlockCrsMatrix"); } // Overload for backward compatibility and also just simpler diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp new file mode 100644 index 0000000000..3aec74466a --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp @@ -0,0 +1,48 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_IMPL_HPP +#define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_IMPL_HPP + +#endif // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_IMPL_HPP diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp new file mode 100644 index 0000000000..36696acdda --- /dev/null +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp @@ -0,0 +1,421 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ +#ifndef KOKKOSSPARSE_IMPL_SPMV_BLOCKCRSMATRIX_SPEC_HPP_ +#define KOKKOSSPARSE_IMPL_SPMV_BLOCKCRSMATRIX_SPEC_HPP_ + +#include +#include +#include + +#include "KokkosSparse_BlockCrsMatrix.hpp" +#include "KokkosKernels_Controls.hpp" +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +#include +#endif + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { + +// default is no eti available +template +struct spmv_blockcrsmatrix_eti_spec_avail { + enum : bool { value = false }; +}; + +// default is no eti available +template +struct spmv_mv_blockcrsmatrix_eti_spec_avail { + enum : bool { value = false }; +}; + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#define KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_blockcrsmatrix_eti_spec_avail< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +#define KOKKOSSPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_mv_blockcrsmatrix_eti_spec_avail< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +// Include which ETIs are available +#include +#include +#include + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { + +// declaration +template ::value> +struct SPMV_BLOCKCRSMATRIX { + typedef BlockCrsMatrix AMatrix; + typedef Kokkos::View XVector; + typedef Kokkos::View YVector; + typedef typename YVector::non_const_value_type YScalar; + + static void spmv_blockcrsmatrix( + const KokkosKernels::Experimental::Controls &controls, const char mode[], + const YScalar &alpha, const AMatrix &A, const XVector &x, + const YScalar &beta, const YVector &y); +}; + +// declaration +template ::value> +struct SPMV_MV_BLOCKCRSMATRIX { + typedef BlockCrsMatrix AMatrix; + typedef Kokkos::View XVector; + typedef Kokkos::View YVector; + typedef typename YVector::non_const_value_type YScalar; + + static void spmv_mv_blockcrsmatrix( + const KokkosKernels::Experimental::Controls &controls, const char mode[], + const YScalar &alpha, const AMatrix &A, const XVector &x, + const YScalar &beta, const YVector &y); +}; + +// actual implementations to be compiled +#if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +template +struct SPMV_BLOCKCRSMATRIX { + typedef BlockCrsMatrix AMatrix; + typedef Kokkos::View XVector; + typedef Kokkos::View YVector; + typedef typename YVector::non_const_value_type YScalar; + + static void spmv_blockcrsmatrix( + const KokkosKernels::Experimental::Controls &controls, const char mode[], + const YScalar &alpha, const AMatrix &A, const XVector &X, + const YScalar &beta, const YVector &Y) { + // + auto h_a_row_map = Kokkos::create_mirror_view(A.graph.row_map); + Kokkos::deep_copy(h_a_row_map, A.graph.row_map); + // + auto h_a_entries = Kokkos::create_mirror_view(A.graph.entries); + Kokkos::deep_copy(h_a_entries, A.graph.entries); + // + auto h_a_values = Kokkos::create_mirror_view(A.values); + Kokkos::deep_copy(h_a_values, A.values); + // + auto h_x = Kokkos::create_mirror_view(X); + Kokkos::deep_copy(h_x, X); + // + auto h_y = Kokkos::create_mirror_view(Y); + Kokkos::deep_copy(h_y, Y); + // + const auto numBlockRows = A.numRows(); + const auto blockSize = A.blockDim(); + const auto blockSize2 = blockSize * blockSize; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; + using value_type = typename AMatrix::non_const_value_type; + // + for (ordinal_type ii = 0; ii < numBlockRows * blockSize; ++ii) + h_y(ii) = beta * h_y(ii); + // + if ((mode[0] == KokkosSparse::NoTranspose[0]) || + (mode[0] == KokkosSparse::Conjugate[0])) { + bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); + for (ordinal_type iblock = 0; iblock < numBlockRows; ++iblock) { + const auto jbeg = h_a_row_map(iblock); + const auto jend = h_a_row_map(iblock + 1); + for (auto jb = jbeg; jb < jend; ++jb) { + const auto col_block = h_a_entries(jb); + for (ordinal_type ir = 0; ir < blockSize; ++ir) { + for (ordinal_type jr = 0; jr < blockSize; ++jr) { + const size_type index = jbeg * blockSize2 + jr + + (jb - jbeg) * blockSize + + ir * (jend - jbeg) * blockSize; + const auto avalue = + (useConjugate) + ? Kokkos::ArithTraits::conj(h_a_values(index)) + : h_a_values(index); + h_y(ir + iblock * blockSize) += + alpha * avalue * h_x(jr + col_block * blockSize); + } + } + } + } + } else if ((mode[0] == KokkosSparse::Transpose[0]) || + (mode[0] == KokkosSparse::ConjugateTranspose[0])) { + bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); + for (ordinal_type iblock = 0; iblock < numBlockRows; ++iblock) { + const auto jbeg = h_a_row_map(iblock); + const auto jend = h_a_row_map(iblock + 1); + for (ordinal_type ir = 0; ir < blockSize; ++ir) { + for (auto jb = jbeg; jb < jend; ++jb) { + const auto col_block = h_a_entries(jb); + for (ordinal_type jr = 0; jr < blockSize; ++jr) { + const size_type index = jbeg * blockSize2 + ir + + (jb - jbeg) * blockSize + + jr * blockSize * (jend - jbeg); + const auto avalue = + (useConjugate) + ? Kokkos::ArithTraits::conj(h_a_values(index)) + : h_a_values(index); + h_y(ir + col_block * blockSize) += + alpha * avalue * h_x(jr + iblock * blockSize); + } + } + } + } + } + // + Kokkos::deep_copy(Y, h_y); + // + } +}; + +template +struct SPMV_MV_BLOCKCRSMATRIX { + typedef BlockCrsMatrix AMatrix; + typedef Kokkos::View XVector; + typedef Kokkos::View YVector; + typedef typename YVector::non_const_value_type YScalar; + + static void spmv_mv_blockcrsmatrix( + const KokkosKernels::Experimental::Controls &controls, const char mode[], + const YScalar &alpha, const AMatrix &A, const XVector &X, + const YScalar &beta, const YVector &Y) { + // + auto h_a_row_map = Kokkos::create_mirror_view(A.graph.row_map); + Kokkos::deep_copy(h_a_row_map, A.graph.row_map); + // + auto h_a_entries = Kokkos::create_mirror_view(A.graph.entries); + Kokkos::deep_copy(h_a_entries, A.graph.entries); + // + auto h_a_values = Kokkos::create_mirror_view(A.values); + Kokkos::deep_copy(h_a_values, A.values); + // + auto h_x = Kokkos::create_mirror_view(X); + Kokkos::deep_copy(h_x, X); + // + auto h_y = Kokkos::create_mirror_view(Y); + Kokkos::deep_copy(h_y, Y); + // + const auto numRhs = X.extent(1); + const auto numBlockRows = A.numRows(); + const auto blockSize = A.blockDim(); + const auto blockSize2 = blockSize * blockSize; + using ordinal_type = typename AMatrix::non_const_ordinal_type; + using size_type = typename AMatrix::non_const_size_type; + using value_type = typename AMatrix::non_const_value_type; + // + for (ordinal_type jj = 0; jj < numRhs; ++jj) { + for (ordinal_type ii = 0; ii < numBlockRows * blockSize; ++ii) + h_y(ii, jj) = beta * h_y(ii, jj); + } + // + if ((mode[0] == KokkosSparse::NoTranspose[0]) || + (mode[0] == KokkosSparse::Conjugate[0])) { + bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); + for (ordinal_type iblock = 0; iblock < numBlockRows; ++iblock) { + const auto jbeg = h_a_row_map(iblock); + const auto jend = h_a_row_map(iblock + 1); + for (auto jb = jbeg; jb < jend; ++jb) { + const auto col_block = h_a_entries(jb); + for (ordinal_type jj = 0; jj < numRhs; ++jj) { + for (ordinal_type jr = 0; jr < blockSize; ++jr) { + const auto alpha_x = alpha * h_x(jr + col_block * blockSize, jj); + for (ordinal_type ir = 0; ir < blockSize; ++ir) { + const size_type index = jbeg * blockSize2 + jr + + (jb - jbeg) * blockSize + + ir * (jend - jbeg) * blockSize; + const auto avalue = (useConjugate) + ? Kokkos::ArithTraits::conj( + h_a_values(index)) + : h_a_values(index); + h_y(ir + iblock * blockSize, jj) += avalue * alpha_x; + } + } + } + } + } + } else if ((mode[0] == KokkosSparse::Transpose[0]) || + (mode[0] == KokkosSparse::ConjugateTranspose[0])) { + bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); + for (ordinal_type iblock = 0; iblock < numBlockRows; ++iblock) { + const auto jbeg = h_a_row_map(iblock); + const auto jend = h_a_row_map(iblock + 1); + for (auto jb = jbeg; jb < jend; ++jb) { + const auto col_block = h_a_entries(jb); + for (ordinal_type jj = 0; jj < numRhs; ++jj) { + for (ordinal_type ir = 0; ir < blockSize; ++ir) { + for (ordinal_type jr = 0; jr < blockSize; ++jr) { + const size_type index = jbeg * blockSize2 + ir + + (jb - jbeg) * blockSize + + jr * blockSize * (jend - jbeg); + const auto avalue = (useConjugate) + ? Kokkos::ArithTraits::conj( + h_a_values(index)) + : h_a_values(index); + h_y(ir + col_block * blockSize, jj) += + alpha * avalue * h_x(jr + iblock * blockSize, jj); + } + } + } + } + } + } + // + Kokkos::deep_copy(Y, h_y); + // + } +}; + +#endif // !defined(KOKKOSKERNELS_ETI_ONLY) || +// KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +// declare / instantiate the vector version +// Instantiate with A,x,y are all the requested Scalar type (no instantiation of +// mixed-precision operands) +#define KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_BLOCKCRSMATRIX< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, true>; + +#define KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_BLOCKCRSMATRIX< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, true>; + +// declare / instantiate the 2D MV version +// Instantiate with A,x,y are all the requested Scalar type (no instantiation of +// mixed-precision operands) +#define KOKKOSSPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_MV_BLOCKCRSMATRIX< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, true>; + +#define KOKKOSSPARSE_SPMV_MV_BLOCKCRSMATRIX_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_MV_BLOCKCRSMATRIX< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, true>; + +#include +#include +#include + +#endif // KOKKOSSPARSE_IMPL_SPMV_BLOCKCRSMATRIX_SPEC_HPP_ diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 1dd2db6792..24333cafd8 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -42,8 +42,8 @@ //@HEADER */ -#ifndef KOKKOSSPARSE_IMPL_SPMV_TENSOR_CORE_DEF_HPP_ -#define KOKKOSSPARSE_IMPL_SPMV_TENSOR_CORE_DEF_HPP_ +#ifndef KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_IMPL_HPP_ +#define KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_IMPL_HPP_ #if defined(KOKKOS_ENABLE_CUDA) && \ (defined(KOKKOS_ARCH_VOLTA) || defined(KOKKOS_ARCH_AMPERE)) @@ -109,21 +109,21 @@ struct BsrMatrixSpMVTensorCoreFunctor { // grid of warps in team) Kokkos::LayoutRight, typename Device::execution_space::scratch_memory_space, - Kokkos::MemoryTraits > + Kokkos::MemoryTraits> AScratchView; typedef typename Kokkos::View< XFragScalar * [FRAG_K][FRAG_N], typename Kokkos::LayoutRight, // so that [FRAG_K][FRAG_N] part is // contiguous in memory typename Device::execution_space::scratch_memory_space, - Kokkos::MemoryTraits > + Kokkos::MemoryTraits> XScratchView; typedef typename Kokkos::View< YFragScalar * * [FRAG_M][FRAG_N], typename Kokkos::LayoutRight, // so that [FRAG_M][FRAG_N] part is // contiguous in memory typename Device::execution_space::scratch_memory_space, - Kokkos::MemoryTraits > + Kokkos::MemoryTraits> YScratchView; YScalar alpha; @@ -526,4 +526,1153 @@ struct BsrMatrixSpMVTensorCoreDispatcher { #endif // #if CUDA && (VOLTA || AMPERE) -#endif // KOKKOSSPARSE_IMPL_SPMV_TENSOR_CORE_DEF_HPP_ +// +// +// + +#include "KokkosBlas.hpp" +#include "KokkosBatched_Gemv_Serial_Internal.hpp" +#include "KokkosBatched_Gemm_Serial_Internal.hpp" +#include "KokkosKernels_ExecSpaceUtils.hpp" +#include "KokkosSparse_spmv_impl.hpp" + +namespace KokkosSparse { +namespace Experimental { +namespace Impl { +namespace Bsr { + +template +struct BSR_GEMV_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_value_type value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef Kokkos::Details::ArithTraits ATV; + + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename AMatrix::non_const_size_type size_type; + + const value_type alpha; + AMatrix m_A; + XVector m_x; + YVector m_y; + + const ordinal_type block_dim; + const ordinal_type blocks_per_team; + + bool conjugate = false; + + BSR_GEMV_Functor(const value_type alpha_, const AMatrix m_A_, + const XVector m_x_, const YVector m_y_, + const int blocks_per_team_, bool conj_) + : alpha(alpha_), + m_A(m_A_), + m_x(m_x_), + m_y(m_y_), + block_dim(m_A_.blockDim()), + blocks_per_team(blocks_per_team_), + conjugate(conj_) { + static_assert(static_cast(XVector::rank) == 1, + "XVector must be a rank 1 View."); + static_assert(static_cast(YVector::rank) == 1, + "YVector must be a rank 1 View."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type iBlock) const { + const auto ystart = iBlock * block_dim; + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto beta1 = static_cast(1); + // + if (conjugate) { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto xstart = row.block_colidx(ic) * block_dim; + for (ordinal_type ii = 0; ii < block_dim; ++ii) { + value_type t(0); + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + const auto aval = + Kokkos::ArithTraits::conj(Aview(ii, jj)); + t += aval * m_x(xstart + jj); + } + m_y(ystart + ii) += alpha * t; + } + } + } else { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto xstart = row.block_colidx(ic) * block_dim; + KokkosBatched::SerialGemvInternal:: + invoke(block_dim, block_dim, alpha, + Aview.data(), block_dim, 1, + &m_x(xstart), m_x.stride_0(), beta1, + &m_y(ystart), m_y.stride_0()); + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &dev) const { + using y_value_type = typename YVector::non_const_value_type; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(dev, 0, blocks_per_team), + [&](const ordinal_type &loop) { + const ordinal_type iBlock = + static_cast(dev.league_rank()) * blocks_per_team + + loop; + if (iBlock >= m_A.numRows()) { + return; + } + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + // + auto yview = Kokkos::subview( + m_y, Kokkos::make_pair(iBlock * block_dim, + iBlock * block_dim + block_dim)); + // + for (ordinal_type ir = 0; ir < block_dim; ++ir) { + y_value_type sum = 0; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(dev, count), + [&](const ordinal_type &iEntry, y_value_type &lsum) { + const auto start_col = row.block_colidx(iEntry) * block_dim; + for (ordinal_type jr = 0; jr < block_dim; ++jr) { + const value_type val = + conjugate + ? ATV::conj(row.local_block_value(iEntry, ir, jr)) + : row.local_block_value(iEntry, ir, jr); + lsum += val * m_x(start_col + jr); + } + }, + sum); + + Kokkos::single(Kokkos::PerThread(dev), [&]() { + sum *= alpha; + yview(ir) += sum; + }); + } + }); + } +}; + +/* ******************* */ + +// +// spMatVec_no_transpose: version for CPU execution spaces +// (RangePolicy or trivial serial impl used) +// +template ()>::type * = nullptr> +void spMatVec_no_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + + // + // Treat the case y <- alpha * A * x + beta * y + // + + typedef KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + + BSR_GEMV_Functor func(alpha, A, x, y, 1, + useConjugate); + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::parallel_for( + "KokkosSparse::bspmv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } else { + Kokkos::parallel_for( + "KokkosSparse::bspmv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } +} + +/* ******************* */ + +// +// spMatVec_no_transpose: version for GPU execution spaces (TeamPolicy used) +// +template ()>::type * = nullptr> +void spMatVec_no_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + if (A.numRows() <= static_cast(0)) { + return; + } + + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor updates y (by adding alpha Op(A) x). + KokkosBlas::scal(y, beta, y); + + typedef KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + typedef typename AMatrix_Internal::execution_space execution_space; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + int team_size = -1; + int vector_length = -1; + int64_t blocks_per_thread = -1; + + // + // Use the controls to allow the user to pass in some tuning parameters. + // + if (controls.isParameter("team size")) { + team_size = std::stoi(controls.getParameter("team size")); + } + if (controls.isParameter("vector length")) { + vector_length = std::stoi(controls.getParameter("vector length")); + } + if (controls.isParameter("rows per thread")) { + blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + } + + // + // Use the existing launch parameters routine from SPMV + // + int64_t blocks_per_team = + KokkosSparse::Impl::spmv_launch_parameters( + A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; + + AMatrix_Internal A_internal = A; + + BSR_GEMV_Functor func( + alpha, A_internal, x, y, blocks_per_team, useConjugate); + + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bspmv", policy, + func); + } else { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = + Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = + Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bspmv", policy, + func); + } +} + +/* ******************* */ + +template +struct BSR_GEMV_Transpose_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_value_type value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef Kokkos::Details::ArithTraits ATV; + + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename AMatrix::non_const_size_type size_type; + + const value_type alpha; + + AMatrix m_A; + XVector m_x; + YVector m_y; + + const ordinal_type block_dim; + const ordinal_type blocks_per_team; + + bool conjugate = false; + + BSR_GEMV_Transpose_Functor(const value_type alpha_, const AMatrix m_A_, + const XVector m_x_, const YVector m_y_, + const int blocks_per_team_, bool conj_) + : alpha(alpha_), + m_A(m_A_), + m_x(m_x_), + m_y(m_y_), + block_dim(m_A_.blockDim()), + blocks_per_team(blocks_per_team_), + conjugate(conj_) { + static_assert(static_cast(XVector::rank) == 1, + "XVector must be a rank 1 View."); + static_assert(static_cast(YVector::rank) == 1, + "YVector must be a rank 1 View."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type iBlock) const { + // + // Assume that alpha is not zero + // + const auto xstart = iBlock * block_dim; + const auto xview = + Kokkos::subview(m_x, Kokkos::make_pair(xstart, xstart + block_dim)); + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto beta1 = static_cast(1); + const auto alpha1 = beta1; + if (conjugate) { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto ystart = row.block_colidx(ic) * block_dim; + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + value_type t(0); + for (ordinal_type ii = 0; ii < block_dim; ++ii) { + const auto aval = + Kokkos::ArithTraits::conj(Aview(ii, jj)); + t += aval * xview(ii); + } + t *= alpha; + Kokkos::atomic_add(&m_y(ystart + jj), t); + } + } + } else { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto ystart = row.block_colidx(ic) * block_dim; + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + value_type t(0); + KokkosBatched::SerialGemvInternal< + KokkosBatched::Algo::Gemv::Blocked>::invoke( + 1, block_dim, alpha1, Aview.data() + jj, Aview.stride_1(), + Aview.stride_0(), xview.data(), xview.stride_0(), beta1, &t, 1); + t *= alpha; + Kokkos::atomic_add(&m_y(ystart + jj), t); + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &dev) const { + using y_value_type = typename YVector::non_const_value_type; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(dev, 0, blocks_per_team), + [&](const ordinal_type &loop) { + const ordinal_type iBlock = + static_cast(dev.league_rank()) * blocks_per_team + + loop; + if (iBlock >= m_A.numRows()) { + return; + } + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + // + for (ordinal_type ir = 0; ir < block_dim; ++ir) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(dev, count), + [&](const ordinal_type &iEntry) { + for (ordinal_type jr = 0; jr < block_dim; ++jr) { + const value_type val = + conjugate + ? ATV::conj(row.local_block_value(iEntry, jr, ir)) + : row.local_block_value(iEntry, jr, ir); + const ordinal_type ind = row.block_colidx(iEntry); + Kokkos::atomic_add( + &m_y(block_dim * ind + ir), + static_cast( + alpha * val * m_x(block_dim * iBlock + jr))); + } + }); + } + }); + } +}; + +/* ******************* */ + +/// \brief spMatVec_transpose: version for CPU execution spaces (RangePolicy or +/// trivial serial impl used) +template ()>::type * = nullptr> +void spMatVec_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + + if (alpha == Kokkos::ArithTraits::zero()) return; + + // + // Treat the case y <- alpha * A^T * x + beta * y + // + + typedef KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + + AMatrix_Internal A_internal = A; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + + BSR_GEMV_Transpose_Functor func( + alpha, A_internal, x, y, 1, useConjugate); + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::parallel_for( + "KokkosSparse::bspmv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } else { + Kokkos::parallel_for( + "KokkosSparse::bspmv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } +} + +// +// spMatVec_transpose: version for GPU execution spaces (TeamPolicy used) +// +template ()>::type * = nullptr> +void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, const AMatrix &A, + const XVector &x, const BetaType &beta, YVector &y, + bool useConjugate) { + if (A.numRows() <= 0) { + return; + } + + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor works by atomic-adding into y. + KokkosBlas::scal(y, beta, y); + + typedef typename AMatrix::execution_space execution_space; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + int team_size = -1; + int vector_length = -1; + int64_t blocks_per_thread = -1; + + // + // Use the controls to allow the user to pass in some tuning parameters. + // + if (controls.isParameter("team size")) { + team_size = std::stoi(controls.getParameter("team size")); + } + if (controls.isParameter("vector length")) { + vector_length = std::stoi(controls.getParameter("vector length")); + } + if (controls.isParameter("rows per thread")) { + blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + } + + // + // Use the existing launch parameters routine from SPMV + // + int64_t blocks_per_team = + KokkosSparse::Impl::spmv_launch_parameters( + A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; + + BSR_GEMV_Transpose_Functor func( + alpha, A, x, y, blocks_per_team, useConjugate); + + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bspmv", policy, + func); + } else { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = + Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = + Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bspmv", policy, + func); + } +} + +/* ******************* */ + +template +struct BSR_GEMM_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_value_type value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef Kokkos::Details::ArithTraits ATV; + + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename AMatrix::non_const_size_type size_type; + + const value_type alpha; + AMatrix m_A; + XVector m_x; + YVector m_y; + const ordinal_type block_dim; + const ordinal_type num_rhs; + + const ordinal_type blocks_per_team; + + bool conjugate = false; + + BSR_GEMM_Functor(const value_type alpha_, const AMatrix m_A_, + const XVector m_x_, const YVector m_y_, + const int blocks_per_team_, bool conj_) + : alpha(alpha_), + m_A(m_A_), + m_x(m_x_), + m_y(m_y_), + block_dim(m_A_.blockDim()), + num_rhs(m_x_.extent(1)), + blocks_per_team(blocks_per_team_), + conjugate(conj_) { + static_assert(static_cast(XVector::rank) == 2, + "XVector must be a rank 2 View."); + static_assert(static_cast(YVector::rank) == 2, + "YVector must be a rank 2 View."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type iBlock) const { + // + const auto ystart = iBlock * block_dim; + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto beta1 = static_cast(1); + const auto ldx = m_x.stride_1(); + const auto ldy = m_y.stride_1(); + // + if (conjugate) { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto xstart = row.block_colidx(ic) * block_dim; + for (ordinal_type jr = 0; jr < num_rhs; ++jr) { + for (ordinal_type ii = 0; ii < block_dim; ++ii) { + value_type t(0); + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + const auto aval = + Kokkos::ArithTraits::conj(Aview(ii, jj)); + t += aval * m_x(xstart + jj, jr); + } + m_y(ystart + ii, jr) += alpha * t; + } + } + } + } else { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto xstart = row.block_colidx(ic) * block_dim; + KokkosBatched::SerialGemmInternal:: + invoke( + block_dim, num_rhs, block_dim, alpha, Aview.data(), block_dim, + 1, &m_x(xstart, 0), m_x.stride_0(), ldx, beta1, &m_y(ystart, 0), + m_y.stride_0(), ldy); + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &dev) const { + using y_value_type = typename YVector::non_const_value_type; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(dev, 0, blocks_per_team), + [&](const ordinal_type &loop) { + const ordinal_type iBlock = + static_cast(dev.league_rank()) * blocks_per_team + + loop; + if (iBlock >= m_A.numRows()) { + return; + } + // + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const KokkosSparse::Experimental::BsrRowViewConst row( + m_A.values, m_A.graph.entries, block_dim, count, start); + const auto nrhs = num_rhs; + // + for (ordinal_type ic = 0; ic < nrhs; ++ic) { + for (ordinal_type ir = 0; ir < block_dim; ++ir) { + y_value_type sum = 0; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(dev, count), + [&](const ordinal_type &iEntry, y_value_type &lsum) { + const auto start_col = row.block_colidx(iEntry) * block_dim; + for (ordinal_type jr = 0; jr < block_dim; ++jr) { + const value_type val = + conjugate + ? ATV::conj(row.local_block_value(iEntry, ir, jr)) + : row.local_block_value(iEntry, ir, jr); + lsum += val * m_x(start_col + jr, ic); + } + }, + sum); + + Kokkos::single(Kokkos::PerThread(dev), [&]() { + sum *= alpha; + m_y(iBlock * block_dim + ir, ic) += sum; + }); + } + } + // + }); + } +}; + +/* ******************* */ + +// +// spMatMultiVec_no_transpose: version for CPU execution spaces +// (RangePolicy or trivial serial impl used) +// +template ()>::type * = nullptr> +void spMatMultiVec_no_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + // + // Treat the case y <- alpha * A * x + beta * y + // + typedef KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + + BSR_GEMM_Functor func(alpha, A, x, y, 1, + useConjugate); + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::parallel_for( + "KokkosSparse::bsr_spm_mv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } else { + Kokkos::parallel_for( + "KokkosSparse::bsr_spm_mv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } +} + +/* ******************* */ + +// +// spMatMultiVec_no_transpose: version for GPU execution spaces (TeamPolicy +// used) +// +template ()>::type * = nullptr> +void spMatMultiVec_no_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + if (A.numRows() <= static_cast(0)) { + return; + } + + KokkosBlas::scal(y, beta, y); + + typedef KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + typedef typename AMatrix_Internal::execution_space execution_space; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + int team_size = -1; + int vector_length = -1; + int64_t blocks_per_thread = -1; + + // + // Use the controls to allow the user to pass in some tuning parameters. + // + if (controls.isParameter("team size")) { + team_size = std::stoi(controls.getParameter("team size")); + } + if (controls.isParameter("vector length")) { + vector_length = std::stoi(controls.getParameter("vector length")); + } + if (controls.isParameter("rows per thread")) { + blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + } + + // + // Use the existing launch parameters routine from SPMV + // + int64_t blocks_per_team = + KokkosSparse::Impl::spmv_launch_parameters( + A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; + + AMatrix_Internal A_internal = A; + + BSR_GEMM_Functor func( + alpha, A_internal, x, y, blocks_per_team, useConjugate); + + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bsr_spm_mv", + policy, func); + } else { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = + Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = + Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bsr_spm_mv", + policy, func); + } +} + +/* ******************* */ +template +struct BSR_GEMM_Transpose_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_value_type value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef Kokkos::Details::ArithTraits ATV; + + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename AMatrix::non_const_size_type size_type; + + const value_type alpha; + AMatrix m_A; + XVector m_x; + YVector m_y; + const ordinal_type block_dim; + const ordinal_type num_rhs; + + const ordinal_type blocks_per_team; + + bool conjugate = false; + + BSR_GEMM_Transpose_Functor(const value_type alpha_, const AMatrix m_A_, + const XVector m_x_, const YVector m_y_, + const int blocks_per_team_, bool conj_) + : alpha(alpha_), + m_A(m_A_), + m_x(m_x_), + m_y(m_y_), + block_dim(m_A_.blockDim()), + num_rhs(m_x_.extent(1)), + blocks_per_team(blocks_per_team_), + conjugate(conj_) { + static_assert(static_cast(XVector::rank) == 2, + "XVector must be a rank 2 View."); + static_assert(static_cast(YVector::rank) == 2, + "YVector must be a rank 2 View."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type iBlock) const { + // + const auto xstart = iBlock * block_dim; + const auto xview = Kokkos::subview( + m_x, Kokkos::make_pair(xstart, xstart + block_dim), Kokkos::ALL()); + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto beta1 = static_cast(1); + const auto alpha1 = beta1; + const auto ldx = m_x.stride_1(); + const auto ldy = m_y.stride_1(); + // + if (conjugate) { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto ystart = row.block_colidx(ic) * block_dim; + for (ordinal_type jr = 0; jr < num_rhs; ++jr) { + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + value_type t(0); + for (ordinal_type ii = 0; ii < block_dim; ++ii) { + const auto aval = + Kokkos::ArithTraits::conj(Aview(ii, jj)); + t += aval * xview(ii, jr); + } + t *= alpha; + Kokkos::atomic_add(&m_y(ystart + jj, jr), t); + } + } + } + } else { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto ystart = row.block_colidx(ic) * block_dim; + for (ordinal_type jr = 0; jr < num_rhs; ++jr) { + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + value_type t(0); + KokkosBatched::SerialGemvInternal< + KokkosBatched::Algo::Gemv::Blocked>::invoke( + 1, block_dim, alpha1, Aview.data() + jj, Aview.stride_1(), + Aview.stride_0(), xview.data() + jr * ldx, xview.stride_0(), + beta1, &t, 1); + t *= alpha; + Kokkos::atomic_add(&m_y(ystart + jj, jr), t); + } + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &dev) const { + using y_value_type = typename YVector::non_const_value_type; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(dev, 0, blocks_per_team), + [&](const ordinal_type &loop) { + const ordinal_type iBlock = + static_cast(dev.league_rank()) * blocks_per_team + + loop; + if (iBlock >= m_A.numRows()) { + return; + } + // + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const KokkosSparse::Experimental::BsrRowViewConst row( + m_A.values, m_A.graph.entries, block_dim, count, start); + const auto nrhs = m_x.extent(1); + // + for (ordinal_type ic = 0; ic < nrhs; ++ic) { + for (ordinal_type ir = 0; ir < block_dim; ++ir) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(dev, count), + [&](const ordinal_type &iEntry) { + for (ordinal_type jr = 0; jr < block_dim; ++jr) { + const value_type val = + conjugate + ? ATV::conj(row.local_block_value(iEntry, jr, ir)) + : row.local_block_value(iEntry, jr, ir); + const ordinal_type ind = row.block_colidx(iEntry); + Kokkos::atomic_add( + &m_y(block_dim * ind + ir, ic), + static_cast( + alpha * val * m_x(block_dim * iBlock + jr, ic))); + } + }); + } + } + // + }); + } +}; + +/* ******************* */ + +/// \brief spMatMultiVec_transpose: version for CPU execution spaces +/// (RangePolicy or trivial serial impl used) +template ()>::type * = nullptr> +void spMatMultiVec_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + // + // Treat the case y <- alpha * A^T * x + beta * y + // + typedef KokkosSparse::Experimental::BsrMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + typedef typename AMatrix_Internal::execution_space execution_space; + + AMatrix_Internal A_internal = A; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + + BSR_GEMM_Transpose_Functor func( + alpha, A_internal, x, y, 1, useConjugate); + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::parallel_for( + "KokkosSparse::bsr_spm_mv", + Kokkos::RangePolicy>( + 0, A.numRows()), + func); + } else { + Kokkos::parallel_for( + "KokkosSparse::bsr_spm_mv", + Kokkos::RangePolicy>( + 0, A.numRows()), + func); + } +} + +// +// spMatMultiVec_transpose: version for GPU execution spaces (TeamPolicy used) +// +template ()>::type * = nullptr> +void spMatMultiVec_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, const AMatrix &A, const XVector &x, + const BetaType &beta, YVector &y, bool useConjugate) { + if (A.numRows() <= 0) { + return; + } + + KokkosBlas::scal(y, beta, y); + + typedef typename AMatrix::execution_space execution_space; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + int team_size = -1; + int vector_length = -1; + int64_t blocks_per_thread = -1; + + // + // Use the controls to allow the user to pass in some tuning + // parameters. + // + if (controls.isParameter("team size")) { + team_size = std::stoi(controls.getParameter("team size")); + } + if (controls.isParameter("vector length")) { + vector_length = std::stoi(controls.getParameter("vector length")); + } + if (controls.isParameter("rows per thread")) { + blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + } + + // + // Use the existing launch parameters routine from SPMV + // + int64_t blocks_per_team = + KokkosSparse::Impl::spmv_launch_parameters( + A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; + + BSR_GEMM_Transpose_Functor func( + alpha, A, x, y, blocks_per_team, useConjugate); + + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bsr_spm_mv", policy, + func); + } else { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = + Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = + Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bsr_spm_mv", policy, + func); + } +} + +/* ******************* */ + +} // namespace Bsr + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + +#endif // KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_IMPL_HPP_ diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 621c52c157..51727441d3 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -65,6 +65,14 @@ struct spmv_bsrmatrix_eti_spec_avail { enum : bool { value = false }; }; +template ::type>::value> +struct spmv_mv_bsrmatrix_eti_spec_avail { + enum : bool { value = false }; +}; + } // namespace Impl } // namespace Experimental } // namespace KokkosSparse @@ -74,6 +82,23 @@ struct spmv_bsrmatrix_eti_spec_avail { MEM_SPACE_TYPE) \ template <> \ struct spmv_bsrmatrix_eti_spec_avail< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE *, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits > { \ + enum : bool { value = true }; \ + }; + +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_AVAIL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template <> \ + struct spmv_mv_bsrmatrix_eti_spec_avail< \ const SCALAR_TYPE, const ORDINAL_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits, const OFFSET_TYPE, \ @@ -87,7 +112,9 @@ struct spmv_bsrmatrix_eti_spec_avail { }; // Include which ETIs are available +#include #include +#include namespace KokkosSparse { namespace Experimental { @@ -96,6 +123,8 @@ namespace Impl { // declaration template ::value, bool eti_spec_avail = spmv_bsrmatrix_eti_spec_avail< AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> struct SPMV_BSRMATRIX { @@ -110,11 +139,33 @@ struct SPMV_BSRMATRIX { const YScalar &beta, const YVector &y); }; +// declaration +template ::type>::value, + bool tpl_spec_avail = spmv_mv_bsrmatrix_tpl_spec_avail< + AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value, + bool eti_spec_avail = spmv_mv_bsrmatrix_eti_spec_avail< + AT, AO, AD, AM, AS, XT, XL, XD, XM, YT, YL, YD, YM>::value> +struct SPMV_MV_BSRMATRIX { + typedef BsrMatrix AMatrix; + typedef Kokkos::View XVector; + typedef Kokkos::View YVector; + typedef typename YVector::non_const_value_type YScalar; + + static void spmv_mv_bsrmatrix( + const KokkosKernels::Experimental::Controls &controls, const char mode[], + const YScalar &alpha, const AMatrix &A, const XVector &x, + const YScalar &beta, const YVector &y); +}; + // actual implementations to be compiled #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY + template -struct SPMV_BSRMATRIX { typedef BsrMatrix AMatrix; typedef Kokkos::View XVector; @@ -122,9 +173,38 @@ struct SPMV_BSRMATRIX +struct SPMV_MV_BSRMATRIX { + typedef BsrMatrix AMatrix; + typedef Kokkos::View XVector; + typedef Kokkos::View YVector; + typedef typename YVector::non_const_value_type YScalar; + + static void spmv_mv_bsrmatrix( + const KokkosKernels::Experimental::Controls &controls, const char mode[], + const YScalar &alpha, const AMatrix &A, const XVector &X, + const YScalar &beta, const YVector &Y) { +#if defined(KOKKOS_ARCH_AMPERE) || defined(KOKKOS_ARCH_VOLTA) // user explicitly requests a particular precision bool requestMixed = false; bool requestDouble = false; @@ -135,6 +215,7 @@ struct SPMV_BSRMATRIX::dispatch(alpha, A, - x, beta, - y); + X, beta, + Y); } else if (requestDouble) { BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, x, beta, y); + 4>::dispatch(alpha, A, X, beta, Y); } else if (operandsHalfHalfFloat) { BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, - x, beta, - y); + X, beta, + Y); } else { BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, x, beta, y); } - #elif defined(KOKKOS_ARCH_VOLTA) /* Volta has float += half * half use it for all matrices @@ -183,32 +263,74 @@ struct SPMV_BSRMATRIX::dispatch(alpha, A, x, - beta, y); + float, 16, 16, 16>::dispatch(alpha, A, X, + beta, Y); (void)requestMixed; // unused -#else - - Kokkos::Impl::throw_runtime_exception( - "KOKKOS_ARCH_VOLTA or KOKKOS_ARCH_AMPERE not defined"); - (void)requestMixed; - (void)requestDouble; - (void)controls; - (void)alpha; - (void)A; - (void)x; - (void)beta; - (void)y; #endif // KOKKOS_ARCH + + if ((mode[0] == KokkosSparse::NoTranspose[0]) || + (mode[0] == KokkosSparse::Conjugate[0])) { + bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); + if (X.extent(1) == 1) { + const auto x0 = Kokkos::subview(X, Kokkos::ALL(), 0); + auto y0 = Kokkos::subview(Y, Kokkos::ALL(), 0); + return Bsr::spMatVec_no_transpose(controls, alpha, A, x0, beta, y0, + useConjugate); + } else { + return Bsr::spMatMultiVec_no_transpose(controls, alpha, A, X, beta, Y, + useConjugate); + } + } else if ((mode[0] == KokkosSparse::Transpose[0]) || + (mode[0] == KokkosSparse::ConjugateTranspose[0])) { + bool useConjugate = (mode[0] == KokkosSparse::ConjugateTranspose[0]); + if (X.extent(1) == 1) { + const auto x0 = Kokkos::subview(X, Kokkos::ALL(), 0); + auto y0 = Kokkos::subview(Y, Kokkos::ALL(), 0); + return Bsr::spMatVec_transpose(controls, alpha, A, x0, beta, y0, + useConjugate); + } else { + return Bsr::spMatMultiVec_transpose(controls, alpha, A, X, beta, Y, + useConjugate); + } + } } }; +template +struct SPMV_MV_BSRMATRIX { + typedef BsrMatrix AMatrix; + typedef Kokkos::View XVector; + typedef Kokkos::View YVector; + typedef typename YVector::non_const_value_type YScalar; + + static void spmv_mv_bsrmatrix( + const KokkosKernels::Experimental::Controls &controls, const char mode[], + const YScalar &alpha, const AMatrix &A, const XVector &X, + const YScalar &beta, const YVector &Y) { + static_assert(std::is_integral::value, + "This implementation is only for integer Scalar types."); + typedef SPMV_BSRMATRIX + impl_type; + KokkosKernels::Experimental::Controls defaultControls; + for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { + auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); + auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); + impl_type::spmv_bsrmatrix(defaultControls, mode, alpha, A, x_j, beta, + y_j); + } + } +}; #endif // !defined(KOKKOSKERNELS_ETI_ONLY) || // KOKKOSKERNELS_IMPL_COMPILE_LIBRARY } // namespace Impl } // namespace Experimental } // namespace KokkosSparse -// declare / instantiate the 2D MV version +// declare / instantiate the vector version // Instantiate with A,x,y are all the requested Scalar type (no instantiation of // mixed-precision operands) #define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_DECL( \ @@ -218,12 +340,12 @@ struct SPMV_BSRMATRIX, \ Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const **, LAYOUT_TYPE, \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits, \ - SCALAR_TYPE **, LAYOUT_TYPE, \ + SCALAR_TYPE *, LAYOUT_TYPE, \ Kokkos::Device, \ - Kokkos::MemoryTraits, true>; + Kokkos::MemoryTraits, false, true>; #define KOKKOSSPARSE_SPMV_BSRMATRIX_ETI_SPEC_INST( \ SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ @@ -232,13 +354,50 @@ struct SPMV_BSRMATRIX, \ Kokkos::MemoryTraits, const OFFSET_TYPE, \ - SCALAR_TYPE const **, LAYOUT_TYPE, \ + SCALAR_TYPE const *, LAYOUT_TYPE, \ Kokkos::Device, \ Kokkos::MemoryTraits, \ - SCALAR_TYPE **, LAYOUT_TYPE, \ + SCALAR_TYPE *, LAYOUT_TYPE, \ Kokkos::Device, \ - Kokkos::MemoryTraits, true>; + Kokkos::MemoryTraits, false, true>; + +// declare / instantiate the 2D MV version +// Instantiate with A,x,y are all the requested Scalar type (no instantiation of +// mixed-precision operands) +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_DECL( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + extern template struct SPMV_MV_BSRMATRIX< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + std::is_integral::type>::value, false, \ + true>; + +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_ETI_SPEC_INST( \ + SCALAR_TYPE, ORDINAL_TYPE, OFFSET_TYPE, LAYOUT_TYPE, EXEC_SPACE_TYPE, \ + MEM_SPACE_TYPE) \ + template struct SPMV_MV_BSRMATRIX< \ + const SCALAR_TYPE, const ORDINAL_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET_TYPE, \ + SCALAR_TYPE const **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR_TYPE **, LAYOUT_TYPE, \ + Kokkos::Device, \ + Kokkos::MemoryTraits, \ + std::is_integral::type>::value, false, \ + true>; +#include #include +#include #endif // KOKKOSSPARSE_IMPL_SPMV_BSRMATRIX_SPEC_HPP_ diff --git a/unit_test/sparse/Test_Sparse.hpp b/unit_test/sparse/Test_Sparse.hpp index d8edb22944..26d2830f90 100644 --- a/unit_test/sparse/Test_Sparse.hpp +++ b/unit_test/sparse/Test_Sparse.hpp @@ -14,6 +14,8 @@ #include "Test_Sparse_spgemm.hpp" #include "Test_Sparse_spiluk.hpp" #include "Test_Sparse_spmv.hpp" +//#include "Test_Sparse_spmv_blockcrs.hpp" +#include "Test_Sparse_spmv_bsr.hpp" #include "Test_Sparse_sptrsv.hpp" #include "Test_Sparse_trsv.hpp" diff --git a/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp new file mode 100644 index 0000000000..7996e9e4e6 --- /dev/null +++ b/unit_test/sparse/Test_Sparse_spmv_blockcrs.hpp @@ -0,0 +1,750 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_BlockCrsMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#include +#include +#include +#include + +#include "KokkosKernels_Controls.hpp" +#include "KokkosKernels_default_types.hpp" + +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; + +namespace Test_BlockCrs { + +/// Random generator +template +inline Scalar random() { + auto const max = static_cast(RAND_MAX) + static_cast(1); + return static_cast(std::rand()) / max; +} + +template +inline void set_random_value(Scalar &v) { + v = random(); +} + +template +inline void set_random_value(Kokkos::complex &v) { + Scalar vre = random(); + Scalar vim = random(); + v = Kokkos::complex(vre, vim); +} + +template +inline void set_random_value(std::complex &v) { + Scalar vre = random(); + Scalar vim = random(); + v = std::complex(vre, vim); +} + +/// \brief Driver routine for checking BlockCrsMatrix times vector +template +void check_blockcrs_times_v(const char fOp[], scalar_t alpha, scalar_t beta, + const lno_t bMax, int &num_errors) { + // The mat_structure view is used to generate a matrix using + // finite difference (FD) or finite element (FE) discretization + // on a cartesian grid. + Kokkos::View mat_structure("Matrix Structure", + 3); + mat_structure(0, 0) = 8; // Request 8 grid point in 'x' direction + mat_structure(0, 1) = 0; // Add BC to the left + mat_structure(0, 2) = 0; // Add BC to the right + mat_structure(1, 0) = 7; // Request 7 grid point in 'y' direction + mat_structure(1, 1) = 0; // Add BC to the bottom + mat_structure(1, 2) = 0; // Add BC to the top + mat_structure(2, 0) = 9; // Request 9 grid point in 'z' direction + mat_structure(2, 1) = 0; // Add BC to the bottom + mat_structure(2, 2) = 0; // Add BC to the top + + typedef + typename KokkosSparse::CrsMatrix + crsMat_t; + typedef typename KokkosSparse::CrsMatrix + h_crsMat_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef scalar_view_t x_vector_type; + typedef scalar_view_t y_vector_type; + + h_crsMat_t mat_b1 = + Test::generate_structured_matrix3D("FD", mat_structure); + + num_errors = 0; + for (lno_t blockSize = 1; blockSize <= bMax; ++blockSize) { + lno_t nRow = blockSize * mat_b1.numRows(); + lno_t nCol = blockSize * mat_b1.numCols(); + size_type nnz = static_cast(blockSize) * + static_cast(blockSize) * mat_b1.nnz(); + + // Fill block with random values + std::vector mat_val(nnz); + for (size_type ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); + + // + // Create graph for CrsMatrix + // + + std::vector mat_rowmap(nRow + 1, 0); + std::vector mat_colidx(nnz, 0); + + for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { + const auto jbeg = mat_b1.graph.row_map(ir); + const auto jend = mat_b1.graph.row_map(ir + 1); + for (lno_t ib = 0; ib < blockSize; ++ib) { + const lno_t my_row = ir * blockSize + ib; + mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; + for (lno_t ijk = jbeg; ijk < jend; ++ijk) { + const auto col0 = mat_b1.graph.entries(ijk); + for (lno_t jb = 0; jb < blockSize; ++jb) { + mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + col0 * blockSize + jb; + } + } + } + } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) + + // Create the CrsMatrix for the reference computation + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], + &mat_colidx[0]); + + x_vector_type xref("new_right_hand_side", nRow); + auto h_xref = Kokkos::create_mirror_view(xref); + for (lno_t ir = 0; ir < nRow; ++ir) { + set_random_value(h_xref(ir)); + } + Kokkos::deep_copy(xref, h_xref); + + y_vector_type y0("y_init", nRow); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir)); + Kokkos::deep_copy(y0, h_y0); + + y_vector_type ycrs("crs_product_result", nRow); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + for (lno_t ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); + Kokkos::deep_copy(ycrs, h_ycrs); + + // Compute the reference product + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + + y_vector_type ybcrs("bsr_product_result", nRow); + auto h_ybcrs = Kokkos::create_mirror_view(ybcrs); + for (lno_t ir = 0; ir < nRow; ++ir) h_ybcrs(ir) = h_y0(ir); + Kokkos::deep_copy(ybcrs, h_ybcrs); + + // Create the BlockCrsMatrix + KokkosSparse::Experimental::BlockCrsMatrix + Absr(Acrs, blockSize); + + // Compute the product with the BlockCrsMatrix format + KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybcrs); + + // Compare the two products + double error = 0.0, maxNorm = 0.0; + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_ybcrs, ybcrs); + for (lno_t ir = 0; ir < nRow; ++ir) { + error = std::max( + error, Kokkos::ArithTraits::abs(h_ycrs(ir) - h_ybcrs(ir))); + maxNorm = std::max( + maxNorm, Kokkos::ArithTraits::abs(h_ycrs(ir))); + } + + double tmps = + static_cast(Kokkos::ArithTraits::abs(alpha)) + + static_cast(Kokkos::ArithTraits::abs(beta)); + if ((tmps > 0.0) && (maxNorm == 0)) { + std::cout << " BlockCRSMatrix - SpMV times V >> blockSize " << blockSize + << " maxNorm " << maxNorm << " error " << error << " alpha " + << alpha << " beta " << beta << "\n"; + num_errors += 1; + } + + // + // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row + // + const auto tol = ((nnz / nRow) + 1) * + static_cast(Kokkos::ArithTraits::abs( + Kokkos::ArithTraits::epsilon())); + if (error > tol * maxNorm) { + std::cout << " BlockCRSMatrix - SpMV times V >> blockSize " << blockSize + << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " + << maxNorm << " alpha " << alpha << " beta " << beta << "\n"; + num_errors += 1; + } + + } // for (int blockSize = 1; blockSize <= bMax; ++blockSize) +} + +/// \brief Driver routine for checking BlockCrsMatrix times multiple vector +template +void check_blockcrs_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, + const lno_t bMax, int &num_errors) { + // The mat_structure view is used to generate a matrix using + // finite difference (FD) or finite element (FE) discretization + // on a cartesian grid. + Kokkos::View mat_structure("Matrix Structure", + 3); + mat_structure(0, 0) = 7; // Request 7 grid point in 'x' direction + mat_structure(0, 1) = 0; // Add BC to the left + mat_structure(0, 2) = 0; // Add BC to the right + mat_structure(1, 0) = 5; // Request 11 grid point in 'y' direction + mat_structure(1, 1) = 0; // Add BC to the bottom + mat_structure(1, 2) = 0; // Add BC to the top + mat_structure(2, 0) = 9; // Request 13 grid point in 'y' direction + mat_structure(2, 1) = 0; // Add BC to the bottom + mat_structure(2, 2) = 0; // Add BC to the top + + typedef typename KokkosSparse::CrsMatrix + h_crsMat_t; + typedef + typename KokkosSparse::CrsMatrix + crsMat_t; + typedef Kokkos::View block_vector_t; + + h_crsMat_t mat_b1 = + Test::generate_structured_matrix3D("FD", mat_structure); + + num_errors = 0; + const int nrhs = 5; + + for (lno_t blockSize = 1; blockSize <= bMax; ++blockSize) { + // + // Fill blocks with random values + // + + lno_t nRow = blockSize * mat_b1.numRows(); + lno_t nCol = blockSize * mat_b1.numCols(); + size_type nnz = static_cast(blockSize) * + static_cast(blockSize) * mat_b1.nnz(); + + std::vector mat_val(nnz); + for (size_type ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); + + // + // Create graph for CrsMatrix + // + + std::vector mat_rowmap(nRow + 1); + std::vector mat_colidx(nnz); + + mat_rowmap.resize(nRow + 1); + auto *rowmap = &mat_rowmap[0]; + rowmap[0] = 0; + + mat_colidx.resize(nnz); + auto *cols = &mat_colidx[0]; + + for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { + const auto jbeg = mat_b1.graph.row_map(ir); + const auto jend = mat_b1.graph.row_map(ir + 1); + for (lno_t ib = 0; ib < blockSize; ++ib) { + const lno_t my_row = ir * blockSize + ib; + rowmap[my_row + 1] = rowmap[my_row] + (jend - jbeg) * blockSize; + for (lno_t ijk = jbeg; ijk < jend; ++ijk) { + const auto col0 = mat_b1.graph.entries(ijk); + for (lno_t jb = 0; jb < blockSize; ++jb) { + cols[rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + col0 * blockSize + jb; + } + } + } + } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) + + // Create the CrsMatrix for the reference computation + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], rowmap, cols); + + block_vector_t xref("new_right_hand_side", nRow, nrhs); + auto h_xref = Kokkos::create_mirror_view(xref); + for (int jc = 0; jc < nrhs; ++jc) + for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_xref(ir, jc)); + Kokkos::deep_copy(xref, h_xref); + + block_vector_t y0("y_init", nRow, nrhs); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (int jc = 0; jc < nrhs; ++jc) + for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir, jc)); + Kokkos::deep_copy(y0, h_y0); + + block_vector_t ycrs("crs_product_result", nRow, nrhs); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + for (int jc = 0; jc < nrhs; ++jc) + for (lno_t ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(ycrs, h_ycrs); + + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + + block_vector_t ybcrs("bsr_product_result", nRow, nrhs); + auto h_ybcrs = Kokkos::create_mirror_view(ybcrs); + for (int jc = 0; jc < nrhs; ++jc) + for (lno_t ir = 0; ir < nRow; ++ir) h_ybcrs(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(ybcrs, h_ybcrs); + + // Create the BlockCrsMatrix + KokkosSparse::Experimental::BlockCrsMatrix + Absr(Acrs, blockSize); + + // Compute the product for the BlockCrsMatrix format + KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybcrs); + + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_ybcrs, ybcrs); + + // Compare the two products + double error = 0.0, maxNorm = 0.0; + for (int jc = 0; jc < nrhs; ++jc) { + for (int ir = 0; ir < nRow; ++ir) { + error = std::max(error, Kokkos::ArithTraits::abs( + h_ycrs(ir, jc) - h_ybcrs(ir, jc))); + maxNorm = std::max( + maxNorm, Kokkos::ArithTraits::abs(h_ycrs(ir, jc))); + } + } + auto tol = ((nnz / nRow) + 1) * + static_cast(Kokkos::ArithTraits::abs( + Kokkos::ArithTraits::epsilon())); + + double tmps = + static_cast(Kokkos::ArithTraits::abs(alpha)) + + static_cast(Kokkos::ArithTraits::abs(beta)); + if ((tmps > 0.0) && (maxNorm == 0)) { + std::cout << " BlockCRSMatrix - SpMV times MV >> blockSize " << blockSize + << " maxNorm " << maxNorm << " error " << error << " alpha " + << alpha << " beta " << beta << "\n"; + num_errors += 1; + } + + if (error > tol * maxNorm) { + std::cout << " BlockCRSMatrix - SpMV times MV >> blockSize " << blockSize + << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " + << maxNorm << " alpha " << alpha << " beta " << beta << "\n"; + num_errors += 1; + } + + } // for (int blockSize = 1; blockSize <= bMax; ++blockSize) +} + +} // namespace Test_BlockCrs + +template +void testSpMVBlockCrsMatrix() { + // + // Test for the operation y <- alpha * Op(A) * x + beta * y + // + + // Define the function Op: Op(A) = A, Op(A) = conj(A), Op(A) = A^T, Op(A) = + // A^H + std::vector modes = {'N', 'C', 'T', 'H'}; + + // Define a set of pairs (alpha, beta) + std::vector testAlphaBeta = {0.0, 0.0, -1.0, 0.0, + 0.0, 1.0, 3.1, -2.5}; + + // + // Set the largest block size for the block matrix + // The code will create matrices with block sizes 1, .., bMax + // + const lno_t bMax = 13; + + //--- Test single vector case + for (const auto mode : modes) { + int num_errors = 0; + for (size_t ii = 0; ii < testAlphaBeta.size(); ii += 2) { + auto alpha_s = static_cast(testAlphaBeta[ii]); + auto beta_s = static_cast(testAlphaBeta[ii + 1]); + num_errors = 0; + Test_BlockCrs::check_blockcrs_times_v( + &mode, alpha_s, beta_s, bMax, num_errors); + if (num_errors > 0) { + printf( + "KokkosSparse::Test::spmv_blockcrs: %i errors of %i with params: " + "%c %lf %lf\n", + num_errors, bMax, mode, Kokkos::ArithTraits::abs(alpha_s), + Kokkos::ArithTraits::abs(beta_s)); + } + EXPECT_TRUE(num_errors == 0); + } + } +} + +template +void testBlockCrsMatrix_SpM_MV() { + // + // Test for the operation Y <- alpha * Op(A) * X + beta * Y + // + + // Define the function Op: Op(A) = A, Op(A) = conj(A), Op(A) = A^T, Op(A) = + // A^H + std::vector modes = {'N', 'C', 'T', 'H'}; + + // Define a set of pairs (alpha, beta) + std::vector testAlphaBeta = {0.0, 0.0, -1.0, 0.0, + 0.0, 1.0, 3.1, -2.5}; + + // + // Set the largest block size for the block matrix + // The code will create matrices with block sizes 1, .., bMax + // + const lno_t bMax = 13; + + //--- Test multiple vector case + for (auto mode : modes) { + int num_errors = 0; + for (size_t ii = 0; ii < testAlphaBeta.size(); ii += 2) { + auto alpha_s = static_cast(testAlphaBeta[ii]); + auto beta_s = static_cast(testAlphaBeta[ii + 1]); + num_errors = 0; + Test_BlockCrs::check_blockcrs_times_mv(&mode, alpha_s, beta_s, + bMax, num_errors); + if (num_errors > 0) { + printf( + "KokkosSparse::Test::spm_mv_blockcrs: %i errors of %i with params: " + "%c %lf %lf\n", + num_errors, bMax, mode, Kokkos::ArithTraits::abs(alpha_s), + Kokkos::ArithTraits::abs(beta_s)); + } + EXPECT_TRUE(num_errors == 0); + } + } +} + +////////////////////////// + +#define EXECUTE_BCRS_TIMES_VEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bcrs_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + testSpMVBlockCrsMatrix(); \ + } + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(double, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(float, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_double, int64_t, size_t, + TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_VEC_TEST(kokkos_complex_float, int64_t, size_t, + TestExecSpace) +#endif + +#undef EXECUTE_BCRS_TIMES_VEC_TEST + +////////////////////////// + +#define EXECUTE_BCRS_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bcrs_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + testBlockCrsMatrix_SpM_MV(); \ + } + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(double, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(float, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, + TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BCRS_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, + TestExecSpace) +#endif + +#undef EXECUTE_BCRS_TIMES_MVEC_TEST diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp new file mode 100644 index 0000000000..1b50a34269 --- /dev/null +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -0,0 +1,855 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include +#include "KokkosSparse_spmv.hpp" +#include "KokkosSparse_BsrMatrix.hpp" +#include "KokkosSparse_CrsMatrix.hpp" + +#include +#include +#include +#include + +#include "KokkosKernels_Controls.hpp" +#include "KokkosKernels_default_types.hpp" + +typedef Kokkos::complex kokkos_complex_double; +typedef Kokkos::complex kokkos_complex_float; + +namespace Test_Bsr { + +/// Random generator +template +inline Scalar random() { + auto const max = static_cast(RAND_MAX) + static_cast(1); + return static_cast(std::rand()) / max; +} + +template +inline void set_random_value(Scalar &v) { + v = random(); +} + +template +inline void set_random_value(Kokkos::complex &v) { + Scalar vre = random(); + Scalar vim = random(); + v = Kokkos::complex(vre, vim); +} + +template +inline void set_random_value(std::complex &v) { + Scalar vre = random(); + Scalar vim = random(); + v = std::complex(vre, vim); +} + +/// \brief Routine to make CRS-style entries of the block matrix +/// +/// \tparam scalar_t Template type for the numerical values +/// \param mat_b1 Sparse matrix whose graph will be used +/// \param blockSize Block size for each entries +/// \param mat_rowmap[out] CRS-style row map for the block matrix +/// \param mat_colidx[out] CRS-style column entries for the block matrix +/// \param mat_val[out] Numerical (random) values +template +void make_block_entries( + const KokkosSparse::CrsMatrix &mat_b1, + int blockSize, std::vector &mat_rowmap, + std::vector &mat_colidx, std::vector &mat_val) { + lno_t nRow = blockSize * mat_b1.numRows(); + lno_t nCol = blockSize * mat_b1.numCols(); + size_t nnz = static_cast(blockSize) * static_cast(blockSize) * + mat_b1.nnz(); + + mat_val.resize(nnz); + for (size_t ii = 0; ii < nnz; ++ii) set_random_value(mat_val[ii]); + + // + // Create graph for CrsMatrix + // + + mat_rowmap.assign(nRow + 1, 0); + mat_colidx.assign(nnz, 0); + + for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) { + const auto jbeg = mat_b1.graph.row_map(ir); + const auto jend = mat_b1.graph.row_map(ir + 1); + for (lno_t ib = 0; ib < blockSize; ++ib) { + const lno_t my_row = ir * blockSize + ib; + mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; + for (lno_t ijk = jbeg; ijk < jend; ++ijk) { + const auto col0 = mat_b1.graph.entries(ijk); + for (lno_t jb = 0; jb < blockSize; ++jb) { + mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = + col0 * blockSize + jb; + } + } + } + } // for (lno_t ir = 0; ir < mat_b1.numRows(); ++ir) +} + +/// \brief Driver routine for checking BsrMatrix times vector +template +void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, + const lno_t bMax, int &num_errors) { + // The mat_structure view is used to generate a matrix using + // finite difference (FD) or finite element (FE) discretization + // on a cartesian grid. + Kokkos::View mat_structure("Matrix Structure", + 3); + mat_structure(0, 0) = 8; // Request 8 grid point in 'x' direction + mat_structure(0, 1) = 0; // Add BC to the left + mat_structure(0, 2) = 0; // Add BC to the right + mat_structure(1, 0) = 7; // Request 7 grid point in 'y' direction + mat_structure(1, 1) = 0; // Add BC to the bottom + mat_structure(1, 2) = 0; // Add BC to the top + mat_structure(2, 0) = 9; // Request 9 grid point in 'z' direction + mat_structure(2, 1) = 0; // Add BC to the bottom + mat_structure(2, 2) = 0; // Add BC to the top + + typedef + typename KokkosSparse::CrsMatrix + crsMat_t; + typedef typename KokkosSparse::CrsMatrix + h_crsMat_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef scalar_view_t x_vector_type; + typedef scalar_view_t y_vector_type; + + h_crsMat_t mat_b1 = + Test::generate_structured_matrix3D("FD", mat_structure); + + num_errors = 0; + for (lno_t blockSize = 1; blockSize <= bMax; ++blockSize) { + // + // Fill blocks with random values + // + + lno_t nRow = blockSize * mat_b1.numRows(); + lno_t nCol = blockSize * mat_b1.numCols(); + size_type nnz = static_cast(blockSize) * + static_cast(blockSize) * mat_b1.nnz(); + + std::vector mat_rowmap(nRow + 1, 0); + std::vector mat_colidx(nnz, 0); + std::vector mat_val(nnz); + + // Create the entries + make_block_entries(mat_b1, blockSize, mat_rowmap, + mat_colidx, mat_val); + + // Create the CrsMatrix for the reference computation + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], + &mat_colidx[0]); + + x_vector_type xref("new_right_hand_side", nRow); + auto h_xref = Kokkos::create_mirror_view(xref); + for (lno_t ir = 0; ir < nRow; ++ir) { + set_random_value(h_xref(ir)); + } + Kokkos::deep_copy(xref, h_xref); + + y_vector_type y0("y_init", nRow); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir)); + Kokkos::deep_copy(y0, h_y0); + + y_vector_type ycrs("crs_product_result", nRow); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + for (lno_t ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); + Kokkos::deep_copy(ycrs, h_ycrs); + + // + // Make reference computation with a CrsMatrix variable + // + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + + y_vector_type ybsr("bsr_product_result", nRow); + auto h_ybsr = Kokkos::create_mirror_view(ybsr); + for (lno_t ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); + Kokkos::deep_copy(ybsr, h_ybsr); + + // Create the BsrMatrix for the check test + KokkosSparse::Experimental::BsrMatrix + Absr(Acrs, blockSize); + + // + // Make computation with the BsrMatrix format + // + KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybsr); + + // + // Compare the two products + // + double error = 0.0, maxNorm = 0.0; + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_ybsr, ybsr); + for (lno_t ir = 0; ir < nRow; ++ir) { + error = std::max( + error, Kokkos::ArithTraits::abs(h_ycrs(ir) - h_ybsr(ir))); + maxNorm = std::max( + maxNorm, Kokkos::ArithTraits::abs(h_ycrs(ir))); + } + + double tmps = + static_cast(Kokkos::ArithTraits::abs(alpha)) + + static_cast(Kokkos::ArithTraits::abs(beta)); + if ((tmps > 0.0) && (maxNorm == 0)) { + std::cout << " BSR - SpMV times MV >> blockSize " << blockSize + << " maxNorm " << maxNorm << " error " << error << " alpha " + << alpha << " beta " << beta << "\n"; + num_errors += 1; + } + + // + // --- Factor ((nnz / nRow) + 1) = Average number of non-zeros per row + // + const auto tol = ((nnz / nRow) + 1) * + static_cast(Kokkos::ArithTraits::abs( + Kokkos::ArithTraits::epsilon())); + if (error > tol * maxNorm) { + std::cout << " BSR - SpMV times V >> blockSize " << blockSize << " ratio " + << error / maxNorm << " tol " << tol << " maxNorm " << maxNorm + << " alpha " << alpha << " beta " << beta << "\n"; + num_errors += 1; + } + + } // for (int blockSize = 1; blockSize <= bMax; ++blockSize) +} + +/// \brief Driver routine for checking BsrMatrix times multiple vector +template +void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, + const lno_t bMax, int &num_errors) { + // The mat_structure view is used to generate a matrix using + // finite difference (FD) or finite element (FE) discretization + // on a cartesian grid. + Kokkos::View mat_structure("Matrix Structure", + 3); + mat_structure(0, 0) = 7; // Request 7 grid point in 'x' direction + mat_structure(0, 1) = 0; // Add BC to the left + mat_structure(0, 2) = 0; // Add BC to the right + mat_structure(1, 0) = 5; // Request 11 grid point in 'y' direction + mat_structure(1, 1) = 0; // Add BC to the bottom + mat_structure(1, 2) = 0; // Add BC to the top + mat_structure(2, 0) = 9; // Request 13 grid point in 'y' direction + mat_structure(2, 1) = 0; // Add BC to the bottom + mat_structure(2, 2) = 0; // Add BC to the top + + typedef typename KokkosSparse::CrsMatrix + h_crsMat_t; + typedef + typename KokkosSparse::CrsMatrix + crsMat_t; + typedef Kokkos::View block_vector_t; + + h_crsMat_t mat_b1 = + Test::generate_structured_matrix3D("FD", mat_structure); + + num_errors = 0; + const int nrhs = 5; + + for (lno_t blockSize = 1; blockSize <= bMax; ++blockSize) { + // + // Fill blocks with random values + // + + lno_t nRow = blockSize * mat_b1.numRows(); + lno_t nCol = blockSize * mat_b1.numCols(); + size_type nnz = static_cast(blockSize) * + static_cast(blockSize) * mat_b1.nnz(); + + std::vector mat_rowmap(nRow + 1, 0); + std::vector mat_colidx(nnz, 0); + std::vector mat_val(nnz); + + // Create the entries + make_block_entries(mat_b1, static_cast(blockSize), + mat_rowmap, mat_colidx, mat_val); + + // Create the CrsMatrix for the reference computation + crsMat_t Acrs("new_crs_matr", nRow, nCol, nnz, &mat_val[0], &mat_rowmap[0], + &mat_colidx[0]); + + block_vector_t xref("new_right_hand_side", nRow, nrhs); + auto h_xref = Kokkos::create_mirror_view(xref); + for (int jc = 0; jc < nrhs; ++jc) + for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_xref(ir, jc)); + Kokkos::deep_copy(xref, h_xref); + + block_vector_t y0("y_init", nRow, nrhs); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (int jc = 0; jc < nrhs; ++jc) + for (lno_t ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir, jc)); + Kokkos::deep_copy(y0, h_y0); + + block_vector_t ycrs("crs_product_result", nRow, nrhs); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + for (int jc = 0; jc < nrhs; ++jc) + for (lno_t ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(ycrs, h_ycrs); + + // + // Compute the reference product with a CrsMatrix variable + // + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + + block_vector_t ybsr("bsr_product_result", nRow, nrhs); + auto h_ybsr = Kokkos::create_mirror_view(ybsr); + for (int jc = 0; jc < nrhs; ++jc) + for (lno_t ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(ybsr, h_ybsr); + + // Create the BsrMatrix for the check test + KokkosSparse::Experimental::BsrMatrix + Absr(Acrs, blockSize); + + // + // Compute the product with the BsrMatrix format + // + KokkosSparse::spmv(fOp, alpha, Absr, xref, beta, ybsr); + + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_ybsr, ybsr); + + // + // Compare the two products + // + double error = 0.0, maxNorm = 0.0; + for (int jc = 0; jc < nrhs; ++jc) { + for (int ir = 0; ir < nRow; ++ir) { + error = std::max(error, Kokkos::ArithTraits::abs( + h_ycrs(ir, jc) - h_ybsr(ir, jc))); + maxNorm = std::max( + maxNorm, Kokkos::ArithTraits::abs(h_ycrs(ir, jc))); + } + } + + double tmps = + static_cast(Kokkos::ArithTraits::abs(alpha)) + + static_cast(Kokkos::ArithTraits::abs(beta)); + if ((tmps > 0.0) && (maxNorm == 0)) { + std::cout << " BSR - SpMV times MV >> blockSize " << blockSize + << " maxNorm " << maxNorm << " error " << error << " alpha " + << alpha << " beta " << beta << "\n"; + num_errors += 1; + } + + auto tol = ((nnz / nRow) + 1) * + static_cast(Kokkos::ArithTraits::abs( + Kokkos::ArithTraits::epsilon())); + if (error > tol * maxNorm) { + std::cout << " BSR - SpMV times MV >> blockSize " << blockSize + << " ratio " << error / maxNorm << " tol " << tol << " maxNorm " + << maxNorm << " alpha " << alpha << " beta " << beta << "\n"; + num_errors += 1; + } + + } // for (int blockSize = 1; blockSize <= bMax; ++blockSize) +} + +} // namespace Test_Bsr + +template +void testSpMVBsrMatrix() { + // + // Check a few corner cases + // + + // 0 x 0 case + { + typedef + typename KokkosSparse::Experimental::BsrMatrix + bsrMat_t; + bsrMat_t Absr("empty", 0, 0, 0, nullptr, nullptr, nullptr, 1); + typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; + typedef scalar_view_t x_vector_type; + typedef scalar_view_t y_vector_type; + x_vector_type x("corner-case-x", Absr.numCols()); + y_vector_type y("corner-case-y", Absr.numRows()); + Kokkos::deep_copy(y, static_cast(0)); + scalar_t alpha = static_cast(1); + scalar_t beta = static_cast(1); + const char fOp = 'N'; + int num_errors = 0; + try { + KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); + Kokkos::fence(); + } catch (std::exception &e) { + num_errors += 1; + std::cout << e.what(); + } + EXPECT_TRUE(num_errors == 0); + } + + // 0 x 1 case + { + typedef + typename KokkosSparse::Experimental::BsrMatrix + bsrMat_t; + bsrMat_t Absr("empty", 0, 1, 0, nullptr, nullptr, nullptr, 1); + typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; + typedef scalar_view_t x_vector_type; + typedef scalar_view_t y_vector_type; + x_vector_type x("corner-case-x", Absr.numCols()); + y_vector_type y("corner-case-y", Absr.numRows()); + Kokkos::deep_copy(y, static_cast(0)); + scalar_t alpha = static_cast(1); + scalar_t beta = static_cast(1); + const char fOp = 'N'; + int num_errors = 0; + try { + KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); + Kokkos::fence(); + } catch (std::exception &e) { + num_errors += 1; + std::cout << e.what(); + } + EXPECT_TRUE(num_errors == 0); + } + + // 1 x 0 case + { + typedef + typename KokkosSparse::Experimental::BsrMatrix + bsrMat_t; + bsrMat_t Absr("empty", 1, 0, 0, nullptr, nullptr, nullptr, 1); + typedef typename bsrMat_t::values_type::non_const_type scalar_view_t; + typedef scalar_view_t x_vector_type; + typedef scalar_view_t y_vector_type; + x_vector_type x("corner-case-x", Absr.numCols()); + y_vector_type y("corner-case-y", Absr.numRows()); + Kokkos::deep_copy(y, static_cast(0)); + scalar_t alpha = static_cast(1); + scalar_t beta = static_cast(1); + const char fOp = 'N'; + int num_errors = 0; + try { + KokkosSparse::spmv(&fOp, alpha, Absr, x, beta, y); + Kokkos::fence(); + } catch (std::exception &e) { + num_errors += 1; + std::cout << e.what(); + } + EXPECT_TRUE(num_errors == 0); + } + + // + // Test for the operation y <- alpha * Op(A) * x + beta * y + // + + // Define the function Op: Op(A) = A, Op(A) = conj(A), Op(A) = A^T, Op(A) = + // A^H + std::vector modes = {'N', 'C', 'T', 'H'}; + + // Define a set of pairs (alpha, beta) + std::vector testAlphaBeta = {0.0, 0.0, -1.0, 0.0, + 0.0, 1.0, 3.1, -2.5}; + + // + // Set the largest block size for the block matrix + // The code will create matrices with block sizes 1, .., bMax + // + constexpr lno_t bMax = 13; + + // + //--- Test single vector case + // + for (const auto mode : modes) { + int num_errors = 0; + for (size_t ii = 0; ii < testAlphaBeta.size(); ii += 2) { + auto alpha_s = static_cast(testAlphaBeta[ii]); + auto beta_s = static_cast(testAlphaBeta[ii + 1]); + num_errors = 0; + Test_Bsr::check_bsrm_times_v( + &mode, alpha_s, beta_s, bMax, num_errors); + if (num_errors > 0) { + printf( + "KokkosSparse::Test::spmv_bsr: %i errors of %i with params: " + "%c %lf %lf\n", + num_errors, bMax, mode, Kokkos::ArithTraits::abs(alpha_s), + Kokkos::ArithTraits::abs(beta_s)); + } + EXPECT_TRUE(num_errors == 0); + } + } +} + +template +void testBsrMatrix_SpM_MV() { + // + // Test for the operation Y <- alpha * Op(A) * X + beta * Y + // + + // Define the function Op: Op(A) = A, Op(A) = conj(A), Op(A) = A^T, Op(A) = + // A^H + std::vector modes = {'N', 'C', 'T', 'H'}; + + // Define a set of pairs (alpha, beta) + std::vector testAlphaBeta = {0.0, 0.0, -1.0, 0.0, + 0.0, 1.0, 3.1, -2.5}; + + // + // Set the largest block size for the block matrix + // The code will create matrices with block sizes 1, .., bMax + // + const lno_t bMax = 13; + + //--- Test multiple vector case + for (auto mode : modes) { + int num_errors = 0; + for (size_t ii = 0; ii < testAlphaBeta.size(); ii += 2) { + auto alpha_s = static_cast(testAlphaBeta[ii]); + auto beta_s = static_cast(testAlphaBeta[ii + 1]); + num_errors = 0; + Test_Bsr::check_bsrm_times_mv( + &mode, alpha_s, beta_s, bMax, num_errors); + if (num_errors > 0) { + printf( + "KokkosSparse::Test::spm_mv_bsr: %i errors of %i with params: " + "%c %lf %lf\n", + num_errors, bMax, mode, Kokkos::ArithTraits::abs(alpha_s), + Kokkos::ArithTraits::abs(beta_s)); + } + EXPECT_TRUE(num_errors == 0); + } + } +} + +////////////////////////// + +#define EXECUTE_BSR_TIMES_VEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bsrmat_times_vec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + testSpMVBsrMatrix(); \ + } + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(double, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(float, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_double, int64_t, size_t, + TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_VEC_TEST(kokkos_complex_float, int64_t, size_t, TestExecSpace) +#endif + +#undef EXECUTE_BSR_TIMES_VEC_TEST + +////////////////////////// + +#define EXECUTE_BSR_TIMES_MVEC_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##bsrmat_times_multivec##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + testBsrMatrix_SpM_MV(); \ + } + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_DOUBLE) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(double, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_FLOAT) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(float, int64_t, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_DOUBLE_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_double, int64_t, size_t, + TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_INT)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, int, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int, size_t, TestExecSpace) +#endif + +#if (defined(KOKKOSKERNELS_INST_KOKKOS_COMPLEX_FLOAT_) && \ + defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) && \ + defined(KOKKOSKERNELS_INST_OFFSET_SIZE_T)) || \ + (!defined(KOKKOSKERNELS_ETI_ONLY) && \ + !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) +EXECUTE_BSR_TIMES_MVEC_TEST(kokkos_complex_float, int64_t, size_t, + TestExecSpace) +#endif + +#undef EXECUTE_BSR_TIMES_MVEC_TEST From 0dac99b0be22a0b6eb2d5e76e00887a4f40c4512 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 21 Dec 2021 14:08:49 -0700 Subject: [PATCH 02/15] Improve the Mat-Vec without transpose for BlockCrsMatrix. --- src/sparse/KokkosSparse_BlockCrsMatrix.hpp | 14 + src/sparse/KokkosSparse_spmv.hpp | 42 +-- .../KokkosSparse_spmv_blockcrsmatrix_impl.hpp | 298 ++++++++++++++++++ .../KokkosSparse_spmv_blockcrsmatrix_spec.hpp | 65 ++-- 4 files changed, 360 insertions(+), 59 deletions(-) diff --git a/src/sparse/KokkosSparse_BlockCrsMatrix.hpp b/src/sparse/KokkosSparse_BlockCrsMatrix.hpp index 3985d2a45d..42532ee989 100644 --- a/src/sparse/KokkosSparse_BlockCrsMatrix.hpp +++ b/src/sparse/KokkosSparse_BlockCrsMatrix.hpp @@ -202,6 +202,13 @@ struct SparseBlockRowView { return values_[K * blockDim_ + i * length * blockDim_ + j]; } + /// \brief Return the block column index for a specified block K + /// + /// \param K [in] must be the LOCAL block index within this block-row + /// \return Block column index for "uncompressed" block row + KOKKOS_INLINE_FUNCTION + ordinal_type block_colidx(const ordinal_type K) const { return colidx_[K]; } + /// \brief Return unmanaged 2D strided View wrapping local block K from this /// block-row \param K [in] must be the LOCAL block index within this /// block-row @@ -361,6 +368,13 @@ struct SparseBlockRowViewConst { return values_[K * blockDim_ + i * length * blockDim_ + j]; } + /// \brief Return the block column index for a specified block K + /// + /// \param K [in] must be the LOCAL block index within this block-row + /// \return Block column index for "uncompressed" block row + KOKKOS_INLINE_FUNCTION + ordinal_type block_colidx(const ordinal_type K) const { return colidx_[K]; } + /// \brief Return unmanaged 2D strided View wrapping local block K from this /// block-row \param K [in] must be the LOCAL block index within this /// block-row diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index 9837c9fbe3..cc945c145b 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -314,24 +314,30 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], XVector_Internal x_i(x); YVector_Internal y_i(y); - return Experimental::Impl::SPMV_BLOCKCRSMATRIX< - typename AMatrix_Internal::value_type, - typename AMatrix_Internal::ordinal_type, - typename AMatrix_Internal::device_type, - typename AMatrix_Internal::memory_traits, - typename AMatrix_Internal::size_type, - typename XVector_Internal::value_type*, - typename XVector_Internal::array_layout, - typename XVector_Internal::device_type, - typename XVector_Internal::memory_traits, - typename YVector_Internal::value_type*, - typename YVector_Internal::array_layout, - typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_blockcrsmatrix(controls, - mode, - alpha, A_i, - x_i, beta, - y_i); +#define __SPMV_TYPES__ \ + typename AMatrix_Internal::const_value_type, \ + typename AMatrix_Internal::const_ordinal_type, \ + typename AMatrix_Internal::device_type, \ + typename AMatrix_Internal::memory_traits, \ + typename AMatrix_Internal::const_size_type, \ + typename XVector_Internal::const_value_type*, \ + typename XVector_Internal::array_layout, \ + typename XVector_Internal::device_type, \ + typename XVector_Internal::memory_traits, \ + typename YVector_Internal::value_type*, \ + typename YVector_Internal::array_layout, \ + typename YVector_Internal::device_type, \ + typename YVector_Internal::memory_traits + + constexpr bool eti_spec_avail = + KokkosSparse::Experimental::Impl::spmv_blockcrsmatrix_eti_spec_avail< + __SPMV_TYPES__>::value; + + Experimental::Impl::SPMV_BLOCKCRSMATRIX< + __SPMV_TYPES__, eti_spec_avail>::spmv_blockcrsmatrix(controls, mode, + alpha, A_i, x_i, + beta, y_i); +#undef __SPMV_TYPES__ } template +struct BCRS_GEMV_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_value_type value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef Kokkos::Details::ArithTraits ATV; + + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename AMatrix::non_const_size_type size_type; + + const value_type alpha; + AMatrix m_A; + XVector m_x; + YVector m_y; + + const ordinal_type block_dim; + const ordinal_type blocks_per_team; + + bool conjugate = false; + + BCRS_GEMV_Functor(const value_type alpha_, const AMatrix m_A_, + const XVector m_x_, const YVector m_y_, + const int blocks_per_team_, bool conj_) + : alpha(alpha_), + m_A(m_A_), + m_x(m_x_), + m_y(m_y_), + block_dim(m_A_.blockDim()), + blocks_per_team(blocks_per_team_), + conjugate(conj_) { + static_assert(static_cast(XVector::rank) == 1, + "XVector must be a rank 1 View."); + static_assert(static_cast(YVector::rank) == 1, + "YVector must be a rank 1 View."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type iBlock) const { + const auto ystart = iBlock * block_dim; + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto beta1 = static_cast(1); + // + if (conjugate) { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto xstart = row.block_colidx(ic) * block_dim; + for (ordinal_type ii = 0; ii < block_dim; ++ii) { + value_type t(0); + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + const auto aval = + Kokkos::ArithTraits::conj(Aview(ii, jj)); + t += aval * m_x(xstart + jj); + } + m_y(ystart + ii) += alpha * t; + } + } + } else { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto xstart = row.block_colidx(ic) * block_dim; + KokkosBatched::SerialGemvInternal:: + invoke( + block_dim, block_dim, alpha, Aview.data(), Aview.stride_0(), + Aview.stride_1(), &m_x(xstart), m_x.stride_0(), beta1, + &m_y(ystart), m_y.stride_0()); + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &dev) const { + using y_value_type = typename YVector::non_const_value_type; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(dev, 0, blocks_per_team), + [&](const ordinal_type &loop) { + const ordinal_type iBlock = + static_cast(dev.league_rank()) * blocks_per_team + + loop; + if (iBlock >= m_A.numRows()) { + return; + } + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + // + auto yview = Kokkos::subview( + m_y, Kokkos::make_pair(iBlock * block_dim, + iBlock * block_dim + block_dim)); + // + for (ordinal_type ir = 0; ir < block_dim; ++ir) { + y_value_type sum = 0; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(dev, count), + [&](const ordinal_type &iEntry, y_value_type &lsum) { + const auto start_col = row.block_colidx(iEntry) * block_dim; + for (ordinal_type jr = 0; jr < block_dim; ++jr) { + const value_type val = + conjugate + ? ATV::conj(row.local_block_value(iEntry, ir, jr)) + : row.local_block_value(iEntry, ir, jr); + lsum += val * m_x(start_col + jr); + } + }, + sum); + + Kokkos::single(Kokkos::PerThread(dev), [&]() { + sum *= alpha; + yview(ir) += sum; + }); + } + }); + } +}; + +/* ******************* */ + +// +// spMatVec_no_transpose: version for CPU execution spaces +// (RangePolicy or trivial serial impl used) +// +template ()>::type * = nullptr> +void spMatVec_no_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + + // + // Treat the case y <- alpha * A * x + beta * y + // + + typedef KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + + BCRS_GEMV_Functor func(alpha, A, x, y, 1, + useConjugate); + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::parallel_for( + "KokkosSparse::bcrs_spmv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } else { + Kokkos::parallel_for( + "KokkosSparse::bcrs_spmv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } +} + +/* ******************* */ + +// +// spMatVec_no_transpose: version for GPU execution spaces (TeamPolicy used) +// +template ()>::type * = nullptr> +void spMatVec_no_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + if (A.numRows() <= static_cast(0)) { + return; + } + + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor updates y (by adding alpha Op(A) x). + KokkosBlas::scal(y, beta, y); + + typedef KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + typedef typename AMatrix_Internal::execution_space execution_space; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + int team_size = -1; + int vector_length = -1; + int64_t blocks_per_thread = -1; + + // + // Use the controls to allow the user to pass in some tuning parameters. + // + if (controls.isParameter("team size")) { + team_size = std::stoi(controls.getParameter("team size")); + } + if (controls.isParameter("vector length")) { + vector_length = std::stoi(controls.getParameter("vector length")); + } + if (controls.isParameter("rows per thread")) { + blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + } + + // + // Use the existing launch parameters routine from SPMV + // + int64_t blocks_per_team = + KokkosSparse::Impl::spmv_launch_parameters( + A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; + + AMatrix_Internal A_internal = A; + + BCRS_GEMV_Functor func( + alpha, A_internal, x, y, blocks_per_team, useConjugate); + + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bcrs_spmv", policy, + func); + } else { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = + Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = + Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bcrs_spmv", policy, + func); + } +} + +/* ******************* */ + +} // namespace BCRS + +} // namespace Impl +} // namespace Experimental +} // namespace KokkosSparse + #endif // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BLOCKCRSMATRIX_IMPL_HPP diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp index 36696acdda..c40ec08293 100644 --- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp @@ -169,6 +169,15 @@ struct SPMV_BLOCKCRSMATRIX::conj(h_a_values(index)) - : h_a_values(index); - h_y(ir + iblock * blockSize) += - alpha * avalue * h_x(jr + col_block * blockSize); - } - } - } - } - } else if ((mode[0] == KokkosSparse::Transpose[0]) || - (mode[0] == KokkosSparse::ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); - for (ordinal_type iblock = 0; iblock < numBlockRows; ++iblock) { - const auto jbeg = h_a_row_map(iblock); - const auto jend = h_a_row_map(iblock + 1); - for (ordinal_type ir = 0; ir < blockSize; ++ir) { - for (auto jb = jbeg; jb < jend; ++jb) { - const auto col_block = h_a_entries(jb); - for (ordinal_type jr = 0; jr < blockSize; ++jr) { - const size_type index = jbeg * blockSize2 + ir + - (jb - jbeg) * blockSize + - jr * blockSize * (jend - jbeg); - const auto avalue = - (useConjugate) - ? Kokkos::ArithTraits::conj(h_a_values(index)) - : h_a_values(index); - h_y(ir + col_block * blockSize) += - alpha * avalue * h_x(jr + iblock * blockSize); - } + for (ordinal_type jr = 0; jr < blockSize; ++jr) { + const size_type index = jbeg * blockSize2 + ir + + (jb - jbeg) * blockSize + + jr * blockSize * (jend - jbeg); + const auto avalue = + (useConjugate) + ? Kokkos::ArithTraits::conj(h_a_values(index)) + : h_a_values(index); + h_y(ir + col_block * blockSize) += + alpha * avalue * h_x(jr + iblock * blockSize); } } } From 4c60a51f8f27e7ac82e0f6f5c07f8cbcc204e081 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 21 Dec 2021 16:48:58 -0700 Subject: [PATCH 03/15] Improve Mat-MultiVec for BlockCrsMatrix withoout Transpose --- .../KokkosSparse_spmv_blockcrsmatrix_impl.hpp | 273 ++++++++++++++++++ .../KokkosSparse_spmv_blockcrsmatrix_spec.hpp | 9 + 2 files changed, 282 insertions(+) diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp index 64a9ce1ec2..213b2c9aa2 100644 --- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp @@ -337,6 +337,279 @@ void spMatVec_no_transpose( /* ******************* */ +template +struct BCRS_GEMM_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_value_type value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef Kokkos::Details::ArithTraits ATV; + + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename AMatrix::non_const_size_type size_type; + + const value_type alpha; + AMatrix m_A; + XVector m_x; + YVector m_y; + const ordinal_type block_dim; + const ordinal_type num_rhs; + + const ordinal_type blocks_per_team; + + bool conjugate = false; + + BCRS_GEMM_Functor(const value_type alpha_, const AMatrix m_A_, + const XVector m_x_, const YVector m_y_, + const int blocks_per_team_, bool conj_) + : alpha(alpha_), + m_A(m_A_), + m_x(m_x_), + m_y(m_y_), + block_dim(m_A_.blockDim()), + num_rhs(m_x_.extent(1)), + blocks_per_team(blocks_per_team_), + conjugate(conj_) { + static_assert(static_cast(XVector::rank) == 2, + "XVector must be a rank 2 View."); + static_assert(static_cast(YVector::rank) == 2, + "YVector must be a rank 2 View."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type iBlock) const { + // + const auto ystart = iBlock * block_dim; + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + // + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto xstart = row.block_colidx(ic) * block_dim; + for (ordinal_type jr = 0; jr < num_rhs; ++jr) { + for (ordinal_type ii = 0; ii < block_dim; ++ii) { + value_type t(0); + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + const auto aval = + (conjugate) + ? Kokkos::ArithTraits::conj(Aview(ii, jj)) + : Aview(ii, jj); + t += aval * m_x(xstart + jj, jr); + } + m_y(ystart + ii, jr) += alpha * t; + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &dev) const { + using y_value_type = typename YVector::non_const_value_type; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(dev, 0, blocks_per_team), + [&](const ordinal_type &loop) { + const ordinal_type iBlock = + static_cast(dev.league_rank()) * blocks_per_team + + loop; + if (iBlock >= m_A.numRows()) { + return; + } + // + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto nrhs = num_rhs; + // + for (ordinal_type ic = 0; ic < nrhs; ++ic) { + for (ordinal_type ir = 0; ir < block_dim; ++ir) { + y_value_type sum = 0; + + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(dev, count), + [&](const ordinal_type &iEntry, y_value_type &lsum) { + const auto start_col = row.block_colidx(iEntry) * block_dim; + for (ordinal_type jr = 0; jr < block_dim; ++jr) { + const value_type val = + conjugate + ? ATV::conj(row.local_block_value(iEntry, ir, jr)) + : row.local_block_value(iEntry, ir, jr); + lsum += val * m_x(start_col + jr, ic); + } + }, + sum); + + Kokkos::single(Kokkos::PerThread(dev), [&]() { + sum *= alpha; + m_y(iBlock * block_dim + ir, ic) += sum; + }); + } + } + // + }); + } +}; + +/* ******************* */ + +// +// spMatMultiVec_no_transpose: version for CPU execution spaces +// (RangePolicy or trivial serial impl used) +// +template ()>::type * = nullptr> +void spMatMultiVec_no_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + // + // Treat the case y <- alpha * A * x + beta * y + // + typedef KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + + BCRS_GEMM_Functor func(alpha, A, x, y, 1, + useConjugate); + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::parallel_for( + "KokkosSparse::bcrs_spm_mv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } else { + Kokkos::parallel_for( + "KokkosSparse::bcrs_spm_mv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } +} + +/* ******************* */ + +// +// spMatMultiVec_no_transpose: version for GPU execution spaces (TeamPolicy +// used) +// +template ()>::type * = nullptr> +void spMatMultiVec_no_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + if (A.numRows() <= static_cast(0)) { + return; + } + + KokkosBlas::scal(y, beta, y); + + typedef KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + typedef typename AMatrix_Internal::execution_space execution_space; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + int team_size = -1; + int vector_length = -1; + int64_t blocks_per_thread = -1; + + // + // Use the controls to allow the user to pass in some tuning parameters. + // + if (controls.isParameter("team size")) { + team_size = std::stoi(controls.getParameter("team size")); + } + if (controls.isParameter("vector length")) { + vector_length = std::stoi(controls.getParameter("vector length")); + } + if (controls.isParameter("rows per thread")) { + blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + } + + // + // Use the existing launch parameters routine from SPMV + // + int64_t blocks_per_team = + KokkosSparse::Impl::spmv_launch_parameters( + A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; + + AMatrix_Internal A_internal = A; + + BCRS_GEMM_Functor func( + alpha, A_internal, x, y, blocks_per_team, useConjugate); + + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bcrs_spm_mv", + policy, func); + } else { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = + Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = + Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bcrs_spm_mv", + policy, func); + } +} + +/* ******************* */ + } // namespace BCRS } // namespace Impl diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp index c40ec08293..1832ac1ce0 100644 --- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp @@ -244,6 +244,15 @@ struct SPMV_MV_BLOCKCRSMATRIX Date: Tue, 21 Dec 2021 20:18:16 -0700 Subject: [PATCH 04/15] Improve BlockCrsMatrix product with transpose --- .../KokkosSparse_spmv_blockcrsmatrix_impl.hpp | 560 ++++++++++++++++++ .../KokkosSparse_spmv_blockcrsmatrix_spec.hpp | 145 +---- 2 files changed, 568 insertions(+), 137 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp index 213b2c9aa2..1477126c5a 100644 --- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp @@ -337,6 +337,283 @@ void spMatVec_no_transpose( /* ******************* */ +template +struct BCRS_GEMV_Transpose_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_value_type value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef Kokkos::Details::ArithTraits ATV; + + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename AMatrix::non_const_size_type size_type; + + const value_type alpha; + + AMatrix m_A; + XVector m_x; + YVector m_y; + + const ordinal_type block_dim; + const ordinal_type blocks_per_team; + + bool conjugate = false; + + BCRS_GEMV_Transpose_Functor(const value_type alpha_, const AMatrix m_A_, + const XVector m_x_, const YVector m_y_, + const int blocks_per_team_, bool conj_) + : alpha(alpha_), + m_A(m_A_), + m_x(m_x_), + m_y(m_y_), + block_dim(m_A_.blockDim()), + blocks_per_team(blocks_per_team_), + conjugate(conj_) { + static_assert(static_cast(XVector::rank) == 1, + "XVector must be a rank 1 View."); + static_assert(static_cast(YVector::rank) == 1, + "YVector must be a rank 1 View."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type iBlock) const { + // + // Assume that alpha is not zero + // + const auto xstart = iBlock * block_dim; + const auto xview = + Kokkos::subview(m_x, Kokkos::make_pair(xstart, xstart + block_dim)); + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto beta1 = static_cast(1); + const auto alpha1 = beta1; + if (conjugate) { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto ystart = row.block_colidx(ic) * block_dim; + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + value_type t(0); + for (ordinal_type ii = 0; ii < block_dim; ++ii) { + const auto aval = + Kokkos::ArithTraits::conj(Aview(ii, jj)); + t += aval * xview(ii); + } + t *= alpha; + Kokkos::atomic_add(&m_y(ystart + jj), t); + } + } + } else { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto ystart = row.block_colidx(ic) * block_dim; + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + value_type t(0); + KokkosBatched::SerialGemvInternal< + KokkosBatched::Algo::Gemv::Blocked>::invoke( + 1, block_dim, alpha1, Aview.data() + jj, Aview.stride_1(), + Aview.stride_0(), xview.data(), xview.stride_0(), beta1, &t, 1); + t *= alpha; + Kokkos::atomic_add(&m_y(ystart + jj), t); + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &dev) const { + using y_value_type = typename YVector::non_const_value_type; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(dev, 0, blocks_per_team), + [&](const ordinal_type &loop) { + const ordinal_type iBlock = + static_cast(dev.league_rank()) * blocks_per_team + + loop; + if (iBlock >= m_A.numRows()) { + return; + } + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + // + for (ordinal_type ir = 0; ir < block_dim; ++ir) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(dev, count), + [&](const ordinal_type &iEntry) { + for (ordinal_type jr = 0; jr < block_dim; ++jr) { + const value_type val = + conjugate + ? ATV::conj(row.local_block_value(iEntry, jr, ir)) + : row.local_block_value(iEntry, jr, ir); + const ordinal_type ind = row.block_colidx(iEntry); + Kokkos::atomic_add( + &m_y(block_dim * ind + ir), + static_cast( + alpha * val * m_x(block_dim * iBlock + jr))); + } + }); + } + }); + } +}; + +/* ******************* */ + +/// \brief spMatVec_transpose: version for CPU execution spaces (RangePolicy or +/// trivial serial impl used) +template ()>::type * = nullptr> +void spMatVec_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + + if (alpha == Kokkos::ArithTraits::zero()) return; + + // + // Treat the case y <- alpha * A^T * x + beta * y + // + + typedef KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + + AMatrix_Internal A_internal = A; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + + BCRS_GEMV_Transpose_Functor func( + alpha, A_internal, x, y, 1, useConjugate); + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::parallel_for( + "KokkosSparse::blockcrs_spmv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } else { + Kokkos::parallel_for( + "KokkosSparse::blockcrs_spmv", + Kokkos::RangePolicy< + typename AMatrix_Internal::device_type::execution_space, + Kokkos::Schedule>(0, A.numRows()), + func); + } +} + +// +// spMatVec_transpose: version for GPU execution spaces (TeamPolicy used) +// +template ()>::type * = nullptr> +void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, const AMatrix &A, + const XVector &x, const BetaType &beta, YVector &y, + bool useConjugate) { + if (A.numRows() <= 0) { + return; + } + + // We need to scale y first ("scaling" by zero just means filling + // with zeros), since the functor works by atomic-adding into y. + KokkosBlas::scal(y, beta, y); + + typedef typename AMatrix::execution_space execution_space; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + int team_size = -1; + int vector_length = -1; + int64_t blocks_per_thread = -1; + + // + // Use the controls to allow the user to pass in some tuning parameters. + // + if (controls.isParameter("team size")) { + team_size = std::stoi(controls.getParameter("team size")); + } + if (controls.isParameter("vector length")) { + vector_length = std::stoi(controls.getParameter("vector length")); + } + if (controls.isParameter("rows per thread")) { + blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + } + + // + // Use the existing launch parameters routine from SPMV + // + int64_t blocks_per_team = + KokkosSparse::Impl::spmv_launch_parameters( + A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; + + BCRS_GEMV_Transpose_Functor func( + alpha, A, x, y, blocks_per_team, useConjugate); + + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bspmv", policy, + func); + } else { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = + Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = + Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::bspmv", policy, + func); + } +} + +/* ******************* */ + template struct BCRS_GEMM_Functor { typedef typename AMatrix::execution_space execution_space; @@ -610,6 +887,289 @@ void spMatMultiVec_no_transpose( /* ******************* */ +template +struct BCRS_GEMM_Transpose_Functor { + typedef typename AMatrix::execution_space execution_space; + typedef typename AMatrix::non_const_value_type value_type; + typedef typename Kokkos::TeamPolicy team_policy; + typedef typename team_policy::member_type team_member; + typedef Kokkos::Details::ArithTraits ATV; + + //! Nonconst version of the type of column indices in the sparse matrix. + typedef typename AMatrix::non_const_ordinal_type ordinal_type; + //! Nonconst version of the type of row offsets in the sparse matrix. + typedef typename AMatrix::non_const_size_type size_type; + + const value_type alpha; + AMatrix m_A; + XVector m_x; + YVector m_y; + const ordinal_type block_dim; + const ordinal_type num_rhs; + + const ordinal_type blocks_per_team; + + bool conjugate = false; + + BCRS_GEMM_Transpose_Functor(const value_type alpha_, const AMatrix m_A_, + const XVector m_x_, const YVector m_y_, + const int blocks_per_team_, bool conj_) + : alpha(alpha_), + m_A(m_A_), + m_x(m_x_), + m_y(m_y_), + block_dim(m_A_.blockDim()), + num_rhs(m_x_.extent(1)), + blocks_per_team(blocks_per_team_), + conjugate(conj_) { + static_assert(static_cast(XVector::rank) == 2, + "XVector must be a rank 2 View."); + static_assert(static_cast(YVector::rank) == 2, + "YVector must be a rank 2 View."); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type iBlock) const { + // + const auto xstart = iBlock * block_dim; + const auto xview = Kokkos::subview( + m_x, Kokkos::make_pair(xstart, xstart + block_dim), Kokkos::ALL()); + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto beta1 = static_cast(1); + const auto alpha1 = beta1; + const auto ldx = m_x.stride_1(); + const auto ldy = m_y.stride_1(); + // + if (conjugate) { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto ystart = row.block_colidx(ic) * block_dim; + for (ordinal_type jr = 0; jr < num_rhs; ++jr) { + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + value_type t(0); + for (ordinal_type ii = 0; ii < block_dim; ++ii) { + const auto aval = + Kokkos::ArithTraits::conj(Aview(ii, jj)); + t += aval * xview(ii, jr); + } + t *= alpha; + Kokkos::atomic_add(&m_y(ystart + jj, jr), t); + } + } + } + } else { + for (ordinal_type ic = 0; ic < count; ++ic) { + const auto Aview = row.block(ic); + const auto ystart = row.block_colidx(ic) * block_dim; + for (ordinal_type jr = 0; jr < num_rhs; ++jr) { + for (ordinal_type jj = 0; jj < block_dim; ++jj) { + value_type t(0); + KokkosBatched::SerialGemvInternal< + KokkosBatched::Algo::Gemv::Blocked>::invoke( + 1, block_dim, alpha1, Aview.data() + jj, Aview.stride_1(), + Aview.stride_0(), xview.data() + jr * ldx, xview.stride_0(), + beta1, &t, 1); + t *= alpha; + Kokkos::atomic_add(&m_y(ystart + jj, jr), t); + } + } + } + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const team_member &dev) const { + using y_value_type = typename YVector::non_const_value_type; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(dev, 0, blocks_per_team), + [&](const ordinal_type &loop) { + const ordinal_type iBlock = + static_cast(dev.league_rank()) * blocks_per_team + + loop; + if (iBlock >= m_A.numRows()) { + return; + } + // + const auto start = m_A.graph.row_map(iBlock); + const ordinal_type count = + static_cast(m_A.graph.row_map(iBlock + 1) - start); + const auto row = m_A.block_row_Const(iBlock); + const auto nrhs = m_x.extent(1); + // + for (ordinal_type ic = 0; ic < nrhs; ++ic) { + for (ordinal_type ir = 0; ir < block_dim; ++ir) { + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(dev, count), + [&](const ordinal_type &iEntry) { + for (ordinal_type jr = 0; jr < block_dim; ++jr) { + const value_type val = + conjugate + ? ATV::conj(row.local_block_value(iEntry, jr, ir)) + : row.local_block_value(iEntry, jr, ir); + const ordinal_type ind = row.block_colidx(iEntry); + Kokkos::atomic_add( + &m_y(block_dim * ind + ir, ic), + static_cast( + alpha * val * m_x(block_dim * iBlock + jr, ic))); + } + }); + } + } + // + }); + } +}; + +/* ******************* */ + +/// \brief spMatMultiVec_transpose: version for CPU execution spaces +/// (RangePolicy or trivial serial impl used) +template ()>::type * = nullptr> +void spMatMultiVec_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, + const KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> &A, + const XVector &x, const BetaType &beta, YVector &y, bool useConjugate) { + // This is required to maintain semantics of KokkosKernels native SpMV: + // if y contains NaN but beta = 0, the result y should be filled with 0. + // For example, this is useful for passing in uninitialized y and beta=0. + if (beta == Kokkos::ArithTraits::zero()) + Kokkos::deep_copy(y, Kokkos::ArithTraits::zero()); + else + KokkosBlas::scal(y, beta, y); + // + // Treat the case y <- alpha * A^T * x + beta * y + // + typedef KokkosSparse::Experimental::BlockCrsMatrix< + AT, AO, AD, Kokkos::MemoryTraits, AS> + AMatrix_Internal; + typedef typename AMatrix_Internal::execution_space execution_space; + + AMatrix_Internal A_internal = A; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + + BCRS_GEMM_Transpose_Functor func( + alpha, A_internal, x, y, 1, useConjugate); + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::parallel_for( + "KokkosSparse::blockcrs_spm_mv", + Kokkos::RangePolicy>( + 0, A.numRows()), + func); + } else { + Kokkos::parallel_for( + "KokkosSparse::blockcrs_spm_mv", + Kokkos::RangePolicy>( + 0, A.numRows()), + func); + } +} + +// +// spMatMultiVec_transpose: version for GPU execution spaces (TeamPolicy used) +// +template ()>::type * = nullptr> +void spMatMultiVec_transpose( + const KokkosKernels::Experimental::Controls &controls, + const AlphaType &alpha, const AMatrix &A, const XVector &x, + const BetaType &beta, YVector &y, bool useConjugate) { + if (A.numRows() <= 0) { + return; + } + + KokkosBlas::scal(y, beta, y); + + typedef typename AMatrix::execution_space execution_space; + + bool use_dynamic_schedule = false; // Forces the use of a dynamic schedule + bool use_static_schedule = false; // Forces the use of a static schedule + if (controls.isParameter("schedule")) { + if (controls.getParameter("schedule") == "dynamic") { + use_dynamic_schedule = true; + } else if (controls.getParameter("schedule") == "static") { + use_static_schedule = true; + } + } + int team_size = -1; + int vector_length = -1; + int64_t blocks_per_thread = -1; + + // + // Use the controls to allow the user to pass in some tuning + // parameters. + // + if (controls.isParameter("team size")) { + team_size = std::stoi(controls.getParameter("team size")); + } + if (controls.isParameter("vector length")) { + vector_length = std::stoi(controls.getParameter("vector length")); + } + if (controls.isParameter("rows per thread")) { + blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + } + + // + // Use the existing launch parameters routine from SPMV + // + int64_t blocks_per_team = + KokkosSparse::Impl::spmv_launch_parameters( + A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; + + BCRS_GEMM_Transpose_Functor func( + alpha, A, x, y, blocks_per_team, useConjugate); + + if (((A.nnz() > 10000000) || use_dynamic_schedule) && !use_static_schedule) { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::blockcrs_spm_mv", + policy, func); + } else { + Kokkos::TeamPolicy> + policy(1, 1); + if (team_size < 0) + policy = + Kokkos::TeamPolicy>( + worksets, Kokkos::AUTO, vector_length); + else + policy = + Kokkos::TeamPolicy>( + worksets, team_size, vector_length); + Kokkos::parallel_for("KokkosSparse::blockcrs_spm_mv", + policy, func); + } +} + +/* ******************* */ + } // namespace BCRS } // namespace Impl diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp index 1832ac1ce0..7132ec0fe1 100644 --- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_spec.hpp @@ -174,59 +174,11 @@ struct SPMV_BLOCKCRSMATRIX::conj(h_a_values(index)) - : h_a_values(index); - h_y(ir + col_block * blockSize) += - alpha * avalue * h_x(jr + iblock * blockSize); - } - } - } - } - // - Kokkos::deep_copy(Y, h_y); - // } }; @@ -249,92 +201,11 @@ struct SPMV_MV_BLOCKCRSMATRIX::conj( - h_a_values(index)) - : h_a_values(index); - h_y(ir + iblock * blockSize, jj) += avalue * alpha_x; - } - } - } - } - } - } else if ((mode[0] == KokkosSparse::Transpose[0]) || - (mode[0] == KokkosSparse::ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); - for (ordinal_type iblock = 0; iblock < numBlockRows; ++iblock) { - const auto jbeg = h_a_row_map(iblock); - const auto jend = h_a_row_map(iblock + 1); - for (auto jb = jbeg; jb < jend; ++jb) { - const auto col_block = h_a_entries(jb); - for (ordinal_type jj = 0; jj < numRhs; ++jj) { - for (ordinal_type ir = 0; ir < blockSize; ++ir) { - for (ordinal_type jr = 0; jr < blockSize; ++jr) { - const size_type index = jbeg * blockSize2 + ir + - (jb - jbeg) * blockSize + - jr * blockSize * (jend - jbeg); - const auto avalue = (useConjugate) - ? Kokkos::ArithTraits::conj( - h_a_values(index)) - : h_a_values(index); - h_y(ir + col_block * blockSize, jj) += - alpha * avalue * h_x(jr + iblock * blockSize, jj); - } - } - } - } - } - } - // - Kokkos::deep_copy(Y, h_y); - // } }; From 7f9ce458384629ac67d1f271976a888a3f9c3aaf Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 28 Dec 2021 19:23:11 -0700 Subject: [PATCH 05/15] Remove cout statements. --- perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp index d65468ce96..6fe81e5fac 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp @@ -469,7 +469,6 @@ int main(int argc, char **argv) { int schedule = 0; for (int i = 0; i < argc; i++) { - std::cout << " i " << i << " argv " << argv[i] << "\n"; if ((strcmp(argv[i], "-bs") == 0)) { int tmp = atoi(argv[++i]); @@ -478,9 +477,7 @@ int main(int argc, char **argv) { } if ((strcmp(argv[i], "--tpl") == 0)) { - std::cout << argv[i] << "\n"; i++; - std::cout << argv[i] << "\n"; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE if ((strcmp(argv[i], "cuda") == 0)) test = static_cast(details::Implementation::Cuda); @@ -489,7 +486,6 @@ int main(int argc, char **argv) { if ((strcmp(argv[i], "mkl") == 0)) test = static_cast(details::Implementation::MKL); #endif - std::cout << test << "\n"; continue; } From d226e943f0afaa0e5432c0cec2ecd79b26318d34 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 28 Dec 2021 21:03:06 -0700 Subject: [PATCH 06/15] Reformating --- perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp index 6fe81e5fac..9fb0a60117 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp @@ -469,7 +469,6 @@ int main(int argc, char **argv) { int schedule = 0; for (int i = 0; i < argc; i++) { - if ((strcmp(argv[i], "-bs") == 0)) { int tmp = atoi(argv[++i]); bMax = (tmp > 0) ? tmp : bMax; From 75f4b85a2cd78a55bf98f3d3a74d0fb95cf5f777 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Wed, 29 Dec 2021 22:21:41 -0700 Subject: [PATCH 07/15] Adjust size parameters for hierarchical parallelism. --- perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 55 ++++++++++------ .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 64 ++++++++++++------- 2 files changed, 75 insertions(+), 44 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp index 9fb0a60117..75159ab3f6 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp @@ -189,13 +189,21 @@ int test_bsr_matrix_single_vec( y_vector_type ycrs("crs_product_result", nRow); auto h_ycrs = Kokkos::create_mirror_view(ycrs); + KokkosKernels::Experimental::Controls controls; + switch (static_cast(test)) { + case Implementation::KokkosKernels: { + controls.setParameter("algorithm", "native"); + } break; + default: break; + } + // Time a series of multiplications with the CrsMatrix double time_crs = 0.0; for (int jr = 0; jr < loop; ++jr) { for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir) = h_y0(ir); Kokkos::deep_copy(ycrs, h_ycrs); Kokkos::Timer timer; - KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); time_crs += timer.seconds(); } @@ -209,14 +217,6 @@ int test_bsr_matrix_single_vec( scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, int> Absr(Acrs, blockSize); - KokkosKernels::Experimental::Controls controls; - switch (static_cast(test)) { - case Implementation::KokkosKernels: { - controls.setParameter("algorithm", "native"); - } break; - default: break; - } - // Time a series of multiplications with the BsrMatrix for (int jr = 0; jr < loop; ++jr) { for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir) = h_y0(ir); @@ -250,7 +250,23 @@ int test_bsr_matrix_single_vec( //-- Print the number of Gflops for both products if (blockSize == 1) { - printf("Op, blockSize: AvgGFlop(CrsMatrix) AvgGFlop(BsrMatrix) \n"); + printf("Op, blockSize: AvgGFlop(CrsMatrix) "); + switch (static_cast(test)) { + default: + case Implementation::KokkosKernels: + printf(" AvgGFlop(BsrMatrix - KokkosKernels) \n"); + break; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + case Implementation::Cuda: + printf(" AvgGFlop(BsrMatrix - CUSPARSE) \n"); + break; +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + case Implementation::MKL: + printf(" AvgGFlop(BsrMatrix - MKL) \n"); + break; +#endif + } } double num_flops = mat_val.size() * 2 * loop; double crs_flop = (num_flops / time_crs) * 1.0e-09; @@ -321,6 +337,14 @@ int test_bsr_matrix_vec( block_vector_t ycrs("crs_product_result", nRow, nvec); auto h_ycrs = Kokkos::create_mirror_view(ycrs); + KokkosKernels::Experimental::Controls controls; + switch (static_cast(test)) { + case Implementation::KokkosKernels: { + controls.setParameter("algorithm", "native"); + } break; + default: break; + } + // Time a series of multiplications with the CrsMatrix format double time_crs = 0.0; for (int jr = 0; jr < loop; ++jr) { @@ -328,7 +352,7 @@ int test_bsr_matrix_vec( for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); Kokkos::deep_copy(ycrs, h_ycrs); Kokkos::Timer timer; - KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + KokkosSparse::spmv(controls, fOp, alpha, Acrs, xref, beta, ycrs); time_crs += timer.seconds(); } @@ -342,15 +366,6 @@ int test_bsr_matrix_vec( // Time a series of multiplications with the BsrMatrix double time_bsr = 0.0; - KokkosKernels::Experimental::Controls controls; - switch (static_cast(test)) { - case Implementation::KokkosKernels: { - controls.setParameter("algorithm", "native"); - } break; - default: break; - } - - // Time a series of multiplications with the BsrMatrix for (int jr = 0; jr < loop; ++jr) { for (Ordinal jc = 0; jc < nvec; ++jc) for (Ordinal ir = 0; ir < nRow; ++ir) h_ybsr(ir, jc) = h_y0(ir, jc); diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 24333cafd8..e079b92210 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -761,9 +761,9 @@ void spMatVec_no_transpose( use_static_schedule = true; } } - int team_size = -1; - int vector_length = -1; - int64_t blocks_per_thread = -1; + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; // // Use the controls to allow the user to pass in some tuning parameters. @@ -775,15 +775,19 @@ void spMatVec_no_transpose( vector_length = std::stoi(controls.getParameter("vector length")); } if (controls.isParameter("rows per thread")) { - blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + rows_per_thread = std::stoll(controls.getParameter("rows per thread")); } // // Use the existing launch parameters routine from SPMV // - int64_t blocks_per_team = + const auto block_dim = A.blockDim(); + int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters( - A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + A.numRows() * block_dim, A.nnz() * block_dim * block_dim, + rows_per_thread, team_size, vector_length); + int64_t blocks_per_team = (rows_per_team + block_dim - 1) / block_dim; + blocks_per_team = (blocks_per_team < 1) ? 1 : blocks_per_team; int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; AMatrix_Internal A_internal = A; @@ -1040,9 +1044,9 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, use_static_schedule = true; } } - int team_size = -1; - int vector_length = -1; - int64_t blocks_per_thread = -1; + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; // // Use the controls to allow the user to pass in some tuning parameters. @@ -1054,15 +1058,19 @@ void spMatVec_transpose(const KokkosKernels::Experimental::Controls &controls, vector_length = std::stoi(controls.getParameter("vector length")); } if (controls.isParameter("rows per thread")) { - blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + rows_per_thread = std::stoll(controls.getParameter("rows per thread")); } // // Use the existing launch parameters routine from SPMV // - int64_t blocks_per_team = + const auto block_dim = A.blockDim(); + int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters( - A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + A.numRows() * block_dim, A.nnz() * block_dim * block_dim, + rows_per_thread, team_size, vector_length); + int64_t blocks_per_team = (rows_per_team + block_dim - 1) / block_dim; + blocks_per_team = (blocks_per_team < 1) ? 1 : blocks_per_team; int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; BSR_GEMV_Transpose_Functor func( @@ -1325,9 +1333,9 @@ void spMatMultiVec_no_transpose( use_static_schedule = true; } } - int team_size = -1; - int vector_length = -1; - int64_t blocks_per_thread = -1; + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; // // Use the controls to allow the user to pass in some tuning parameters. @@ -1339,15 +1347,19 @@ void spMatMultiVec_no_transpose( vector_length = std::stoi(controls.getParameter("vector length")); } if (controls.isParameter("rows per thread")) { - blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + rows_per_thread = std::stoll(controls.getParameter("rows per thread")); } // // Use the existing launch parameters routine from SPMV // - int64_t blocks_per_team = + const auto block_dim = A.blockDim(); + int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters( - A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + A.numRows() * block_dim, A.nnz() * block_dim * block_dim, + rows_per_thread, team_size, vector_length); + int64_t blocks_per_team = (rows_per_team + block_dim - 1) / block_dim; + blocks_per_team = (blocks_per_team < 1) ? 1 : blocks_per_team; int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; AMatrix_Internal A_internal = A; @@ -1609,9 +1621,9 @@ void spMatMultiVec_transpose( use_static_schedule = true; } } - int team_size = -1; - int vector_length = -1; - int64_t blocks_per_thread = -1; + int team_size = -1; + int vector_length = -1; + int64_t rows_per_thread = -1; // // Use the controls to allow the user to pass in some tuning @@ -1624,15 +1636,19 @@ void spMatMultiVec_transpose( vector_length = std::stoi(controls.getParameter("vector length")); } if (controls.isParameter("rows per thread")) { - blocks_per_thread = std::stoll(controls.getParameter("rows per thread")); + rows_per_thread = std::stoll(controls.getParameter("rows per thread")); } // // Use the existing launch parameters routine from SPMV // - int64_t blocks_per_team = + const auto block_dim = A.blockDim(); + int64_t rows_per_team = KokkosSparse::Impl::spmv_launch_parameters( - A.numRows(), A.nnz(), blocks_per_thread, team_size, vector_length); + A.numRows() * block_dim, A.nnz() * block_dim * block_dim, + rows_per_thread, team_size, vector_length); + int64_t blocks_per_team = (rows_per_team + block_dim - 1) / block_dim; + blocks_per_team = (blocks_per_team < 1) ? 1 : blocks_per_team; int64_t worksets = (A.numRows() + blocks_per_team - 1) / blocks_per_team; BSR_GEMM_Transpose_Functor func( From d134870f20b65949db016b5b14ee58511d1f0da8 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 4 Jan 2022 20:15:37 -0700 Subject: [PATCH 08/15] Fix compilation issues. --- .../sparse/KokkosSparse_spmv_blockcrs.cpp | 427 ++++++++---------- perf_test/sparse/KokkosSparse_spmv_bsr.cpp | 34 +- src/sparse/KokkosSparse_BsrMatrix.hpp | 29 +- .../KokkosSparse_spmv_blockcrsmatrix_impl.hpp | 1 - .../impl/KokkosSparse_spmv_bsrmatrix_impl.hpp | 1 - unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 3 +- 6 files changed, 223 insertions(+), 272 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spmv_blockcrs.cpp b/perf_test/sparse/KokkosSparse_spmv_blockcrs.cpp index 9ccd63f58f..2107965677 100644 --- a/perf_test/sparse/KokkosSparse_spmv_blockcrs.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_blockcrs.cpp @@ -105,7 +105,6 @@ void make_block_entries( int blockSize, std::vector &mat_rowmap, std::vector &mat_colidx, std::vector &mat_val) { Ordinal nRow = blockSize * mat_b1.numRows(); - Ordinal nCol = blockSize * mat_b1.numCols(); size_t nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); @@ -125,7 +124,7 @@ void make_block_entries( for (Ordinal ib = 0; ib < blockSize; ++ib) { const Ordinal my_row = ir * blockSize + ib; mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; - for (Ordinal ijk = jbeg; ijk < jend; ++ijk) { + for (auto ijk = jbeg; ijk < jend; ++ijk) { const auto col0 = mat_b1.graph.entries(ijk); for (Ordinal jb = 0; jb < blockSize; ++jb) { mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = @@ -141,9 +140,8 @@ int test_blockcrs_matrix_single_vec( const char fOp[], KokkosSparse::CrsMatrix mat_b1, - int test, const char *filename, int rows_per_thread, int team_size, - int vector_length, int schedule, int loop, const scalar_t alpha, - const scalar_t beta, const int bMax) { + int test, int loop, const scalar_t alpha, const scalar_t beta, + const int bMax) { typedef typename KokkosSparse::CrsMatrix< scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> crsMat_type; @@ -218,241 +216,217 @@ int test_blockcrs_matrix_single_vec( time_blockcrs += timer.seconds(); } } break; -#ifdef HAVE_CUSPARSE - case Implementation::Cuda: { - // Time a series of multiplications with the BlockCrsMatrix - KokkosKernels::Experimental::Controls controls; - for (int jr = 0; jr < loop; ++jr) { - for (Ordinal ir = 0; ir < nRow; ++ir) h_yblockcrs(ir) = h_y0(ir); - Kokkos::deep_copy(yblockcrs, h_yblockcrs); - Kokkos::Timer timer; - KokkosSparse::Impl::spmv_block_impl_cusparse( - controls, fOp, alpha, Ablockcrs, xref, beta, yblockcrs); - time_blockcrs += timer.seconds(); - } - break; -#endif -#ifdef HAVE_MKL - case Implementation::MKL: { - // Time a series of multiplications with the BlockCrsMatrix - KokkosKernels::Experimental::Controls controls; - for (int jr = 0; jr < loop; ++jr) { - for (Ordinal ir = 0; ir < nRow; ++ir) h_yblockcrs(ir) = h_y0(ir); - Kokkos::deep_copy(yblockcrs, h_yblockcrs); - Kokkos::Timer timer; - KokkosSparse::Impl::spmv_block_mkl(controls, fOp, alpha, Ablockcrs, - xref, beta, yblockcrs); - time_blockcrs += timer.seconds(); - } - } break; -#endif - } + } - // Check that the numerical result is matching - Kokkos::deep_copy(h_ycrs, ycrs); - Kokkos::deep_copy(h_yblockcrs, yblockcrs); - double error = 0.0, maxNorm = 0.0; - for (int ir = 0; ir < h_ycrs.extent(0); ++ir) { - maxNorm = std::max( - maxNorm, std::abs(static_cast(h_ycrs(ir)))); - error = std::max(error, std::abs(static_cast( - h_ycrs(ir) - h_yblockcrs(ir)))); - } + // Check that the numerical result is matching + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_yblockcrs, yblockcrs); + double error = 0.0, maxNorm = 0.0; + for (size_t ir = 0; ir < h_ycrs.extent(0); ++ir) { + maxNorm = std::max( + maxNorm, std::abs(static_cast(h_ycrs(ir)))); + error = std::max( + error, + std::abs(static_cast(h_ycrs(ir) - h_yblockcrs(ir)))); + } - double tol = - (mat_val.size() / nRow) * std::numeric_limits::epsilon(); - if (error > tol * maxNorm) { - num_errors += 1; - std::cout << static_cast(test) << " "; - std::cout << fOp << ", " << blockSize << " : " - << " error " << error << " maxNorm " << maxNorm << " tol " - << tol << " tol * maxNorm " << tol * maxNorm << "\n"; - } + double tol = + (mat_val.size() / nRow) * std::numeric_limits::epsilon(); + if (error > tol * maxNorm) { + num_errors += 1; + std::cout << static_cast(test) << " "; + std::cout << fOp << ", " << blockSize << " : " + << " error " << error << " maxNorm " << maxNorm << " tol " + << tol << " tol * maxNorm " << tol * maxNorm << "\n"; + } - //-- Print the number of Gflops for both products - if (blockSize == 1) { - printf( - "Op, blockSize: AvgGFlop(CrsMatrix) AvgGFlop(BlockCrsMatrix) \n"); - } - double num_flops = mat_val.size() * 2 * loop; - double crs_flop = (num_flops / time_crs) * 1.0e-09; - double blockcrs_flop = (num_flops / time_blockcrs) * 1.0e-09; - std::cout << fOp << ", " << blockSize << " : "; - if (crs_flop < blockcrs_flop) { - std::cout << crs_flop << " <" << blockcrs_flop << ">"; - } else { - std::cout << "<" << crs_flop << "> " << blockcrs_flop; - } - std::cout << std::endl; + //-- Print the number of Gflops for both products + if (blockSize == 1) { + printf("Op, blockSize: AvgGFlop(CrsMatrix) AvgGFlop(BlockCrsMatrix) \n"); + } + double num_flops = mat_val.size() * 2 * loop; + double crs_flop = (num_flops / time_crs) * 1.0e-09; + double blockcrs_flop = (num_flops / time_blockcrs) * 1.0e-09; + std::cout << fOp << ", " << blockSize << " : "; + if (crs_flop < blockcrs_flop) { + std::cout << crs_flop << " <" << blockcrs_flop << ">"; + } else { + std::cout << "<" << crs_flop << "> " << blockcrs_flop; + } + std::cout << std::endl; - } // for (Ordinal blockSize = 1; blockSize < bMax; ++blockSize) + } // for (Ordinal blockSize = 1; blockSize < bMax; ++blockSize) - return int(num_errors); - } + return int(num_errors); +} - template - int test_blockcrs_matrix_vec( - const char fOp[], - KokkosSparse::CrsMatrix - mat_b1, - int nvec, int test, const char *filename, int rows_per_thread, - int team_size, int vector_length, int schedule, int loop, - const scalar_t alpha, const scalar_t beta, const int bMax) { - typedef typename KokkosSparse::CrsMatrix< - scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> - crsMat_type; - - typedef Kokkos::View - block_vector_t; - - srand(17312837); - - int num_errors = 0; - const auto bMax_o = static_cast(bMax); - for (Ordinal blockSize = 1; blockSize <= bMax_o; ++blockSize) { - Ordinal nRow = blockSize * mat_b1.numRows(); - Ordinal nCol = nRow; - std::vector mat_rowmap; - std::vector mat_colidx; - std::vector mat_val; - - make_block_entries(mat_b1, blockSize, mat_rowmap, mat_colidx, - mat_val); - - // Create the CrsMatrix for the reference computation - crsMat_type Acrs("new_crs_matr", nRow, nCol, mat_val.size(), &mat_val[0], - &mat_rowmap[0], &mat_colidx[0]); - - block_vector_t xref("new_right_hand_side", nRow, nvec); - auto h_xref = Kokkos::create_mirror_view(xref); - for (Ordinal jc = 0; jc < nvec; ++jc) { - for (Ordinal ir = 0; ir < nRow; ++ir) { - set_random_value(h_xref(ir, jc)); - } +template +int test_blockcrs_matrix_vec( + const char fOp[], + KokkosSparse::CrsMatrix + mat_b1, + int nvec, int test, int loop, const scalar_t alpha, const scalar_t beta, + const int bMax) { + typedef typename KokkosSparse::CrsMatrix< + scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> + crsMat_type; + + typedef Kokkos::View + block_vector_t; + + srand(17312837); + + int num_errors = 0; + const auto bMax_o = static_cast(bMax); + for (Ordinal blockSize = 1; blockSize <= bMax_o; ++blockSize) { + Ordinal nRow = blockSize * mat_b1.numRows(); + Ordinal nCol = nRow; + std::vector mat_rowmap; + std::vector mat_colidx; + std::vector mat_val; + + make_block_entries(mat_b1, blockSize, mat_rowmap, mat_colidx, + mat_val); + + // Create the CrsMatrix for the reference computation + crsMat_type Acrs("new_crs_matr", nRow, nCol, mat_val.size(), &mat_val[0], + &mat_rowmap[0], &mat_colidx[0]); + + block_vector_t xref("new_right_hand_side", nRow, nvec); + auto h_xref = Kokkos::create_mirror_view(xref); + for (Ordinal jc = 0; jc < nvec; ++jc) { + for (Ordinal ir = 0; ir < nRow; ++ir) { + set_random_value(h_xref(ir, jc)); } - Kokkos::deep_copy(xref, h_xref); + } + Kokkos::deep_copy(xref, h_xref); - block_vector_t y0("y_init", nRow, nvec); - auto h_y0 = Kokkos::create_mirror_view(y0); + block_vector_t y0("y_init", nRow, nvec); + auto h_y0 = Kokkos::create_mirror_view(y0); + for (Ordinal jc = 0; jc < nvec; ++jc) + for (Ordinal ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir, jc)); + Kokkos::deep_copy(y0, h_y0); + + block_vector_t ycrs("crs_product_result", nRow, nvec); + auto h_ycrs = Kokkos::create_mirror_view(ycrs); + + // Time a series of multiplications with the CrsMatrix format + double time_crs = 0.0; + for (int jr = 0; jr < loop; ++jr) { for (Ordinal jc = 0; jc < nvec; ++jc) - for (Ordinal ir = 0; ir < nRow; ++ir) set_random_value(h_y0(ir, jc)); - Kokkos::deep_copy(y0, h_y0); - - block_vector_t ycrs("crs_product_result", nRow, nvec); - auto h_ycrs = Kokkos::create_mirror_view(ycrs); - - // Time a series of multiplications with the CrsMatrix format - double time_crs = 0.0; - for (int jr = 0; jr < loop; ++jr) { - for (Ordinal jc = 0; jc < nvec; ++jc) - for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); - Kokkos::deep_copy(ycrs, h_ycrs); - Kokkos::Timer timer; - KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); - time_crs += timer.seconds(); - } + for (Ordinal ir = 0; ir < nRow; ++ir) h_ycrs(ir, jc) = h_y0(ir, jc); + Kokkos::deep_copy(ycrs, h_ycrs); + Kokkos::Timer timer; + KokkosSparse::spmv(fOp, alpha, Acrs, xref, beta, ycrs); + time_crs += timer.seconds(); + } - // Create the BlockCrsMatrix variable - KokkosSparse::Experimental::BlockCrsMatrix< - scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> - Ablockcrs(Acrs, blockSize); - - block_vector_t yblockcrs("blockcrs_product_result", nRow, nvec); - auto h_yblockcrs = Kokkos::create_mirror_view(yblockcrs); - - // Time a series of multiplications with the BlockCrsMatrix - double time_blockcrs = 0.0; - for (int jr = 0; jr < loop; ++jr) { - for (Ordinal jc = 0; jc < nvec; ++jc) - for (Ordinal ir = 0; ir < nRow; ++ir) - h_yblockcrs(ir, jc) = h_y0(ir, jc); - Kokkos::deep_copy(yblockcrs, h_yblockcrs); - Kokkos::Timer timer; - KokkosSparse::spmv(fOp, alpha, Ablockcrs, xref, beta, yblockcrs); - time_blockcrs += timer.seconds(); - } + // Create the BlockCrsMatrix variable + KokkosSparse::Experimental::BlockCrsMatrix< + scalar_t, Ordinal, Kokkos::DefaultExecutionSpace, void, size_t> + Ablockcrs(Acrs, blockSize); - // Check that the result is matching - Kokkos::deep_copy(h_ycrs, ycrs); - Kokkos::deep_copy(h_yblockcrs, yblockcrs); - double tol = - (mat_val.size() / nRow) * std::numeric_limits::epsilon(); - for (int jc = 0; jc < nvec; ++jc) { - double error = 0.0, maxNorm = 0.0; - for (int ir = 0; ir < h_ycrs.extent(0); ++ir) { - maxNorm = std::max( - maxNorm, std::abs(static_cast(h_ycrs(ir, jc)))); - error = std::max(error, - std::abs(static_cast( - h_ycrs(ir, jc) - h_yblockcrs(ir, jc)))); - } - if (error > tol * maxNorm) { - num_errors += 1; - std::cout << fOp << ", " << blockSize << " : rhs " << jc << " error " - << error << " maxNorm " << maxNorm << " tol " << tol - << " tol * maxNorm " << tol * maxNorm << "\n"; + block_vector_t yblockcrs("blockcrs_product_result", nRow, nvec); + auto h_yblockcrs = Kokkos::create_mirror_view(yblockcrs); + + // Time a series of multiplications with the BlockCrsMatrix + double time_blockcrs = 0.0; + switch (static_cast(test)) { + default: + case Implementation::KokkosKernels: { + // Time a series of multiplications with the BlockCrsMatrix + for (int jr = 0; jr < loop; ++jr) { + for (Ordinal jc = 0; jc < nvec; ++jc) { + for (Ordinal ir = 0; ir < nRow; ++ir) + h_yblockcrs(ir, jc) = h_y0(ir, jc); + } + Kokkos::deep_copy(yblockcrs, h_yblockcrs); + Kokkos::Timer timer; + KokkosSparse::spmv(fOp, alpha, Ablockcrs, xref, beta, yblockcrs); + time_blockcrs += timer.seconds(); } - } + } break; + } - // Print the number of Gflops - if (blockSize == 1) { - printf( - "Op, blockSize: AvgGFlop(CrsMatrix) AvgGFlop(BlockCrsMatrix) \n"); + // Check that the result is matching + Kokkos::deep_copy(h_ycrs, ycrs); + Kokkos::deep_copy(h_yblockcrs, yblockcrs); + double tol = + (mat_val.size() / nRow) * std::numeric_limits::epsilon(); + for (int jc = 0; jc < nvec; ++jc) { + double error = 0.0, maxNorm = 0.0; + for (size_t ir = 0; ir < h_ycrs.extent(0); ++ir) { + maxNorm = std::max( + maxNorm, std::abs(static_cast(h_ycrs(ir, jc)))); + error = + std::max(error, std::abs(static_cast( + h_ycrs(ir, jc) - h_yblockcrs(ir, jc)))); } - double num_flops = mat_val.size() * 2 * loop * nvec; - double crs_flop = (num_flops / time_crs) * 1.0e-09; - double blockcrs_flop = (num_flops / time_blockcrs) * 1.0e-09; - std::cout << fOp << ", " << blockSize << " "; - if (crs_flop < blockcrs_flop) { - // std::cout << crs_flop << " <" << blockcrs_flop << ">"; - std::cout << crs_flop << " " << blockcrs_flop << " "; - } else { - // std::cout << "<" << crs_flop << "> " << blockcrs_flop; - std::cout << " " << crs_flop << " " << blockcrs_flop; + if (error > tol * maxNorm) { + num_errors += 1; + std::cout << fOp << ", " << blockSize << " : rhs " << jc << " error " + << error << " maxNorm " << maxNorm << " tol " << tol + << " tol * maxNorm " << tol * maxNorm << "\n"; } - std::cout << std::endl; } - return int(num_errors); + // Print the number of Gflops + if (blockSize == 1) { + printf("Op, blockSize: AvgGFlop(CrsMatrix) AvgGFlop(BlockCrsMatrix) \n"); + } + double num_flops = mat_val.size() * 2 * loop * nvec; + double crs_flop = (num_flops / time_crs) * 1.0e-09; + double blockcrs_flop = (num_flops / time_blockcrs) * 1.0e-09; + std::cout << fOp << ", " << blockSize << " "; + if (crs_flop < blockcrs_flop) { + // std::cout << crs_flop << " <" << blockcrs_flop << ">"; + std::cout << crs_flop << " " << blockcrs_flop << " "; + } else { + // std::cout << "<" << crs_flop << "> " << blockcrs_flop; + std::cout << " " << crs_flop << " " << blockcrs_flop; + } + std::cout << std::endl; } - void print_help() { - printf("BlockCrsMatrix SPMV benchmark code \n"); - printf("Options:\n"); - printf( - " -bs : Maximum blocksize for the sparse matrix (default " - "= " - "16). \n"); - printf(" -h : Help. \n"); - printf( - " -l [LOOP] : How many spmv to run to aggregate average time " - "(default = 512). \n"); - printf( - " -nx : Number of points in the x-direction (default = " - "32).\n"); - printf( - " The matrix will be of dimension nx (nx - 1) (nx + " - "1).\n"); - printf( - " -nv : Number of vectors to multiply with (default = 1). " - "\n"); - printf(" --op : Use different operation \n"); - printf(" Options: \n"); - printf( - " N = normal (default) y <- alpha A x + beta y\n"); - printf( - " C = conjugate y <- alpha conj(A) x + beta " - "y\n"); - printf( - " T = transpose y <- alpha A^T x + beta " - "y\n"); - printf( - " H = hermitian y <- alpha A^H x + beta " - "y\n"); - } + return int(num_errors); } +void print_help() { + printf("BlockCrsMatrix SPMV benchmark code \n"); + printf("Options:\n"); + printf( + " -bs : Maximum blocksize for the sparse matrix (default " + "= " + "16). \n"); + printf(" -h : Help. \n"); + printf( + " -l [LOOP] : How many spmv to run to aggregate average time " + "(default = 512). \n"); + printf( + " -nx : Number of points in the x-direction (default = " + "32).\n"); + printf( + " The matrix will be of dimension nx (nx - 1) (nx + " + "1).\n"); + printf( + " -nv : Number of vectors to multiply with (default = 1). " + "\n"); + printf(" --op : Use different operation \n"); + printf(" Options: \n"); + printf(" N = normal (default) y <- alpha A x + beta y\n"); + printf( + " C = conjugate y <- alpha conj(A) x + beta " + "y\n"); + printf( + " T = transpose y <- alpha A^T x + beta " + "y\n"); + printf( + " H = hermitian y <- alpha A^H x + beta " + "y\n"); +} +} // namespace details + int main(int argc, char **argv) { int loop = 512; int bMax = 16; @@ -461,12 +435,7 @@ int main(int argc, char **argv) { char fOp[] = "N"; - char *filename = nullptr; - int rows_per_thread = -1; - int vector_length = -1; - int team_size = -1; - int test = static_cast(details::Implementation::KokkosKernels); - int schedule = 0; + int test = static_cast(details::Implementation::KokkosKernels); for (int i = 0; i < argc; i++) { if ((strcmp(argv[i], "-bs") == 0)) { @@ -545,13 +514,11 @@ int main(int argc, char **argv) { if (nvec == 1) total_errors = details::test_blockcrs_matrix_single_vec( - fOp, mat_b1, test, filename, rows_per_thread, team_size, - vector_length, schedule, loop, details::Scalar(3.1), - details::Scalar(-2.4), bMax); + fOp, mat_b1, test, loop, details::Scalar(3.1), details::Scalar(-2.4), + bMax); else total_errors = details::test_blockcrs_matrix_vec( - fOp, mat_b1, nvec, test, filename, rows_per_thread, team_size, - vector_length, schedule, loop, details::Scalar(3.1), + fOp, mat_b1, nvec, test, loop, details::Scalar(3.1), details::Scalar(-2.4), bMax); if (total_errors != 0) { diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp index 75159ab3f6..0d22b5f1ec 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr.cpp @@ -104,7 +104,6 @@ void make_block_entries( int blockSize, std::vector &mat_rowmap, std::vector &mat_colidx, std::vector &mat_val) { Ordinal nRow = blockSize * mat_b1.numRows(); - Ordinal nCol = blockSize * mat_b1.numCols(); size_t nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); @@ -140,9 +139,8 @@ int test_bsr_matrix_single_vec( const char fOp[], KokkosSparse::CrsMatrix mat_b1, - int test, const char *filename, int rows_per_thread, int team_size, - int vector_length, int schedule, int loop, const scalar_t alpha, - const scalar_t beta, const int bMax) { + int test, int loop, const scalar_t alpha, const scalar_t beta, + const int bMax) { typedef typename KokkosSparse::CrsMatrix @@ -230,7 +228,7 @@ int test_bsr_matrix_single_vec( Kokkos::deep_copy(h_ycrs, ycrs); Kokkos::deep_copy(h_ybsr, ybsr); double error = 0.0, maxNorm = 0.0; - for (int ir = 0; ir < h_ycrs.extent(0); ++ir) { + for (size_t ir = 0; ir < h_ycrs.extent(0); ++ir) { maxNorm = std::max( maxNorm, std::abs(static_cast(h_ycrs(ir)))); error = std::max( @@ -289,9 +287,8 @@ int test_bsr_matrix_vec( const char fOp[], KokkosSparse::CrsMatrix mat_b1, - int nvec, int test, const char *filename, int rows_per_thread, - int team_size, int vector_length, int schedule, int loop, - const scalar_t alpha, const scalar_t beta, const int bMax) { + int nvec, int test, int loop, const scalar_t alpha, const scalar_t beta, + const int bMax) { typedef typename KokkosSparse::CrsMatrix @@ -382,7 +379,7 @@ int test_bsr_matrix_vec( (mat_val.size() / nRow) * std::numeric_limits::epsilon(); for (int jc = 0; jc < nvec; ++jc) { double error = 0.0, maxNorm = 0.0; - for (int ir = 0; ir < h_ycrs.extent(0); ++ir) { + for (size_t ir = 0; ir < h_ycrs.extent(0); ++ir) { maxNorm = std::max( maxNorm, std::abs(static_cast(h_ycrs(ir, jc)))); error = std::max(error, std::abs(static_cast( @@ -476,12 +473,7 @@ int main(int argc, char **argv) { char fOp[] = "N"; - char *filename = nullptr; - int rows_per_thread = -1; - int vector_length = -1; - int team_size = -1; - int test = static_cast(details::Implementation::KokkosKernels); - int schedule = 0; + int test = static_cast(details::Implementation::KokkosKernels); for (int i = 0; i < argc; i++) { if ((strcmp(argv[i], "-bs") == 0)) { @@ -565,14 +557,12 @@ int main(int argc, char **argv) { if (nvec == 1) total_errors = details::test_bsr_matrix_single_vec( - fOp, mat_b1, test, filename, rows_per_thread, team_size, - vector_length, schedule, loop, details::Scalar(3.1), - details::Scalar(-2.4), bMax); + fOp, mat_b1, test, loop, details::Scalar(3.1), details::Scalar(-2.4), + bMax); else - total_errors = details::test_bsr_matrix_vec( - fOp, mat_b1, nvec, test, filename, rows_per_thread, team_size, - vector_length, schedule, loop, details::Scalar(3.1), - details::Scalar(-2.4), bMax); + total_errors = details::test_bsr_matrix_vec(fOp, mat_b1, nvec, test, loop, + details::Scalar(3.1), + details::Scalar(-2.4), bMax); if (total_errors != 0) { printf("Kokkos::BsrMatrix SpMV Test: Failed\n"); diff --git a/src/sparse/KokkosSparse_BsrMatrix.hpp b/src/sparse/KokkosSparse_BsrMatrix.hpp index 78ea66f48b..3bf1f2db1a 100644 --- a/src/sparse/KokkosSparse_BsrMatrix.hpp +++ b/src/sparse/KokkosSparse_BsrMatrix.hpp @@ -545,7 +545,7 @@ class BsrMatrix { if (annz > 0) { ordinal_type iblock = 0; std::set set_blocks; - for (ordinal_type ii = 0; ii <= annz; ++ii) { + for (size_type ii = 0; ii <= annz; ++ii) { if ((ii == annz) || ((unman_rows(ii) / blockDim_) > iblock)) { // Flush the stored entries row_map_host(iblock + 1) = set_blocks.size(); @@ -558,7 +558,7 @@ class BsrMatrix { } } - for (ordinal_type ii = 0; ii < annz; ++ii) + for (size_type ii = 0; ii < annz; ++ii) row_map_host(ii + 1) += row_map_host(ii); Kokkos::deep_copy(tmp_row_map, row_map_host); @@ -576,7 +576,7 @@ class BsrMatrix { //--- Fill tmp_entries ordinal_type cur_block = 0; std::set set_blocks; - for (ordinal_type ii = 0; ii <= annz; ++ii) { + for (size_type ii = 0; ii <= annz; ++ii) { if ((ii == annz) || ((unman_rows(ii) / blockDim_) > cur_block)) { // Flush the stored entries ordinal_type ipos = row_map_host(cur_block); @@ -589,11 +589,10 @@ class BsrMatrix { set_blocks.insert(tmp_jblock); } //--- Fill numerical values - for (ordinal_type ii = 0; ii < annz; ++ii) { - ordinal_type iblock = unman_rows(ii) / blockDim_; - ordinal_type ilocal = unman_rows(ii) % blockDim_; - ordinal_type jblock = unman_cols(ii) / blockDim_; - ordinal_type jlocal = unman_cols(ii) % blockDim_; + for (size_type ii = 0; ii < annz; ++ii) { + const auto ilocal = unman_rows(ii) % blockDim_; + const auto jblock = unman_cols(ii) / blockDim_; + const auto jlocal = unman_cols(ii) % blockDim_; for (auto jj = row_map_host(jblock); jj < row_map_host(jblock + 1); ++jj) { if (tmp_entries_host(jj) == jblock) { @@ -733,7 +732,7 @@ class BsrMatrix { OrdinalType numBlocks = 0; for (OrdinalType i = 0; i < crs_mtx.numRows(); i += blockDim_) { std::set col_set; - for (OrdinalType ie = h_crs_row_map(i); ie < h_crs_row_map(i + blockDim_); + for (auto ie = h_crs_row_map(i); ie < h_crs_row_map(i + blockDim_); ++ie) { col_set.insert(h_crs_entries(ie) / blockDim_); } @@ -758,8 +757,8 @@ class BsrMatrix { auto ir_start = ib * blockDim_; auto ir_stop = (ib + 1) * blockDim_; std::set col_set; - for (OrdinalType jk = h_crs_row_map(ir_start); - jk < h_crs_row_map(ir_stop); ++jk) { + for (auto jk = h_crs_row_map(ir_start); jk < h_crs_row_map(ir_stop); + ++jk) { col_set.insert(h_crs_entries(jk) / blockDim_); } for (auto col_block : col_set) { @@ -776,7 +775,7 @@ class BsrMatrix { typename values_type::HostMirror h_values = Kokkos::create_mirror_view(values); - if (h_values.extent(0) < numBlocks * blockDim_ * blockDim_) { + if (h_values.extent(0) < size_t(numBlocks * blockDim_ * blockDim_)) { Kokkos::resize(h_values, numBlocks * blockDim_ * blockDim_); Kokkos::resize(values, numBlocks * blockDim_ * blockDim_); } @@ -785,13 +784,11 @@ class BsrMatrix { for (OrdinalType ir = 0; ir < crs_mtx.numRows(); ++ir) { const auto iblock = ir / blockDim_; const auto ilocal = ir % blockDim_; - for (OrdinalType jk = h_crs_row_map(ir); jk < h_crs_row_map(ir + 1); - ++jk) { + for (auto jk = h_crs_row_map(ir); jk < h_crs_row_map(ir + 1); ++jk) { const auto jc = h_crs_entries(jk); const auto jblock = jc / blockDim_; const auto jlocal = jc % blockDim_; - for (OrdinalType jkb = h_row_map(iblock); jkb < h_row_map(iblock + 1); - ++jkb) { + for (auto jkb = h_row_map(iblock); jkb < h_row_map(iblock + 1); ++jkb) { if (h_entries(jkb) == jblock) { OrdinalType shift = jkb * blockDim_ * blockDim_; h_values(shift + ilocal * blockDim_ + jlocal) = h_crs_values(jk); diff --git a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp index 1477126c5a..9dc7446669 100644 --- a/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_blockcrsmatrix_impl.hpp @@ -941,7 +941,6 @@ struct BCRS_GEMM_Transpose_Functor { const auto beta1 = static_cast(1); const auto alpha1 = beta1; const auto ldx = m_x.stride_1(); - const auto ldy = m_y.stride_1(); // if (conjugate) { for (ordinal_type ic = 0; ic < count; ++ic) { diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index e079b92210..ec54f3bb6b 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -1451,7 +1451,6 @@ struct BSR_GEMM_Transpose_Functor { const auto beta1 = static_cast(1); const auto alpha1 = beta1; const auto ldx = m_x.stride_1(); - const auto ldy = m_y.stride_1(); // if (conjugate) { for (ordinal_type ic = 0; ic < count; ++ic) { diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index 1b50a34269..1d59897bb9 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -103,7 +103,6 @@ void make_block_entries( int blockSize, std::vector &mat_rowmap, std::vector &mat_colidx, std::vector &mat_val) { lno_t nRow = blockSize * mat_b1.numRows(); - lno_t nCol = blockSize * mat_b1.numCols(); size_t nnz = static_cast(blockSize) * static_cast(blockSize) * mat_b1.nnz(); @@ -123,7 +122,7 @@ void make_block_entries( for (lno_t ib = 0; ib < blockSize; ++ib) { const lno_t my_row = ir * blockSize + ib; mat_rowmap[my_row + 1] = mat_rowmap[my_row] + (jend - jbeg) * blockSize; - for (lno_t ijk = jbeg; ijk < jend; ++ijk) { + for (auto ijk = jbeg; ijk < jend; ++ijk) { const auto col0 = mat_b1.graph.entries(ijk); for (lno_t jb = 0; jb < blockSize; ++jb) { mat_colidx[mat_rowmap[my_row] + (ijk - jbeg) * blockSize + jb] = From f87dccdf593bf4fc70714d39a10bc6d9d0757682 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 4 Jan 2022 20:19:06 -0700 Subject: [PATCH 09/15] Fix unused parameter --- src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 51727441d3..6337e52a77 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -315,12 +315,10 @@ struct SPMV_MV_BSRMATRIX impl_type; - KokkosKernels::Experimental::Controls defaultControls; for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); - impl_type::spmv_bsrmatrix(defaultControls, mode, alpha, A, x_j, beta, - y_j); + impl_type::spmv_bsrmatrix(controls, mode, alpha, A, x_j, beta, y_j); } } }; From b661667a02d82f1cbf6873490d6f9bf6cd08c2e7 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Wed, 5 Jan 2022 20:23:02 -0700 Subject: [PATCH 10/15] Fix typos with MKL --- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 35c78b4b8d..8c538c657c 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -364,15 +364,15 @@ inline void spm_mv_block_impl_mkl( \ static void spmv_bsrmatrix( \ const KokkosKernels::Experimental::Controls& controls, \ - const char mode[], const YScalar& alpha, const AMatrix& A, \ - const XVector& X, const YScalar& beta, const YVector& Y) { \ + const char mode[], const coefficient_type& alpha, const AMatrix& A, \ + const XVector& X, const coefficient_type& beta, const YVector& Y) { \ std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ spmv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ A.numCols(), A.blockDim(), A.graph.row_map.data(), \ - A.graph.entries.data(), A.values.data(), x.data(), \ - y.data()); \ + A.graph.entries.data(), A.values.data(), X.data(), \ + Y.data()); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -423,18 +423,18 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, \ static void spmv_mv_bsrmatrix( \ const KokkosKernels::Experimental::Controls& controls, \ - const char mode[], const YScalar& alpha, const AMatrix& A, \ - const XVector& X, const YScalar& beta, const YVector& Y) { \ + const char mode[], const coefficient_type& alpha, const AMatrix& A, \ + const XVector& X, const coefficient_type& beta, const YVector& Y) { \ std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ Kokkos::ArithTraits::name() + "]"; \ Kokkos::Profiling::pushRegion(label); \ - int colx = static_cast(x.extent(1)); \ - int ldx = static_cast(x.stride_1()); \ - int ldy = static_cast(y.stride_1()); \ + int colx = static_cast(X.extent(1)); \ + int ldx = static_cast(X.stride_1()); \ + int ldy = static_cast(Y.stride_1()); \ spm_mv_block_impl_mkl(mode_kk_to_mkl(mode[0]), alpha, beta, A.numRows(), \ A.numCols(), A.blockDim(), A.graph.row_map.data(), \ - A.graph.entries.data(), A.values.data(), x.data(), \ - colx, ldx, y.data(), ldy); \ + A.graph.entries.data(), A.values.data(), X.data(), \ + colx, ldx, Y.data(), ldy); \ Kokkos::Profiling::popRegion(); \ } \ }; @@ -444,10 +444,10 @@ KOKKOSSPARSE_SPMV_MV_MKL(float, Kokkos::Serial, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) KOKKOSSPARSE_SPMV_MV_MKL(double, Kokkos::Serial, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::Serial, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::Serial, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) #endif #ifdef KOKKOS_ENABLE_OPENMP @@ -455,10 +455,10 @@ KOKKOSSPARSE_SPMV_MV_MKL(float, Kokkos::OpenMP, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) KOKKOSSPARSE_SPMV_MV_MKL(double, Kokkos::OpenMP, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_MKL(Kokkos::complex, Kokkos::OpenMP, + KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) #endif #undef KOKKOSSPARSE_SPMV_MV_MKL From 5b7a0f417608c89f85e170e1b6311318598d4314 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Thu, 6 Jan 2022 15:49:22 -0700 Subject: [PATCH 11/15] Fix switching issues. --- src/sparse/KokkosSparse_spmv.hpp | 6 +- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 104 ++++++++++-------- 2 files changed, 61 insertions(+), 49 deletions(-) diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index cc945c145b..1862be1d20 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -449,9 +449,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], Kokkos::CudaSpace>::value || std::is_same::value) { -#if defined(CUSPARSE_VERSION) useFallback = useFallback || (mode[0] != NoTranspose[0]); -#endif } #endif @@ -465,7 +463,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], if (useFallback) { // Explicitly call the non-TPL SPMV_BSRMATRIX implementation std::string label = - "KokkosSparse::spmv[NATIVE,BSMATRIX," + + "KokkosSparse::spmv[NATIVE,BSRMATRIX," + Kokkos::ArithTraits< typename AMatrix_Internal::non_const_value_type>::name() + "]"; @@ -844,9 +842,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], Kokkos::CudaSpace>::value || std::is_same::value) { -#if defined(CUSPARSE_VERSION) useFallback = useFallback || (mode[0] != NoTranspose[0]); -#endif } #endif diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 6337e52a77..445b8657f4 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -215,85 +215,101 @@ struct SPMV_MV_BSRMATRIX::is_complex == false) + use_tc = true; + } #endif #if defined(KOKKOS_ARCH_AMPERE) - typedef typename XVector::non_const_value_type XScalar; - typedef typename AMatrix::non_const_value_type AScalar; - typedef Kokkos::Experimental::half_t Half; + typedef typename XVector::non_const_value_type XScalar; + typedef typename AMatrix::non_const_value_type AScalar; + typedef Kokkos::Experimental::half_t Half; - /* Ampere has double += double * double and float += half * half + /* Ampere has double += double * double and float += half * half - use whichever is requested. - If none requested, used mixed precision if the inputs are mixed, otherwise - use double - */ + use whichever is requested. + If none requested, used mixed precision if the inputs are mixed, otherwise + use double + */ - // input precision matches a tensor core fragment type - constexpr bool operandsHalfHalfFloat = std::is_same::value && - std::is_same::value && - std::is_same::value; + // input precision matches a tensor core fragment type + constexpr bool operandsHalfHalfFloat = std::is_same::value && + std::is_same::value && + std::is_same::value; + if (use_tc) { if (requestMixed) { BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); + return; } else if (requestDouble) { BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); + return; } else if (operandsHalfHalfFloat) { BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); + return; } else { BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, x, beta, y); + return; } + } #elif defined(KOKKOS_ARCH_VOLTA) /* Volta has float += half * half use it for all matrices */ - if (requestDouble) { - Kokkos::Impl::throw_runtime_exception( - "KokkosSparse::spmv[algorithm=experimental_bsr_tc] " - "tc_precision=double unsupported KOKKOS_ARCH_VOLTA"); + if (use_tc) { + if (requestDouble) { + Kokkos::Impl::throw_runtime_exception( + "KokkosSparse::spmv[algorithm=experimental_bsr_tc] " + "tc_precision=double unsupported KOKKOS_ARCH_VOLTA"); + } + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, + X, beta, + Y); + (void)requestMixed; // unused + return; } - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, - beta, Y); - (void)requestMixed; // unused #endif // KOKKOS_ARCH - if ((mode[0] == KokkosSparse::NoTranspose[0]) || - (mode[0] == KokkosSparse::Conjugate[0])) { - bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); - if (X.extent(1) == 1) { - const auto x0 = Kokkos::subview(X, Kokkos::ALL(), 0); - auto y0 = Kokkos::subview(Y, Kokkos::ALL(), 0); - return Bsr::spMatVec_no_transpose(controls, alpha, A, x0, beta, y0, + if ((mode[0] == KokkosSparse::NoTranspose[0]) || + (mode[0] == KokkosSparse::Conjugate[0])) { + bool useConjugate = (mode[0] == KokkosSparse::Conjugate[0]); + if (X.extent(1) == 1) { + const auto x0 = Kokkos::subview(X, Kokkos::ALL(), 0); + auto y0 = Kokkos::subview(Y, Kokkos::ALL(), 0); + return Bsr::spMatVec_no_transpose(controls, alpha, A, x0, beta, y0, + useConjugate); + } else { + return Bsr::spMatMultiVec_no_transpose(controls, alpha, A, X, beta, Y, + useConjugate); + } + } else if ((mode[0] == KokkosSparse::Transpose[0]) || + (mode[0] == KokkosSparse::ConjugateTranspose[0])) { + bool useConjugate = (mode[0] == KokkosSparse::ConjugateTranspose[0]); + if (X.extent(1) == 1) { + const auto x0 = Kokkos::subview(X, Kokkos::ALL(), 0); + auto y0 = Kokkos::subview(Y, Kokkos::ALL(), 0); + return Bsr::spMatVec_transpose(controls, alpha, A, x0, beta, y0, + useConjugate); + } else { + return Bsr::spMatMultiVec_transpose(controls, alpha, A, X, beta, Y, useConjugate); - } else { - return Bsr::spMatMultiVec_no_transpose(controls, alpha, A, X, beta, Y, - useConjugate); - } - } else if ((mode[0] == KokkosSparse::Transpose[0]) || - (mode[0] == KokkosSparse::ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == KokkosSparse::ConjugateTranspose[0]); - if (X.extent(1) == 1) { - const auto x0 = Kokkos::subview(X, Kokkos::ALL(), 0); - auto y0 = Kokkos::subview(Y, Kokkos::ALL(), 0); - return Bsr::spMatVec_transpose(controls, alpha, A, x0, beta, y0, - useConjugate); - } else { - return Bsr::spMatMultiVec_transpose(controls, alpha, A, X, beta, Y, - useConjugate); - } } } +} }; template Date: Thu, 6 Jan 2022 15:52:31 -0700 Subject: [PATCH 12/15] Fix typo --- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 125 +++++++++--------- 1 file changed, 63 insertions(+), 62 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 445b8657f4..5a0907dd7a 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -217,54 +217,55 @@ struct SPMV_MV_BSRMATRIX::is_complex == false) use_tc = true; - } + } #endif #if defined(KOKKOS_ARCH_AMPERE) - typedef typename XVector::non_const_value_type XScalar; - typedef typename AMatrix::non_const_value_type AScalar; - typedef Kokkos::Experimental::half_t Half; + typedef typename XVector::non_const_value_type XScalar; + typedef typename AMatrix::non_const_value_type AScalar; + typedef Kokkos::Experimental::half_t Half; - /* Ampere has double += double * double and float += half * half + /* Ampere has double += double * double and float += half * half - use whichever is requested. - If none requested, used mixed precision if the inputs are mixed, otherwise - use double - */ + use whichever is requested. + If none requested, used mixed precision if the inputs are mixed, otherwise + use double + */ - // input precision matches a tensor core fragment type - constexpr bool operandsHalfHalfFloat = std::is_same::value && - std::is_same::value && - std::is_same::value; + // input precision matches a tensor core fragment type + constexpr bool operandsHalfHalfFloat = std::is_same::value && + std::is_same::value && + std::is_same::value; - if (use_tc) { - if (requestMixed) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, - X, beta, - Y); - return; - } else if (requestDouble) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); - return; - } else if (operandsHalfHalfFloat) { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, - X, beta, - Y); - return; - } else { - BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, x, beta, y); - return; + if (use_tc) { + if (requestMixed) { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, + X, beta, + Y); + return; + } else if (requestDouble) { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, X, beta, Y); + return; + } else if (operandsHalfHalfFloat) { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, + X, beta, + Y); + return; + } else { + BsrMatrixSpMVTensorCoreDispatcher::dispatch(alpha, A, x, beta, y); + return; + } } - } #elif defined(KOKKOS_ARCH_VOLTA) /* Volta has float += half * half use it for all matrices @@ -284,32 +285,32 @@ struct SPMV_MV_BSRMATRIX Date: Tue, 11 Jan 2022 10:52:55 -0700 Subject: [PATCH 13/15] Replace size_t with size_type --- unit_test/sparse/Test_Sparse_spmv_bsr.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp index 1d59897bb9..6f1523f90f 100644 --- a/unit_test/sparse/Test_Sparse_spmv_bsr.hpp +++ b/unit_test/sparse/Test_Sparse_spmv_bsr.hpp @@ -96,10 +96,10 @@ inline void set_random_value(std::complex &v) { /// \param mat_rowmap[out] CRS-style row map for the block matrix /// \param mat_colidx[out] CRS-style column entries for the block matrix /// \param mat_val[out] Numerical (random) values -template +template void make_block_entries( const KokkosSparse::CrsMatrix &mat_b1, + size_type> &mat_b1, int blockSize, std::vector &mat_rowmap, std::vector &mat_colidx, std::vector &mat_val) { lno_t nRow = blockSize * mat_b1.numRows(); @@ -157,7 +157,7 @@ void check_bsrm_times_v(const char fOp[], scalar_t alpha, scalar_t beta, typename KokkosSparse::CrsMatrix crsMat_t; typedef typename KokkosSparse::CrsMatrix + void, size_type> h_crsMat_t; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef scalar_view_t x_vector_type; @@ -286,7 +286,7 @@ void check_bsrm_times_mv(const char fOp[], scalar_t alpha, scalar_t beta, mat_structure(2, 2) = 0; // Add BC to the top typedef typename KokkosSparse::CrsMatrix + void, size_type> h_crsMat_t; typedef typename KokkosSparse::CrsMatrix From f6051aa98c6443de61e9819364f94e98978dd7c5 Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 11 Jan 2022 11:10:16 -0700 Subject: [PATCH 14/15] Fix error on intel --- src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 6 ++---- unit_test/sparse/Test_Sparse.hpp | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 8c538c657c..a1ae213ea9 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -360,10 +360,9 @@ inline void spm_mv_block_impl_mkl( using YVector = Kokkos::View>; \ using coefficient_type = typename YVector::non_const_value_type; \ - using Controls = KokkosKernels::Experimental::Controls; \ \ static void spmv_bsrmatrix( \ - const KokkosKernels::Experimental::Controls& controls, \ + const KokkosKernels::Experimental::Controls& /*controls*/, \ const char mode[], const coefficient_type& alpha, const AMatrix& A, \ const XVector& X, const coefficient_type& beta, const YVector& Y) { \ std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ @@ -419,10 +418,9 @@ KOKKOSSPARSE_SPMV_MKL(Kokkos::complex, Kokkos::OpenMP, using YVector = Kokkos::View>; \ using coefficient_type = typename YVector::non_const_value_type; \ - using Controls = KokkosKernels::Experimental::Controls; \ \ static void spmv_mv_bsrmatrix( \ - const KokkosKernels::Experimental::Controls& controls, \ + const KokkosKernels::Experimental::Controls& /*controls*/, \ const char mode[], const coefficient_type& alpha, const AMatrix& A, \ const XVector& X, const coefficient_type& beta, const YVector& Y) { \ std::string label = "KokkosSparse::spmv[TPL_MKL,BSRMATRIX" + \ diff --git a/unit_test/sparse/Test_Sparse.hpp b/unit_test/sparse/Test_Sparse.hpp index 26d2830f90..2afa0fb2db 100644 --- a/unit_test/sparse/Test_Sparse.hpp +++ b/unit_test/sparse/Test_Sparse.hpp @@ -15,7 +15,7 @@ #include "Test_Sparse_spiluk.hpp" #include "Test_Sparse_spmv.hpp" //#include "Test_Sparse_spmv_blockcrs.hpp" -#include "Test_Sparse_spmv_bsr.hpp" +//#include "Test_Sparse_spmv_bsr.hpp" #include "Test_Sparse_sptrsv.hpp" #include "Test_Sparse_trsv.hpp" From 319cb36103964947cae840aea8e6a8ecbb8e5dfc Mon Sep 17 00:00:00 2001 From: Ulrich Hetmaniuk Date: Tue, 11 Jan 2022 13:59:36 -0700 Subject: [PATCH 15/15] Remove test for extent = 1 as subview of LayoutRight yields LayoutStride --- .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 26 +++++-------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 5a0907dd7a..cbbbd39f12 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -288,27 +288,13 @@ struct SPMV_MV_BSRMATRIX impl_type; for (typename AMatrix::non_const_size_type j = 0; j < X.extent(1); ++j) { - auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); - auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); + const auto x_j = Kokkos::subview(X, Kokkos::ALL(), j); + auto y_j = Kokkos::subview(Y, Kokkos::ALL(), j); impl_type::spmv_bsrmatrix(controls, mode, alpha, A, x_j, beta, y_j); } }