From 86d8371fdb3d528095a86635b9add53a36bb516e Mon Sep 17 00:00:00 2001 From: Carl Pearson Date: Tue, 27 Jun 2023 14:04:16 -0700 Subject: [PATCH] Improve performance of the native BsrMatrix SpMV, especially for single-vector cases. * Adds a new `"v4.2"` BsrMatrix SpMV implementation for non-transpose mode. * It is the default (when TPLs are disabled or not supported) on the GPU for non-transpose mode * The old implementation is retained for all other modes * old implementation may be requested explicitly with `controls.setParameter("algorithm", "v4.1")` * Adds explicit invocation of old `"v4.1"` impl to `KokkosKernels_sparse_spmv_bsr_benchmark` * When TPLs are enabled, the new implementation may be requested anyway with `controls.setParameter("algorithm", "v4.2")` * simplify `KokkosKernels::Impl::always_false_v` * Add `template <typename View> class with_unmanaged` which provides a `type` alias reproducing `View` with `Kokkos::Unmanaged` added to its memory traits * Add `KokkosKernels::Impl::with_unmanaged_t<View>` as an alias for `typename with_unmanaged<View>::type` * Add `template <typename View> auto KokkosKernels::Impl::make_unmanaged(const View &v)` which constructs a `with_unmanaged_t<View>` from `v` * Add `` to `KokkosKernels_Error.hpp` * Add `DieOnError` and `SkipOnError` wrapped `bool`s to give names to boolean function arguments * Link `KokkosKernels_sparse_spmv_bsr_benchmark` against `stdc++fs` for rocm 5.2 * More aggressive block size filtering in `KokkosSparse_crs_detect_block_size.hpp` * Removes a useless warning from `Controls::getParameter` since what happens when a parameter is unset was made explicit in https://github.com/kokkos/kokkos-kernels/commit/be87154a2f83f25c269eb3ce2bcca0b82356a8c5 * `BsrMatrix` constructor throws when the combination of nnz, rows, and columns doesn't make sense * Change `BsrMatrix::block_layout` to `BsrMatrix::block_layout_type` for consistency * Adds `BsrMatrix::unmanaged_block` to return an unmanaged view to a 2D block of values * Adds `BsrMatrix::unmanaged_block_const` to return 
a const unmanaged view to a 2D block of values --- .../KokkosKernels_AlwaysFalse.hpp | 21 +-- common/impl/KokkosKernels_ViewUtils.hpp | 59 +++++++ common/src/KokkosKernels_Error.hpp | 1 + perf_test/Benchmark_Utils.hpp | 45 ++++++ perf_test/sparse/CMakeLists.txt | 6 + .../KokkosSparse_spmv_bsr_benchmark.cpp | 90 ++++++----- .../KokkosSparse_crs_detect_block_size.hpp | 26 ++-- sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp | 16 +- .../KokkosSparse_spmv_bsrmatrix_impl_v42.hpp | 144 ++++++++++++++++++ .../impl/KokkosSparse_spmv_bsrmatrix_spec.hpp | 103 ++++++++++--- sparse/src/KokkosKernels_Controls.hpp | 2 - sparse/src/KokkosSparse_BsrMatrix.hpp | 51 +++++-- sparse/src/KokkosSparse_spmv.hpp | 12 +- ...kosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp | 2 +- sparse/unit_test/Test_Sparse_spmv_bsr.hpp | 7 +- 15 files changed, 477 insertions(+), 108 deletions(-) rename common/{src => impl}/KokkosKernels_AlwaysFalse.hpp (63%) create mode 100644 common/impl/KokkosKernels_ViewUtils.hpp create mode 100644 perf_test/Benchmark_Utils.hpp create mode 100644 sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp diff --git a/common/src/KokkosKernels_AlwaysFalse.hpp b/common/impl/KokkosKernels_AlwaysFalse.hpp similarity index 63% rename from common/src/KokkosKernels_AlwaysFalse.hpp rename to common/impl/KokkosKernels_AlwaysFalse.hpp index 36f4572d29..12acf4a524 100644 --- a/common/src/KokkosKernels_AlwaysFalse.hpp +++ b/common/impl/KokkosKernels_AlwaysFalse.hpp @@ -17,23 +17,12 @@ #ifndef KOKKOSKERNELS_ALWAYSFALSE_HPP #define KOKKOSKERNELS_ALWAYSFALSE_HPP -#include +namespace KokkosKernels::Impl { -/*! 
\file KokkosKernels_AlwaysFalse.hpp - \brief A convenience type to be used in a static_assert that should always - fail -*/ +// for use in static asserts +template +inline constexpr bool always_false_v = false; -namespace KokkosKernels { -namespace Impl { - -template -using always_false = std::false_type; - -template -inline constexpr bool always_false_v = always_false::value; - -} // namespace Impl -} // namespace KokkosKernels +} // namespace KokkosKernels::Impl #endif // KOKKOSKERNELS_ALWAYSFALSE_HPP diff --git a/common/impl/KokkosKernels_ViewUtils.hpp b/common/impl/KokkosKernels_ViewUtils.hpp new file mode 100644 index 0000000000..2ae8fb609d --- /dev/null +++ b/common/impl/KokkosKernels_ViewUtils.hpp @@ -0,0 +1,59 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSKERNELS_VIEWUTILS_HPP +#define KOKKOSKERNELS_VIEWUTILS_HPP +#include "Kokkos_Core.hpp" + +namespace KokkosKernels::Impl { + +/*! \brief Yields a type that is View with Kokkos::Unmanaged added to the memory + * traits + */ +template +class with_unmanaged { + using data_type = typename View::data_type; + using layout_type = typename View::array_layout; + using memory_space = typename View::memory_space; + + using orig_traits = typename View::memory_traits; + static constexpr unsigned new_traits = + orig_traits::impl_value | Kokkos::Unmanaged; + + public: + using type = Kokkos::View >; +}; + +/*! 
\brief A type that is View with Kokkos::Unmanaged added to the memory traits + + \tparam View the type to add Kokkos::Unmanaged to + */ +template +using with_unmanaged_t = typename with_unmanaged::type; + +/*! \brief Returns an unmanaged version of v + + \tparam View the type of the input view v + */ +template +auto make_unmanaged(const View &v) { + return typename with_unmanaged::type(v); +} + +} // namespace KokkosKernels::Impl + +#endif diff --git a/common/src/KokkosKernels_Error.hpp b/common/src/KokkosKernels_Error.hpp index 4d732a8437..52aa6d88da 100644 --- a/common/src/KokkosKernels_Error.hpp +++ b/common/src/KokkosKernels_Error.hpp @@ -18,6 +18,7 @@ #define KOKKOSKERNELS_ERROR_HPP #include +#include namespace KokkosKernels { namespace Impl { diff --git a/perf_test/Benchmark_Utils.hpp b/perf_test/Benchmark_Utils.hpp new file mode 100644 index 0000000000..8f34182f41 --- /dev/null +++ b/perf_test/Benchmark_Utils.hpp @@ -0,0 +1,45 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER +*/ + +#ifndef KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP +#define KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP + +namespace KokkosKernelsBenchmark { + +class WrappedBool { + public: + WrappedBool(const bool &val) : val_(val) {} + + operator bool() const { return val_; } + + protected: + bool val_; +}; + +class DieOnError : public WrappedBool { + public: + DieOnError(const bool &val) : WrappedBool(val) {} +}; +class SkipOnError : public WrappedBool { + public: + SkipOnError(const bool &val) : WrappedBool(val) {} +}; + +} // namespace KokkosKernelsBenchmark + +#endif // KOKKOSKERNELS_PERFTEST_BENCHMARK_UTILS_HPP \ No newline at end of file diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index 2039276c79..8a994b4122 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -139,4 +139,10 @@ if (KokkosKernels_ENABLE_BENCHMARK) KOKKOSKERNELS_ADD_BENCHMARK( sparse_spmv_bsr_benchmark SOURCES KokkosSparse_spmv_bsr_benchmark.cpp ) + + # hipcc 5.2 has an underlying clang that has the std::filesystem + # in an experimental namespace and a different library + if (Kokkos_CXX_COMPILER_ID STREQUAL HIPCC AND Kokkos_CXX_COMPILER_VERSION VERSION_LESS 5.3) + target_link_libraries(KokkosKernels_sparse_spmv_bsr_benchmark PRIVATE -lstdc++fs) + endif() endif() diff --git a/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp index 933917c1a6..770b09cfb1 100644 --- a/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp +++ b/perf_test/sparse/KokkosSparse_spmv_bsr_benchmark.cpp @@ -45,6 +45,7 @@ namespace fs = std::filesystem; #include +#include "Benchmark_Utils.hpp" #include "KokkosSparse_CrsMatrix.hpp" #include "KokkosSparse_IOUtils.hpp" #include "KokkosSparse_spmv.hpp" @@ -121,27 +122,6 @@ size_t detect_block_size(const fs::path &path) { return cache.at(path); } -// a bool by a different name, to make 
its purpose clear -class DieOnError { - public: - DieOnError(const bool &val) : val_(val) {} - - operator bool() const { return val_; } - - private: - bool val_; -}; - -// a bool by a different name, to make its purpose clear -class SkipOnError { - public: - SkipOnError(const bool &val) : val_(val) {} - operator bool() const { return val_; } - - private: - bool val_; -}; - // Test that y_act is close to y_exp. // This needs the matrix, alpha, and beta to compute the error tolerance // properly @@ -235,6 +215,20 @@ struct SpmvNative { static std::string name() { return "native"; } }; +// Wrapper to create a common interface for all SpMVs to benchmark +struct SpmvV41 { + template + static void spmv(const char *mode, const Alpha &alpha, const Matrix &crs, + const XView &x, const Beta &beta, const YView &y) { + KokkosKernels::Experimental::Controls controls; + controls.setParameter("algorithm", "v4.1"); + return KokkosSparse::spmv(controls, mode, alpha, crs, x, beta, y); + } + + static std::string name() { return "v4.1"; } +}; + template void run(benchmark::State &state, const Bsr &bsr, const size_t k) { using execution_space = typename Bsr::execution_space; @@ -269,10 +263,10 @@ void run(benchmark::State &state, const Bsr &bsr, const size_t k) { const char *mode = KokkosSparse::NoTranspose; // test the SpMV against whatever the default is - KokkosSparse::spmv(mode, alpha, bsr, x, beta, y_exp); - Kokkos::fence(); Spmv::spmv(mode, alpha, bsr, x, beta, y_act); Kokkos::fence(); + KokkosSparse::spmv(mode, alpha, bsr, x, beta, y_exp); + Kokkos::fence(); check_correctness(state, y_exp, y_act, bsr, alpha, beta, DieOnError(false), SkipOnError(true)); @@ -299,7 +293,6 @@ void run(benchmark::State &state, const Bsr &bsr, const size_t k) { template void read_expand_run(benchmark::State &state, const fs::path &path, const size_t blockSize, const size_t k) { - using device_type = typename Bsr::device_type; using scalar_type = typename Bsr::non_const_value_type; using ordinal_type = 
typename Bsr::non_const_ordinal_type; @@ -322,7 +315,6 @@ void read_expand_run(benchmark::State &state, const fs::path &path, template void read_convert_run(benchmark::State &state, const fs::path &path, const size_t blockSize, const size_t k) { - using device_type = typename Bsr::device_type; using scalar_type = typename Bsr::non_const_value_type; using ordinal_type = typename Bsr::non_const_ordinal_type; @@ -386,27 +378,53 @@ template void register_converts(const fs::path &path, const size_t bs) { std::cerr << "benchmarks will use detected blocksize\n"; // clang-format off - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); - register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + + register_convert_type(path, bs); + register_convert_type(path, bs); + register_convert_type(path, bs); + // clang-format on } template void register_expands(const fs::path &path) { - register_expand_type(path); - register_expand_type(path); + std::cerr << "benchmarks will expand each non-zero into a larger block\n"; + // clang-format off register_expand_type(path); register_expand_type(path); + register_expand_type(path); + + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); + register_expand_type(path); register_expand_type(path); + register_expand_type(path); + 
register_expand_type(path); register_expand_type(path); + register_expand_type(path); + // clang-format on } template @@ -425,10 +443,8 @@ void register_path(const fs::path &path) { Otherwise, expand the matrix to some arbitrary block sizes to test BSR */ if (detectedSize != 1) { - std::cerr << "benchmarks will use detected size\n"; register_converts(path, detectedSize); } else { - std::cerr << "benchmarks will expand each non-zero into a larger block\n"; register_expands(path); } } diff --git a/sparse/impl/KokkosSparse_crs_detect_block_size.hpp b/sparse/impl/KokkosSparse_crs_detect_block_size.hpp index 42d4eddf89..418f2a74cc 100644 --- a/sparse/impl/KokkosSparse_crs_detect_block_size.hpp +++ b/sparse/impl/KokkosSparse_crs_detect_block_size.hpp @@ -28,8 +28,7 @@ for performance-sensitive use. */ -namespace KokkosSparse { -namespace Impl { +namespace KokkosSparse::Impl { /** * \class BlockPopulations @@ -86,14 +85,14 @@ class BlockPopulations { * @return The largest block size that results in completely dense blocks The smallest valid block size is 1 Since blocks must be dense, sqrt(nnz), num rows, num cols, and min nnz/row - among non-empty rows are all easy upper bounds of the block size Block sizes - are tested from 1 to the minimum of the above The matrix dimensions must divide - evenly into a trial block size (otherwise a block would not be full) - Furthermore, if a block size of N is not dense, any multiple of N will also not - be dense, and can be skipped. This is because blocks of 2N contain blocks of N, - at least one of which is already known not to be dense. In practice, this ends - up testing only small composite factors and all prime factors up to the upper - bound + among non-empty rows are all easy upper bounds of the block size. + Block sizes are tested from 1 to the minimum of the above. + The matrix dimensions must divide evenly into a trial block size (otherwise a + block would not be full). 
Furthermore, if a block size of N is not dense, any + multiple of N will also not be dense, and can be skipped. This is because + blocks of 2N contain blocks of N, at least one of which is already known not to + be dense. In practice, this ends up testing only small composite factors and + all prime factors up to the upper bound. */ template size_t detect_block_size(const Crs &crs) { @@ -124,12 +123,14 @@ size_t detect_block_size(const Crs &crs) { for (size_t trialSize = 2; trialSize <= upperBound; ++trialSize) { // trial size must be factor of rows / cols if ((crs.numRows() % trialSize) || (crs.numCols() % trialSize)) { + rejectedSizes.push_back(trialSize); continue; } // trial size must not be a multiple of previously-rejected size if (std::any_of(rejectedSizes.begin(), rejectedSizes.end(), [&](size_t f) { return trialSize % f == 0; })) { + rejectedSizes.push_back(trialSize); continue; } @@ -152,7 +153,6 @@ size_t detect_block_size(const Crs &crs) { return largestBlockSize; } -} // namespace Impl -} // namespace KokkosSparse +} // namespace KokkosSparse::Impl -#endif // KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP \ No newline at end of file +#endif // KOKKOSSPARSE_CRS_DETECT_BLOCK_SIZE_HPP diff --git a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp index 8e4c187b99..7f1ff2171e 100644 --- a/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp +++ b/sparse/impl/KokkosSparse_crs_to_bsr_impl.hpp @@ -23,7 +23,21 @@ Bsr expand_crs_to_bsr(const Crs &crs, size_t blockSize) { using crs_row_map_type = typename Crs::row_map_type; using bsr_row_map_type = Kokkos::View; + bsr_device_type>; // need non-const version + + using bsr_size_type = typename Bsr::non_const_size_type; + + { + size_t nnz = crs.nnz() * blockSize * blockSize; + if (nnz > size_t(Kokkos::ArithTraits::max())) { + std::stringstream ss; + ss << "expanding " << crs.nnz() + << " non-zeros of CrsMatrix into blocks of " << blockSize + << " would overflow size_type of requested 
BsrMatrix " + << Kokkos::ArithTraits::name(); + throw std::runtime_error(ss.str()); + } + } // construct the Bsr row map bsr_row_map_type bsrRowMap("bsrRowMap", crs.graph.row_map.size()); diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp new file mode 100644 index 0000000000..9c5858a307 --- /dev/null +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl_v42.hpp @@ -0,0 +1,144 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP +#define KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP + +#include + +#include + +namespace KokkosSparse { +namespace Impl { + +/* One thread for each entry in the product multivector + + Each thread accumulates the partial products for its entry, and writes it + out. 
+*/ +template +class BsrSpmvV42NonTrans { + Alpha alpha_; + AMatrix a_; + XVector x_; + Beta beta_; + YVector y_; + + public: + BsrSpmvV42NonTrans(const Alpha &alpha, const AMatrix &a, const XVector &x, + const Beta &beta, const YVector &y) + : alpha_(alpha), a_(a), x_(x), beta_(beta), y_(y) {} + + template + KOKKOS_INLINE_FUNCTION void impl(const size_t k) const { + using a_ordinal_type = typename AMatrix::non_const_ordinal_type; + using a_size_type = typename AMatrix::non_const_size_type; + using y_value_type = typename YVector::non_const_value_type; + using const_block_type = typename AMatrix::const_block_type; + + const a_ordinal_type irhs = k / y_.extent(0); + const a_ordinal_type row = k % y_.extent(0); + + // scale by beta + if (0 == beta_) { + y_(row, irhs) = 0; // convert NaN to 0 + } else if (1 != beta_) { + y_(row, irhs) *= beta_; + } + + // for non-zero template instantiations, + // constant propagation should optimize divmod + a_ordinal_type blocksz; + if constexpr (0 == BLOCK_SIZE) { + blocksz = a_.blockDim(); + } else { + blocksz = BLOCK_SIZE; + } + + if (0 != alpha_) { + const a_ordinal_type blockRow = row / blocksz; + const a_ordinal_type lclrow = row % blocksz; + y_value_type accum = 0; + const a_size_type j_begin = a_.graph.row_map(blockRow); + const a_size_type j_end = a_.graph.row_map(blockRow + 1); + for (a_size_type j = j_begin; j < j_end; ++j) { + const_block_type b = a_.unmanaged_block_const(j); + const a_ordinal_type blockcol = a_.graph.entries(j); + const a_ordinal_type x_start = blockcol * blocksz; + + const auto x_lcl = Kokkos::subview( + x_, Kokkos::make_pair(x_start, x_start + blocksz), irhs); + for (a_ordinal_type i = 0; i < blocksz; ++i) { + accum += b(lclrow, i) * x_lcl(i); + } + } + y_(row, irhs) += alpha_ * accum; + } + } + + KOKKOS_INLINE_FUNCTION void operator()(const size_t k) const { + if (false) { + } + // clang-format off + else if ( 1 == a_.blockDim()) { impl< 1>(k); } + else if ( 2 == a_.blockDim()) { impl< 2>(k); } + 
else if ( 3 == a_.blockDim()) { impl< 3>(k); } + else if ( 4 == a_.blockDim()) { impl< 4>(k); } + else if ( 5 == a_.blockDim()) { impl< 5>(k); } + else if ( 6 == a_.blockDim()) { impl< 6>(k); } + else if ( 7 == a_.blockDim()) { impl< 7>(k); } + else if ( 8 == a_.blockDim()) { impl< 8>(k); } + else if ( 9 == a_.blockDim()) { impl< 9>(k); } + else if (10 == a_.blockDim()) { impl<10>(k); } + else if (11 == a_.blockDim()) { impl<11>(k); } + // clang-format on + else { + impl<0>(k); + } + } +}; + +template +void apply_v42(const Alpha &alpha, const AMatrix &a, const XVector &x, + const Beta &beta, const YVector &y) { + using execution_space = typename YVector::execution_space; + + Kokkos::RangePolicy policy(0, y.size()); + if constexpr (YVector::rank == 1) { + // Implementation expects a 2D view, so create an unmanaged 2D view + // with extent 1 in the second dimension + using Y2D = KokkosKernels::Impl::with_unmanaged_t>; + using X2D = KokkosKernels::Impl::with_unmanaged_t>; + const Y2D yu(y.data(), y.extent(0), 1); + const X2D xu(x.data(), x.extent(0), 1); + BsrSpmvV42NonTrans op(alpha, a, xu, beta, yu); + Kokkos::parallel_for(policy, op); + } else { + BsrSpmvV42NonTrans op(alpha, a, x, beta, y); + Kokkos::parallel_for(policy, op); + } +} + +} // namespace Impl +} // namespace KokkosSparse + +#endif // KOKKOSSPARSE_BSRMATRIX_SPMV_IMPL_V42_HPP diff --git a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp index 678aaaa0c5..69ff744e9d 100644 --- a/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp +++ b/sparse/impl/KokkosSparse_spmv_bsrmatrix_spec.hpp @@ -25,6 +25,7 @@ #include "KokkosKernels_Error.hpp" #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY #include +#include "KokkosSparse_spmv_bsrmatrix_impl_v42.hpp" #endif namespace KokkosSparse { @@ -136,6 +137,11 @@ struct SPMV_MV_BSRMATRIX { // actual implementations to be compiled #if !defined(KOKKOSKERNELS_ETI_ONLY) || 
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY +// these should all be different +constexpr inline const char *ALG_V41 = "v4.1"; +constexpr inline const char *ALG_V42 = "v4.2"; +constexpr inline const char *ALG_TC = "experimental_bsr_tc"; + template struct SPMV_BSRMATRIX() || + controls.getParameter("algorithm") == ALG_V42) { + if (modeIsNoTrans) { + ::KokkosSparse::Impl::apply_v42(alpha, A, X, beta, Y); + return; + } + } + + // fall back to V41 all else fails + if (modeIsNoTrans || modeIsConjugate) { return Bsr::spMatVec_no_transpose(controls, alpha, A, X, beta, Y, - useConjugate); - } else if ((mode[0] == Transpose[0]) || - (mode[0] == ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == ConjugateTranspose[0]); + modeIsConjugate); + } else if (modeIsTrans || modeIsConjugateTrans) { return Bsr::spMatVec_transpose(controls, alpha, A, X, beta, Y, - useConjugate); + modeIsConjugateTrans); + } + + { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " "; + ss << "Internal logic error: no applicable BsrMatrix SpMV implementation " + ". 
Please report this"; + throw std::runtime_error(ss.str()); } } }; @@ -194,7 +231,7 @@ struct SPMV_MV_BSRMATRIX::is_complex) method = Method::Fallback; @@ -289,17 +326,49 @@ struct SPMV_MV_BSRMATRIX() || + controls.getParameter("algorithm") == ALG_V42) { + if (modeIsNoTrans) { + ::KokkosSparse::Impl::apply_v42(alpha, A, X, beta, Y); + return; + } + } - if ((mode[0] == NoTranspose[0]) || (mode[0] == Conjugate[0])) { - bool useConjugate = (mode[0] == Conjugate[0]); + // use V41 as the ultimate fallback + if (modeIsNoTrans || modeIsConjugate) { return Bsr::spMatMultiVec_no_transpose(controls, alpha, A, X, beta, Y, - useConjugate); - } else if ((mode[0] == Transpose[0]) || - (mode[0] == ConjugateTranspose[0])) { - bool useConjugate = (mode[0] == ConjugateTranspose[0]); + modeIsConjugate); + } else if (modeIsTrans || modeIsConjugateTrans) { return Bsr::spMatMultiVec_transpose(controls, alpha, A, X, beta, Y, - useConjugate); + modeIsConjugateTrans); + } + + { + std::stringstream ss; + ss << __FILE__ << ":" << __LINE__ << " "; + ss << "Internal logic error: no applicable BsrMatrix SpMV implementation " + ". Please report this"; + throw std::runtime_error(ss.str()); } } }; diff --git a/sparse/src/KokkosKernels_Controls.hpp b/sparse/src/KokkosKernels_Controls.hpp index c600dad89a..0bb8f79ff0 100644 --- a/sparse/src/KokkosKernels_Controls.hpp +++ b/sparse/src/KokkosKernels_Controls.hpp @@ -64,8 +64,6 @@ class Controls { const std::string& orUnset = "") const { auto search = kernel_parameters.find(name); if (kernel_parameters.end() == search) { - std::cerr << "WARNING: Controls::getParameter for name \"" << name - << "\" was unset" << std::endl; return orUnset; } else { return search->second; diff --git a/sparse/src/KokkosSparse_BsrMatrix.hpp b/sparse/src/KokkosSparse_BsrMatrix.hpp index a366245a86..b36143c14b 100644 --- a/sparse/src/KokkosSparse_BsrMatrix.hpp +++ b/sparse/src/KokkosSparse_BsrMatrix.hpp @@ -393,10 +393,18 @@ class BsrMatrix { //! 
Nonconst version of the type of the entries in the sparse matrix. typedef typename values_type::non_const_value_type non_const_value_type; - // block values are actually a 1-D view, however they are implicitly - // arranged in LayoutRight, e.g. consecutive entries in the values view - // are consecutive entries within a row inside a block - using block_layout = Kokkos::LayoutRight; + //! block values are actually a 1-D view, however they are implicitly + //! arranged in LayoutRight, e.g. consecutive entries in the values view + //! are consecutive entries within a row inside a block + using block_layout_type = Kokkos::LayoutRight; + + //! Type returned by \c unmanaged_block + using block_type = Kokkos::View; + + //! Type returned by \c unmanaged_block_const + using const_block_type = Kokkos::View; /// \name Storage of the actual sparsity structure and values. /// @@ -480,15 +488,12 @@ class BsrMatrix { /// \param cols [in] The column indices. cols[k] is the column /// index of val[k]. /// \param blockdim [in] The block size of the constructed BsrMatrix. - /// \param pad [in] If true, pad the sparse matrix's storage with - /// zeros in order to improve cache alignment and / or - /// vectorization. + /// \param pad [in] Ignored /// /// The \c pad argument is currently not used. 
BsrMatrix(const std::string& label, OrdinalType nrows, OrdinalType ncols, size_type annz, ScalarType* vals, OrdinalType* rows, OrdinalType* cols, OrdinalType blockdim, bool pad = false) { - (void)label; (void)pad; blockDim_ = blockdim; @@ -517,6 +522,16 @@ class BsrMatrix { "BsrMatrix:: annz should be a multiple of the number of entries in a " "block"); } + if (annz % (blockDim_ * blockDim_)) { + throw std::runtime_error( + "BsrMatrix:: annz should be a multiple of the number of entries in a " + "block"); + } + if (annz % (blockDim_ * blockDim_)) { + throw std::runtime_error( + "BsrMatrix:: annz should be a multiple of the number of entries in a " + "block"); + } using Coord = std::pair; // row, col using CoordComp = std::function, + std::is_same_v, "A blocks must be stored layout-right"); rocsparse_direction dir = rocsparse_direction_row; diff --git a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp index b2883c1e91..695f03e67f 100644 --- a/sparse/unit_test/Test_Sparse_spmv_bsr.hpp +++ b/sparse/unit_test/Test_Sparse_spmv_bsr.hpp @@ -373,7 +373,8 @@ void test_spmv_combos(const char *mode, const Bsr &a) { auto [x, y] = random_vecs_for_spmv(mode, a); - for (auto alg : {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (auto alg : + {(const char *)(nullptr), "native", "experimental_tc", "v4.1", "v4.2"}) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1), scalar_type(-1), @@ -569,8 +570,8 @@ void test_spm_mv_combos(const char *mode, const Bsr &a) { for (size_t numVecs : {1, 2, 7}) { // num multivecs auto [x, y] = random_multivecs_for_spm_mv(mode, a, numVecs); - for (auto alg : - {(const char *)(nullptr), "native", "experimental_tc_bsr"}) { + for (auto alg : {(const char *)(nullptr), "native", "experimental_tc", + "v4.1", "v4.2"}) { for (scalar_type alpha : {scalar_type(0), scalar_type(1), scalar_type(-1), 
scalar_type(3.7)}) { for (scalar_type beta : {scalar_type(0), scalar_type(1),