diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp index cd8287b38e..705422ff33 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp @@ -195,93 +195,49 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail { // These versions of cuSPARSE require the ordinal and offset types to be the // same. For KokkosKernels, this means int/int only. - -#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ - SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE) \ - template <> \ - struct spmv_mv_bsrmatrix_tpl_spec_avail< \ - const SCALAR, const ORDINAL, Kokkos::Device, \ - Kokkos::MemoryTraits, const OFFSET, const SCALAR*, \ - XL, Kokkos::Device, \ - Kokkos::MemoryTraits, SCALAR*, \ - YL, Kokkos::Device, \ - Kokkos::MemoryTraits, true> { \ - enum : bool { value = true }; \ +// cuSPARSE level 3 does not currently support LayoutRight +#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \ + SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \ + template <> \ + struct spmv_mv_bsrmatrix_tpl_spec_avail< \ + const SCALAR, const ORDINAL, Kokkos::Device, \ + Kokkos::MemoryTraits, const OFFSET, const SCALAR**, \ + LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, \ + SCALAR**, LAYOUT, Kokkos::Device, \ + Kokkos::MemoryTraits, false> { \ + enum : bool { value = true }; \ }; #if (9000 <= CUDA_VERSION) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, 
Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, Kokkos::CudaSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaUVMSpace) KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, - Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, - Kokkos::CudaUVMSpace) -KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex, - int, int, - Kokkos::LayoutRight, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace) #endif // CUDA/CUSPARSE >= 9.0? 
diff --git a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp index 6ef47f8008..f73c09c712 100644 --- a/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp @@ -42,8 +42,8 @@ //@HEADER */ -#ifndef KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP -#define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP #include "KokkosKernels_Controls.hpp" #include "KokkosKernels_SparseUtils_mkl.hpp" @@ -562,8 +562,24 @@ void spmv_block_impl_cusparse( // - Only blockDim > 1 is supported // - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported // - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported. +// - Only LayoutLeft for X and Y: +// for X,Y LayoutLeft we want cuSparse to do +// C = A * B + C +// and for X,Y LayoutRight we want cuSparse to do +// trans(C) = A * trans(B) + trans(C) +// -> t(t(C)) = t(A * t(B)) + t(t(C)) +// -> C = t(t(B)) * t(A) + C +// -> C = B * t(A) + C +// This is impossible in cuSparse without explicitly transposing C, +// so we just do not support LayoutRight in cuSparse TPL now // -template +template < + class AMatrix, class XVector, class YVector, + std::enable_if_t::value && + std::is_same::value, + bool> = true> void spm_mv_block_impl_cusparse( const KokkosKernels::Experimental::Controls& controls, const char mode[], typename YVector::non_const_value_type const& alpha, const AMatrix& A, @@ -587,8 +603,15 @@ void spm_mv_block_impl_cusparse( } int colx = static_cast(x.extent(1)); - int ldx = static_cast(x.stride_1()); - int ldy = static_cast(y.stride_1()); + + // ldx and ldy should be the leading dimension of X,Y respectively + const int ldx = static_cast(x.extent(0)); + const int ldy = static_cast(y.extent(0)); + if (!std::is_same::value) { + std::cerr << "X,Y must be LayoutLeft 
cusparse[*]bsrmv.\n"; + throw std::invalid_argument("Invalid layout"); + } #if (9000 <= CUDA_VERSION) @@ -745,29 +768,31 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutLeft, KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, Kokkos::LayoutRight, Kokkos::CudaUVMSpace, KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif +#endif // 9000 <= CUDA_VERSION #undef KOKKOSSPARSE_SPMV_CUSPARSE -#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \ - COMPILE_LIBRARY) \ +// cuSparse TPL does not support LayoutRight for this operation +// only specialize for LayoutLeft +#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, SPACE, \ + ETI_AVAIL) \ template <> \ struct SPMV_MV_BSRMATRIX< \ SCALAR const, ORDINAL const, Kokkos::Device, \ Kokkos::MemoryTraits, OFFSET const, SCALAR const**, \ - LAYOUT, Kokkos::Device, \ + Kokkos::LayoutLeft, Kokkos::Device, \ Kokkos::MemoryTraits, \ - SCALAR**, LAYOUT, Kokkos::Device, \ - Kokkos::MemoryTraits, true, true, COMPILE_LIBRARY> { \ + SCALAR**, Kokkos::LayoutLeft, Kokkos::Device, \ + Kokkos::MemoryTraits, false, true, ETI_AVAIL> { \ using device_type = Kokkos::Device; \ using memory_trait_type = Kokkos::MemoryTraits; \ using AMatrix = BsrMatrix; \ using XVector = Kokkos::View< \ - SCALAR const**, LAYOUT, device_type, \ + SCALAR const**, Kokkos::LayoutLeft, device_type, \ Kokkos::MemoryTraits>; \ - using YVector = \ - Kokkos::View; \ + using YVector = Kokkos::View; \ using Controls = KokkosKernels::Experimental::Controls; \ \ using coefficient_type = typename YVector::non_const_value_type; \ @@ -786,55 +811,32 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex, int, int, }; #if (9000 <= CUDA_VERSION) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, 
int, Kokkos::LayoutLeft, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight, - Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, false) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, true) +KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, false) 
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, false) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutLeft, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) + Kokkos::CudaUVMSpace, true) KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, - Kokkos::LayoutRight, Kokkos::CudaUVMSpace, - KOKKOSKERNELS_IMPL_COMPILE_LIBRARY) -#endif + Kokkos::CudaUVMSpace, false) + +#endif // 9000 <= CUDA_VERSION #undef KOKKOSSPARSE_SPMV_MV_CUSPARSE @@ -842,6 +844,6 @@ KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex, int, int, } // namespace Experimental } // namespace KokkosSparse -#endif +#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE -#endif // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP +#endif // KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP diff --git a/src/sparse/KokkosSparse_spmv.hpp b/src/sparse/KokkosSparse_spmv.hpp index 52c9b4e0bf..972bbc74ad 100644 --- a/src/sparse/KokkosSparse_spmv.hpp +++ b/src/sparse/KokkosSparse_spmv.hpp @@ -894,8 +894,10 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], // // Whether to call KokkosKernel's native implementation, even if a TPL impl is // available - bool useFallback = controls.isParameter("algorithm") && - controls.getParameter("algorithm") == "native"; + bool useFallback = + controls.isParameter("algorithm") && + (controls.getParameter("algorithm") == "native" || + controls.getParameter("algorithm") == "experimental_bsr_tc"); #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE // cuSPARSE does not support the modes (C), (T), (H) @@ -936,6 +938,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename YVector_Internal::array_layout, typename 
YVector_Internal::device_type, typename YVector_Internal::memory_traits, + std::is_integral::value, false>::spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); Kokkos::Profiling::popRegion(); } else { @@ -952,11 +955,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], typename YVector_Internal::value_type**, typename YVector_Internal::array_layout, typename YVector_Internal::device_type, - typename YVector_Internal::memory_traits>::spmv_mv_bsrmatrix(controls, - mode, - alpha, A_i, - x_i, beta, - y_i); + typename YVector_Internal::memory_traits, + std::is_integral::value>:: + spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i); } } @@ -1097,7 +1098,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[], /// entries of y; if alpha == 0, ignore the entries of A and x. /// /// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have -/// \c "algorithm" = \c "experimental_tc_bsr" to use Nvidia tensor cores on +/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on /// Volta or Ampere architectures. On Volta-architecture GPUs the only available /// precision is mixed-precision fp32 accumulator from fp16 inputs. On /// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16,