Skip to content

Commit

Permalink
Make cuSparse TPL available for BsrMatrix SpMV
Browse files Browse the repository at this point in the history
The Kokkos::spmv function was improperly using template parameters to
select the native vs TPL version. A common thread of errors was to assume
the 3rd-to-last template parameter was for TPL availability, when it
was not. There were also further errors in inverting the logic on that
parameter.

We also remove LayoutRight for the BsrMatrix SpMV, as it is not
supported by the underlying cuSparse function.

for X,Y LayoutLeft we want cuSparse to do

C = A * B + C

and for X,Y LayoutRight we want cuSparse to do

trans(C) = A * trans(B) + trans(C)
   -> t(t(C)) = t(A * t(B)) + t(t(C))
   ->       C = t(t(B)) * t(A) + C
   ->       C = B * t(A) + C

That is not possible with the current cuSparse level 3 functions.
  • Loading branch information
cwpearson committed May 26, 2022
1 parent 10dfb89 commit 22d9c98
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 121 deletions.
68 changes: 12 additions & 56 deletions src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_avail.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,93 +195,49 @@ struct spmv_mv_bsrmatrix_tpl_spec_avail {

// These versions of cuSPARSE require the ordinal and offset types to be the
// same. For KokkosKernels, this means int/int only.

#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \
SCALAR, ORDINAL, OFFSET, XL, YL, MEMSPACE) \
template <> \
struct spmv_mv_bsrmatrix_tpl_spec_avail< \
const SCALAR, const ORDINAL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged>, const OFFSET, const SCALAR*, \
XL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>, SCALAR*, \
YL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged>, true> { \
enum : bool { value = true }; \
// cuSparse level 3 does not currently support LayoutRight
#define KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE( \
SCALAR, ORDINAL, OFFSET, LAYOUT, MEMSPACE) \
template <> \
struct spmv_mv_bsrmatrix_tpl_spec_avail< \
const SCALAR, const ORDINAL, Kokkos::Device<Kokkos::Cuda, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged>, const OFFSET, const SCALAR**, \
LAYOUT, Kokkos::Device<Kokkos::Cuda, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>, \
SCALAR**, LAYOUT, Kokkos::Device<Kokkos::Cuda, MEMSPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged>, false> { \
enum : bool { value = true }; \
};

#if (9000 <= CUDA_VERSION)

KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
Kokkos::LayoutLeft,
Kokkos::LayoutLeft,
Kokkos::CudaSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
Kokkos::LayoutLeft,
Kokkos::LayoutLeft,
Kokkos::CudaSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
Kokkos::LayoutRight,
Kokkos::LayoutRight,
Kokkos::CudaSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
Kokkos::LayoutRight,
Kokkos::LayoutRight,
Kokkos::CudaSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
Kokkos::LayoutLeft,
Kokkos::LayoutLeft,
Kokkos::CudaUVMSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
Kokkos::LayoutLeft,
Kokkos::LayoutLeft,
Kokkos::CudaUVMSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(float, int, int,
Kokkos::LayoutRight,
Kokkos::LayoutRight,
Kokkos::CudaUVMSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(double, int, int,
Kokkos::LayoutRight,
Kokkos::LayoutRight,
Kokkos::CudaUVMSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>,
int, int,
Kokkos::LayoutLeft,
Kokkos::LayoutLeft,
Kokkos::CudaSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>,
int, int,
Kokkos::LayoutLeft,
Kokkos::LayoutLeft,
Kokkos::CudaSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>,
int, int,
Kokkos::LayoutRight,
Kokkos::LayoutRight,
Kokkos::CudaSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>,
int, int,
Kokkos::LayoutRight,
Kokkos::LayoutRight,
Kokkos::CudaSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>,
int, int,
Kokkos::LayoutLeft,
Kokkos::LayoutLeft,
Kokkos::CudaUVMSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>,
int, int,
Kokkos::LayoutLeft,
Kokkos::LayoutLeft,
Kokkos::CudaUVMSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<float>,
int, int,
Kokkos::LayoutRight,
Kokkos::LayoutRight,
Kokkos::CudaUVMSpace)
KOKKOSSPARSE_SPMV_MV_BSRMATRIX_TPL_SPEC_AVAIL_CUSPARSE(Kokkos::complex<double>,
int, int,
Kokkos::LayoutRight,
Kokkos::LayoutRight,
Kokkos::CudaUVMSpace)

#endif // CUDA/CUSPARSE >= 9.0?
Expand Down
116 changes: 59 additions & 57 deletions src/impl/tpls/KokkosSparse_spmv_bsrmatrix_tpl_spec_decl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@
//@HEADER
*/

#ifndef KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
#define KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
#ifndef KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
#define KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP

#include "KokkosKernels_Controls.hpp"
#include "KokkosKernels_SparseUtils_mkl.hpp"
Expand Down Expand Up @@ -562,8 +562,24 @@ void spmv_block_impl_cusparse(
// - Only blockDim > 1 is supported
// - Only CUSPARSE_OPERATION_NON_TRANSPOSE is supported
// - Only CUSPARSE_MATRIX_TYPE_GENERAL is supported.
// - Only LayoutLeft for X and Y:
// for X,Y LayoutLeft we want cuSparse to do
// C = A * B + C
// and for X,Y LayoutRight we want cuSparse to do
// trans(C) = A * trans(B) + trans(C)
// -> t(t(C)) = t(A * t(B)) + t(t(C))
// -> C = t(t(B)) * t(A) + C
// -> C = B * t(A) + C
// This is impossible in cuSparse without explicitly transposing C,
// so we just do not support LayoutRight in cuSparse TPL now
//
template <class AMatrix, class XVector, class YVector>
template <
class AMatrix, class XVector, class YVector,
std::enable_if_t<std::is_same<Kokkos::LayoutLeft,
typename XVector::array_layout>::value &&
std::is_same<Kokkos::LayoutLeft,
typename YVector::array_layout>::value,
bool> = true>
void spm_mv_block_impl_cusparse(
const KokkosKernels::Experimental::Controls& controls, const char mode[],
typename YVector::non_const_value_type const& alpha, const AMatrix& A,
Expand All @@ -587,8 +603,15 @@ void spm_mv_block_impl_cusparse(
}

int colx = static_cast<int>(x.extent(1));
int ldx = static_cast<int>(x.stride_1());
int ldy = static_cast<int>(y.stride_1());

// ldx and ldy should be the leading dimension of X,Y respectively
const int ldx = static_cast<int>(x.extent(0));
const int ldy = static_cast<int>(y.extent(0));
if (!std::is_same<typename XVector::array_layout,
Kokkos::LayoutLeft>::value) {
std::cerr << "X,Y must be LayoutLeft cusparse[*]bsrmv.\n";
throw std::invalid_argument("Invalid layout");
}

#if (9000 <= CUDA_VERSION)

Expand Down Expand Up @@ -745,29 +768,31 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int, int, Kokkos::LayoutLeft,
KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int, int,
Kokkos::LayoutRight, Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
#endif
#endif // 9000 <= CUDA_VERSION

#undef KOKKOSSPARSE_SPMV_CUSPARSE

#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, LAYOUT, SPACE, \
COMPILE_LIBRARY) \
// cuSparse TPL does not support LayoutRight for this operation
// only specialize for LayoutLeft
#define KOKKOSSPARSE_SPMV_MV_CUSPARSE(SCALAR, ORDINAL, OFFSET, SPACE, \
ETI_AVAIL) \
template <> \
struct SPMV_MV_BSRMATRIX< \
SCALAR const, ORDINAL const, Kokkos::Device<Kokkos::Cuda, SPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged>, OFFSET const, SCALAR const**, \
LAYOUT, Kokkos::Device<Kokkos::Cuda, SPACE>, \
Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Cuda, SPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>, \
SCALAR**, LAYOUT, Kokkos::Device<Kokkos::Cuda, SPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged>, true, true, COMPILE_LIBRARY> { \
SCALAR**, Kokkos::LayoutLeft, Kokkos::Device<Kokkos::Cuda, SPACE>, \
Kokkos::MemoryTraits<Kokkos::Unmanaged>, false, true, ETI_AVAIL> { \
using device_type = Kokkos::Device<Kokkos::Cuda, SPACE>; \
using memory_trait_type = Kokkos::MemoryTraits<Kokkos::Unmanaged>; \
using AMatrix = BsrMatrix<SCALAR const, ORDINAL const, device_type, \
memory_trait_type, OFFSET const>; \
using XVector = Kokkos::View< \
SCALAR const**, LAYOUT, device_type, \
SCALAR const**, Kokkos::LayoutLeft, device_type, \
Kokkos::MemoryTraits<Kokkos::Unmanaged | Kokkos::RandomAccess>>; \
using YVector = \
Kokkos::View<SCALAR**, LAYOUT, device_type, memory_trait_type>; \
using YVector = Kokkos::View<SCALAR**, Kokkos::LayoutLeft, device_type, \
memory_trait_type>; \
using Controls = KokkosKernels::Experimental::Controls; \
\
using coefficient_type = typename YVector::non_const_value_type; \
Expand All @@ -786,62 +811,39 @@ KOKKOSSPARSE_SPMV_CUSPARSE(Kokkos::complex<float>, int, int,
};

#if (9000 <= CUDA_VERSION)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft,
Kokkos::CudaSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight,
Kokkos::CudaSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft,
Kokkos::CudaSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight,
Kokkos::CudaSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, true)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaSpace, false)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, true)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaSpace, false)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
Kokkos::LayoutLeft, Kokkos::CudaSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
Kokkos::CudaSpace, true)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
Kokkos::LayoutRight, Kokkos::CudaSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
Kokkos::CudaSpace, false)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
Kokkos::LayoutLeft, Kokkos::CudaSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
Kokkos::CudaSpace, true)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
Kokkos::LayoutRight, Kokkos::CudaSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutLeft,
Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::LayoutRight,
Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutLeft,
Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::LayoutRight,
Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
Kokkos::CudaSpace, false)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, true)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(double, int, int, Kokkos::CudaUVMSpace, false)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, true)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(float, int, int, Kokkos::CudaUVMSpace, false)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
Kokkos::LayoutLeft, Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
Kokkos::CudaUVMSpace, true)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<double>, int, int,
Kokkos::LayoutRight, Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
Kokkos::CudaUVMSpace, false)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
Kokkos::LayoutLeft, Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
Kokkos::CudaUVMSpace, true)
KOKKOSSPARSE_SPMV_MV_CUSPARSE(Kokkos::complex<float>, int, int,
Kokkos::LayoutRight, Kokkos::CudaUVMSpace,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY)
#endif
Kokkos::CudaUVMSpace, false)

#endif // 9000 <= CUDA_VERSION

#undef KOKKOSSPARSE_SPMV_MV_CUSPARSE

} // namespace Impl
} // namespace Experimental
} // namespace KokkosSparse

#endif
#endif // KOKKOSKERNELS_ENABLE_TPL_CUSPARSE

#endif // KOKKOSKERNELS_KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
#endif // KOKKOSSPARSE_SPMV_BSRMATRIX_TPL_SPEC_DECL_HPP
17 changes: 9 additions & 8 deletions src/sparse/KokkosSparse_spmv.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -894,8 +894,10 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
//
// Whether to call KokkosKernels' native implementation, even if a TPL impl is
// available
bool useFallback = controls.isParameter("algorithm") &&
controls.getParameter("algorithm") == "native";
bool useFallback =
controls.isParameter("algorithm") &&
(controls.getParameter("algorithm") == "native" ||
controls.getParameter("algorithm") == "experimental_bsr_tc");

#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
// cuSPARSE does not support the modes (C), (T), (H)
Expand Down Expand Up @@ -936,6 +938,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
typename YVector_Internal::array_layout,
typename YVector_Internal::device_type,
typename YVector_Internal::memory_traits,
std::is_integral<typename AMatrix_Internal::const_value_type>::value,
false>::spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i);
Kokkos::Profiling::popRegion();
} else {
Expand All @@ -952,11 +955,9 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
typename YVector_Internal::value_type**,
typename YVector_Internal::array_layout,
typename YVector_Internal::device_type,
typename YVector_Internal::memory_traits>::spmv_mv_bsrmatrix(controls,
mode,
alpha, A_i,
x_i, beta,
y_i);
typename YVector_Internal::memory_traits,
std::is_integral<typename AMatrix_Internal::const_value_type>::value>::
spmv_mv_bsrmatrix(controls, mode, alpha, A_i, x_i, beta, y_i);
}
}

Expand Down Expand Up @@ -1097,7 +1098,7 @@ void spmv(KokkosKernels::Experimental::Controls controls, const char mode[],
/// entries of y; if alpha == 0, ignore the entries of A and x.
///
/// If \c AMatrix is a KokkosSparse::Experimental::BsrMatrix, controls may have
/// \c "algorithm" = \c "experimental_tc_bsr" to use Nvidia tensor cores on
/// \c "algorithm" = \c "experimental_bsr_tc" to use Nvidia tensor cores on
/// Volta or Ampere architectures. On Volta-architecture GPUs the only available
/// precision is mixed-precision fp32 accumulator from fp16 inputs. On
/// Ampere-architecture GPUs (cc >= 80), mixed precision is used when A is fp16,
Expand Down

0 comments on commit 22d9c98

Please sign in to comment.