diff --git a/blas/src/KokkosBlas1_dot.hpp b/blas/src/KokkosBlas1_dot.hpp index ebccce7d7c..aa995836eb 100644 --- a/blas/src/KokkosBlas1_dot.hpp +++ b/blas/src/KokkosBlas1_dot.hpp @@ -96,25 +96,37 @@ dot(const execution_space& space, const XVector& x, const YVector& y) { Kokkos::View>; - result_type result{}; - RVector_Result R = RVector_Result(&result); XVector_Internal X = x; YVector_Internal Y = y; - // Even though RVector is the template parameter, Dot::dot has an overload - // that accepts RVector_Internal (with the special accumulator, if dot_type is - // 32-bit precision). Impl::Dot needs to support both cases, and it's easier - // to do this with overloading than by extending the ETI to deal with two - // different scalar types. - Impl::DotSpecialAccumulator::dot(space, R, - X, Y); - space.fence(); - // mfh 22 Jan 2020: We need the line below because - // Kokkos::complex lacks a constructor that takes a - // Kokkos::complex with U != T. - return Kokkos::Details::CastPossiblyComplex::cast( - result); + bool useFallback = false; + if (useFallback) { + // Even though RVector is the template parameter, Dot::dot has an overload + // that accepts RVector_Internal (with the special accumulator, if dot_type + // is 32-bit precision). Impl::Dot needs to support both cases, and it's + // easier to do this with overloading than by extending the ETI to deal with + // two different scalar types. + result_type result{}; + RVector_Result R = RVector_Result(&result); + Impl::DotSpecialAccumulator::dot(space, + R, X, + Y); + space.fence(); + // mfh 22 Jan 2020: We need the line below because + // Kokkos::complex lacks a constructor that takes a + // Kokkos::complex with U != T. + return Kokkos::Details::CastPossiblyComplex::cast( + result); + } else { + dot_type result{}; + RVector_Internal R = RVector_Internal(&result); + Impl::Dot::dot(space, R, X, Y); + space.fence(); + return Kokkos::Details::CastPossiblyComplex::cast( + result); + } } /// \brief Return the dot product of the two vectors x and y. diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp index ca2139980d..2375aae469 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp @@ -59,11 +59,7 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, #endif -// cuBLAS -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS -// double -#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(SCALAR, LAYOUT, EXECSPACE, \ - MEMSPACE) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC(SCALAR, LAYOUT, EXECSPACE, MEMSPACE) \ template <> \ struct dot_tpl_spec_avail< \ EXECSPACE, \ @@ -77,19 +73,27 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex, Kokkos::LayoutLeft, enum : bool { value = true }; \ }; -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(double, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(float, Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) -KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_CUBLAS(Kokkos::complex, - Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace) +#define KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(float, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(double, LAYOUT, EXECSPACE, MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, \ + MEMSPACE) \ + KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex, LAYOUT, EXECSPACE, MEMSPACE) + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda, + Kokkos::CudaSpace) +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::HIP, + Kokkos::Experimental::HIPSpace) #endif +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Experimental::SYCL, + Kokkos::Experimental::SYCLDeviceUSMSpace) +#endif } // namespace Impl } // namespace KokkosBlas #endif diff --git a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp index 718e32f14c..c5f13dcccd 100644 --- a/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp +++ b/blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp @@ -220,37 +220,47 @@ KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_BLAS(Kokkos::LayoutLeft, Kokkos::HostSpace, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ template <> \ - struct Dot< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + 1, 1, true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,double]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + /* TODO: CUDA-12's 64-bit indices allow larger numElems */ \ + if (numElems <= std::numeric_limits::max()) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ + const int N = static_cast(numElems); \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ + 1, reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -259,81 +269,75 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ - template <> \ - struct Dot< \ - EXECSPACE, \ - Kokkos::View >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View > \ - RV; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XV; \ - typedef typename XV::size_type size_type; \ - \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_CUBLAS,float]"); \ - const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ - dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSdot(s.handle, N, X.data(), one, Y.data(), one, &R()); \ - } else { \ - Dot::dot(space, R, \ - X, Y); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, float, float, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasSdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, double, double, \ + Kokkos::Cuda, Kokkos::CudaSpace, \ + cublasDdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuComplex, Kokkos::Cuda, \ + Kokkos::CudaSpace, cublasCdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, cuDoubleComplex, \ + Kokkos::Cuda, Kokkos::CudaSpace, cublasZdotc, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false) + +} // namespace Impl +} // namespace KokkosBlas + +#endif + +// rocBLAS +#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS +#include -#define KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ ETI_SPEC_AVAIL) \ template <> \ struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View*, LAYOUT, \ + typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ROCBLAS," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + if (numElems <= std::numeric_limits::max()) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZdotc(s.handle, N, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ + const rocblas_int N = static_cast(numElems); \ + KokkosBlas::Impl::RocBlasSingleton& s = \ + KokkosBlas::Impl::RocBlasSingleton::singleton(); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + rocblas_set_stream(s.handle, space.hip_stream())); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL( \ + TPL_DOT(s.handle, N, reinterpret_cast(X.data()), \ + 1, reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R()))); \ + KOKKOS_ROCBLAS_SAFE_CALL_IMPL(rocblas_set_stream(s.handle, NULL)); \ } else { \ Dot::dot(space, R, \ X, Y); \ @@ -342,72 +346,103 @@ namespace Impl { } \ }; -#define KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(LAYOUT, EXECSPACE, MEMSPACE, \ - ETI_SPEC_AVAIL) \ +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, float, float, Kokkos::Experimental::HIP, \ + Kokkos::Experimental::HIPSpace, rocblas_sdot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, double, double, Kokkos::Experimental::HIP, \ + Kokkos::Experimental::HIPSpace, rocblas_ddot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_float_complex, \ + Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace, \ + rocblas_cdotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS( \ + Kokkos::LayoutLeft, Kokkos::complex, rocblas_double_complex, \ + Kokkos::Experimental::HIP, Kokkos::Experimental::HIPSpace, \ + rocblas_zdotc, ETI_SPEC_AVAIL) + +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ROCBLAS_EXT(false) +} // namespace Impl +} // namespace KokkosBlas +#endif + +// ONEMKL +#if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) && defined(KOKKOS_ENABLE_SYCL) +#include +#include +#include + +namespace KokkosBlas { +namespace Impl { + +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL(LAYOUT, KOKKOS_TYPE, TPL_TYPE, \ + EXECSPACE, MEMSPACE, TPL_DOT, \ + ETI_SPEC_AVAIL) \ template <> \ struct Dot, LAYOUT, Kokkos::HostSpace, \ + Kokkos::View >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUT, \ + Kokkos::View, \ Kokkos::MemoryTraits >, \ 1, 1, true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::View, LAYOUT, Kokkos::HostSpace, \ + typedef Kokkos::View > \ RV; \ - typedef Kokkos::View*, LAYOUT, \ + typedef Kokkos::View, \ Kokkos::MemoryTraits > \ XV; \ typedef typename XV::size_type size_type; \ \ - static void dot(const EXECSPACE& space, RV& R, const XV& X, const XV& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::dot[TPL_CUBLAS,complex]"); \ + static void dot(const EXECSPACE& exec, RV& R, const XV& X, const XV& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::dot[TPL_ONEMKL," + \ + Kokkos::ArithTraits::name() + \ + "]"); \ const size_type numElems = X.extent(0); \ - if (numElems < static_cast(INT_MAX)) { \ + if (numElems <= std::numeric_limits::max()) { \ dot_print_specialization(); \ - const int N = static_cast(numElems); \ - constexpr int one = 1; \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasCdotc(s.handle, N, reinterpret_cast(X.data()), \ - one, reinterpret_cast(Y.data()), one, \ - reinterpret_cast(&R())); \ + const std::int64_t N = static_cast(numElems); \ + TPL_DOT(exec.sycl_queue(), N, \ + reinterpret_cast(X.data()), 1, \ + reinterpret_cast(Y.data()), 1, \ + reinterpret_cast(&R())); \ } else { \ - Dot::dot(space, R, \ + Dot::dot(exec, R, \ X, Y); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_DDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_SDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +#define KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, float, float, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, double, double, Kokkos::Experimental::SYCL, \ + Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dot, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) \ + KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL( \ + Kokkos::LayoutLeft, Kokkos::complex, std::complex, \ + Kokkos::Experimental::SYCL, Kokkos::Experimental::SYCLDeviceUSMSpace, \ + oneapi::mkl::blas::row_major::dotc, ETI_SPEC_AVAIL) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_ZDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) - -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, true) -KOKKOSBLAS1_CDOT_TPL_SPEC_DECL_CUBLAS(Kokkos::LayoutLeft, Kokkos::Cuda, - Kokkos::CudaSpace, false) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(true) +KOKKOSBLAS1_DOT_TPL_SPEC_DECL_ONEMKL_EXT(false) } // namespace Impl } // namespace KokkosBlas - #endif #endif