From 693a19c7d59fd2b0b147ceaed859cd31ee496a1a Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 6 Oct 2021 19:34:14 -0400 Subject: [PATCH 1/3] Stream interface: adding stream support in GEMM This will allow users to queue up kernels in individual stream when the GPU cannot be fully utilized by a single kernel call. This might happen if the matrices are not so large to occupy the full GPU but also not small enough to be batched. --- src/blas/KokkosBlas3_gemm.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/blas/KokkosBlas3_gemm.hpp b/src/blas/KokkosBlas3_gemm.hpp index 52e02e8e60..6ac4186ae0 100644 --- a/src/blas/KokkosBlas3_gemm.hpp +++ b/src/blas/KokkosBlas3_gemm.hpp @@ -121,6 +121,7 @@ bool gemv_based_gemm( /// \tparam BViewType Input matrix, as a 2-D Kokkos::View /// \tparam CViewType Output matrix, as a nonconst 2-D Kokkos::View /// +/// \param space [in] an execution space instance /// \param transA [in] "N" for non-transpose, "T" for transpose, "C" /// for conjugate transpose. All characters after the first are /// ignored. This works just like the BLAS routines. From 557250c30045b3c66a7d80a112fc560d52103297 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Tue, 12 Oct 2021 00:19:09 -0600 Subject: [PATCH 2/3] Stream interface: adding gemv interface to support gemv_based_gemm This will probably now require some more testing gemv and maybe also additional testing in gemm to cover both gemv_based_gemm and dot_based_gemm. --- src/blas/KokkosBlas2_gemv.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/blas/KokkosBlas2_gemv.hpp b/src/blas/KokkosBlas2_gemv.hpp index 9353428da2..2c296d1f56 100644 --- a/src/blas/KokkosBlas2_gemv.hpp +++ b/src/blas/KokkosBlas2_gemv.hpp @@ -63,6 +63,9 @@ namespace KokkosBlas { /// \tparam AlphaCoeffType Type of input coefficient alpha /// \tparam BetaCoeffType Type of input coefficient beta /// +/// \param space [in] execution space instance on which to run the +/// kernel. This may contain information about which stream to +/// run on. /// \param trans [in] "N" for non-transpose, "T" for transpose, "C" /// for conjugate transpose. All characters after the first are /// ignored. This works just like the BLAS routines. From 064249bc61fcc22eae9f7f5ac17e03ff54424037 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 13 Oct 2021 09:20:09 -0600 Subject: [PATCH 3/3] Stream interface: clean-up some cublas TPL calls Adding a new macro KOKKOS_CUBLAS_SAFE_CALL_IMPL to check error codes returned by cublas API. If the error code is not CUBLAS_STATUS_SUCCESS, the macro prints the error type and message as well as the filename and line where the error was detected and throws a runtime exception. --- src/blas/KokkosBlas2_gemv.hpp | 36 +- src/blas/KokkosBlas3_gemm.hpp | 51 +- src/blas/impl/KokkosBlas2_gemv_impl.hpp | 28 +- src/blas/impl/KokkosBlas2_gemv_spec.hpp | 11 +- .../impl/KokkosBlas3_gemm_dotbased_impl.hpp | 7 +- src/blas/impl/KokkosBlas3_gemm_impl.hpp | 7 +- src/blas/impl/KokkosBlas3_gemm_spec.hpp | 23 +- .../tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp | 529 +++++++++--------- .../tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp | 472 ++++++++-------- src/impl/tpls/KokkosBlas_Cuda_tpl.hpp | 6 +- src/impl/tpls/KokkosBlas_tpl_spec.hpp | 60 ++ unit_test/blas/Test_Blas3_gemm.hpp | 137 ++++- 12 files changed, 838 insertions(+), 529 deletions(-) diff --git a/src/blas/KokkosBlas2_gemv.hpp b/src/blas/KokkosBlas2_gemv.hpp index 2c296d1f56..9b8189f994 100644 --- a/src/blas/KokkosBlas2_gemv.hpp +++ b/src/blas/KokkosBlas2_gemv.hpp @@ -75,9 +75,10 @@ namespace KokkosBlas { /// \param beta [in] Input coefficient of y /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View template -void gemv(const char trans[], typename AViewType::const_value_type& alpha, - const AViewType& A, const XViewType& x, - typename YViewType::const_value_type& beta, const YViewType& y) { +void gemv(const typename AViewType::execution_space& space, const char trans[], + typename AViewType::const_value_type& alpha, const AViewType& A, + const XViewType& x, typename YViewType::const_value_type& beta, + const YViewType& y) { static_assert(Kokkos::Impl::is_view::value, "AViewType must be a Kokkos::View."); static_assert(Kokkos::Impl::is_view::value, @@ -147,13 +148,38 @@ void gemv(const char trans[], typename AViewType::const_value_type& alpha, const bool eti_spec_avail = KokkosBlas::Impl::gemv_eti_spec_avail::value; typedef Impl::GEMV fallback_impl_type; - fallback_impl_type::gemv(trans, alpha, A, x, beta, y); + fallback_impl_type::gemv(space, trans, alpha, A, x, beta, y); } else { typedef Impl::GEMV impl_type; - impl_type::gemv(trans, alpha, A, x, beta, y); + impl_type::gemv(space, trans, alpha, A, x, beta, y); } } +/// \brief Dense matrix-vector multiply: y = beta*y + alpha*A*x. +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// \tparam XViewType Input vector, as a 1-D Kokkos::View +/// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View +/// \tparam AlphaCoeffType Type of input coefficient alpha +/// \tparam BetaCoeffType Type of input coefficient beta +/// +/// \param trans [in] "N" for non-transpose, "T" for transpose, "C" +/// for conjugate transpose. All characters after the first are +/// ignored. This works just like the BLAS routines. +/// \param alpha [in] Input coefficient of A*x +/// \param A [in] Input matrix, as a 2-D Kokkos::View +/// \param x [in] Input vector, as a 1-D Kokkos::View +/// \param beta [in] Input coefficient of y +/// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View +template +void gemv(const char trans[], typename AViewType::const_value_type& alpha, + const AViewType& A, const XViewType& x, + typename YViewType::const_value_type& beta, const YViewType& y) { + const typename AViewType::execution_space space = + typename AViewType::execution_space(); + gemv(space, trans, alpha, A, x, beta, y); +} + } // namespace KokkosBlas #endif // KOKKOS_BLAS2_MV_HPP_ diff --git a/src/blas/KokkosBlas3_gemm.hpp b/src/blas/KokkosBlas3_gemm.hpp index 6ac4186ae0..1e26516f01 100644 --- a/src/blas/KokkosBlas3_gemm.hpp +++ b/src/blas/KokkosBlas3_gemm.hpp @@ -67,10 +67,10 @@ namespace Impl { // cuBLAS. template bool gemv_based_gemm( - const char transA[], const char transB[], - typename AViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B, typename CViewType::const_value_type& beta, - const CViewType& C, + const typename CViewType::execution_space& space, const char transA[], + const char transB[], typename AViewType::const_value_type& alpha, + const AViewType& A, const BViewType& B, + typename CViewType::const_value_type& beta, const CViewType& C, typename std::enable_if::value && !std::is_same> Cvec(C.data(), C.extent(0)); - KokkosBlas::gemv("N", alpha, A, Bvec, beta, Cvec); + KokkosBlas::gemv(space, "N", alpha, A, Bvec, beta, Cvec); return true; } return false; @@ -102,6 +102,7 @@ bool gemv_based_gemm( // tests. template bool gemv_based_gemm( + const typename CViewType::execution_space& /*space*/, const char /*transA*/[], const char /*transB*/[], typename AViewType::const_value_type& /*alpha*/, const AViewType& /*A*/, const BViewType& /*B*/, typename CViewType::const_value_type& /*beta*/, @@ -134,10 +135,10 @@ bool gemv_based_gemm( /// \param beta [in] Input coefficient of C /// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View template -void gemm(const char transA[], const char transB[], - typename AViewType::const_value_type& alpha, const AViewType& A, - const BViewType& B, typename CViewType::const_value_type& beta, - const CViewType& C) { +void gemm(const typename CViewType::execution_space& space, const char transA[], + const char transB[], typename AViewType::const_value_type& alpha, + const AViewType& A, const BViewType& B, + typename CViewType::const_value_type& beta, const CViewType& C) { #if (KOKKOSKERNELS_DEBUG_LEVEL > 0) static_assert(Kokkos::Impl::is_view::value, "AViewType must be a Kokkos::View."); @@ -204,7 +205,8 @@ void gemm(const char transA[], const char transB[], } // Check if gemv code path is allowed and profitable, and if so run it. - if (Impl::gemv_based_gemm(transA, transB, alpha, A, B, beta, C)) return; + if (Impl::gemv_based_gemm(space, transA, transB, alpha, A, B, beta, C)) + return; // Minimize the number of Impl::GEMM instantiations, by // standardizing on particular View specializations for its template @@ -223,7 +225,34 @@ void gemm(const char transA[], const char transB[], Kokkos::MemoryTraits> CVT; typedef Impl::GEMM impl_type; - impl_type::gemm(transA, transB, alpha, A, B, beta, C); + impl_type::gemm(space, transA, transB, alpha, A, B, beta, C); +} + +/// \brief Dense matrix-matrix multiply: C = beta*C + alpha*op(A)*op(B). +/// +/// \tparam AViewType Input matrix, as a 2-D Kokkos::View +/// \tparam BViewType Input matrix, as a 2-D Kokkos::View +/// \tparam CViewType Output matrix, as a nonconst 2-D Kokkos::View +/// +/// \param transA [in] "N" for non-transpose, "T" for transpose, "C" +/// for conjugate transpose. All characters after the first are +/// ignored. This works just like the BLAS routines. +/// \param transB [in] "N" for non-transpose, "T" for transpose, "C" +/// for conjugate transpose. All characters after the first are +/// ignored. This works just like the BLAS routines. +/// \param alpha [in] Input coefficient of A*x +/// \param A [in] Input matrix, as a 2-D Kokkos::View +/// \param B [in] Input matrix, as a 2-D Kokkos::View +/// \param beta [in] Input coefficient of C +/// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View +template +void gemm(const char transA[], const char transB[], + typename AViewType::const_value_type& alpha, const AViewType& A, + const BViewType& B, typename CViewType::const_value_type& beta, + const CViewType& C) { + const typename CViewType::execution_space space = + typename CViewType::execution_space(); + gemm(space, transA, transB, alpha, A, B, beta, C); } } // namespace KokkosBlas diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index b3b72bd2fe..ba8389e559 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -228,7 +228,8 @@ struct SingleLevelTransposeGEMV { // Single-level parallel version of GEMV. template -void singleLevelGemv(const char trans[], +void singleLevelGemv(const typename AViewType::execution_space& space, + const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -255,7 +256,7 @@ void singleLevelGemv(const char trans[], using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - policy_type range(0, A.extent(0)); + policy_type range(space, 0, A.extent(0)); const char tr = trans[0]; // The transpose and conjugate transpose cases where A has zero rows @@ -666,7 +667,8 @@ struct TwoLevelTransposeGEMV { // Two-level parallel version of GEMV. template -void twoLevelGemv(const char trans[], +void twoLevelGemv(const typename AViewType::execution_space& space, + const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -717,7 +719,7 @@ void twoLevelGemv(const char trans[], IndexType>; functor_type functor(alpha, A, x, beta, y); Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]", - range_policy_type(0, y.extent(0)), functor); + range_policy_type(space, 0, y.extent(0)), functor); } return; } @@ -747,11 +749,11 @@ void twoLevelGemv(const char trans[], if ((size_t)teamSize > 32 * A.extent(1)) teamSize = 32 * A.extent(1); int numBlocks = teamSize / 32; functor.columnsPerThread = (A.extent(1) + numBlocks - 1) / numBlocks; - team = tagged_policy(numTeams, teamSize) + team = tagged_policy(space, numTeams, teamSize) .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam)); } else { // LayoutRight: one team per row - team = tagged_policy(A.extent(0), Kokkos::AUTO); + team = tagged_policy(space, A.extent(0), Kokkos::AUTO); } Kokkos::parallel_for("KokkosBlas::gemv[twoLevel]", team, functor); } else { @@ -762,7 +764,7 @@ void twoLevelGemv(const char trans[], // Do nothing (y := 1 * y) } else if (tr == 'T') { // transpose, and not conj transpose - team_policy_type team(A.extent(1), Kokkos::AUTO); + team_policy_type team(space, A.extent(1), Kokkos::AUTO); using functor_type = TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); @@ -770,7 +772,7 @@ void twoLevelGemv(const char trans[], functor); } else if (tr == 'C' || tr == 'H') { // conjugate transpose - team_policy_type team(A.extent(1), Kokkos::AUTO); + team_policy_type team(space, A.extent(1), Kokkos::AUTO); using functor_type = TwoLevelTransposeGEMV; functor_type functor(alpha, A, x, beta, y); @@ -786,23 +788,25 @@ void twoLevelGemv(const char trans[], template ()>::type* = nullptr> -void generalGemvImpl(const char trans[], +void generalGemvImpl(const typename AViewType::execution_space& space, + const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - singleLevelGemv(trans, alpha, A, x, beta, y); + singleLevelGemv(space, trans, alpha, A, x, beta, y); } template ()>::type* = nullptr> -void generalGemvImpl(const char trans[], +void generalGemvImpl(const typename AViewType::execution_space& space, + const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, const YViewType& y) { - twoLevelGemv(trans, alpha, A, x, beta, y); + twoLevelGemv(space, trans, alpha, A, x, beta, y); } } // namespace Impl diff --git a/src/blas/impl/KokkosBlas2_gemv_spec.hpp b/src/blas/impl/KokkosBlas2_gemv_spec.hpp index 68c90a3e72..e00b3ae283 100644 --- a/src/blas/impl/KokkosBlas2_gemv_spec.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_spec.hpp @@ -101,7 +101,8 @@ template ::value> struct GEMV { - static void gemv(const char trans[], + static void gemv(const typename AViewType::execution_space& space, + const char trans[], typename AViewType::const_value_type& alpha, const AViewType& A, const XViewType& x, typename YViewType::const_value_type& beta, @@ -130,11 +131,11 @@ struct GEMV { // Prefer int as the index type, but use a larger type if needed. if (numRows < static_cast(INT_MAX) && numCols < static_cast(INT_MAX)) { - generalGemvImpl(trans, alpha, A, x, - beta, y); + generalGemvImpl(space, trans, alpha, + A, x, beta, y); } else { - generalGemvImpl(trans, alpha, A, - x, beta, y); + generalGemvImpl( + space, trans, alpha, A, x, beta, y); } Kokkos::Profiling::popRegion(); } diff --git a/src/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp index 67e81ea737..29b2129710 100644 --- a/src/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp @@ -95,7 +95,7 @@ struct DotBasedGEMM { numCcols(C.extent(1)), dotSize(A.extent(0)) {} - void run(bool conjugateTranspose) { + void run(const typename CV::execution_space& space, bool conjugateTranspose) { // NOTE: these workPerTeam and approxNumTeams were used for TPL CUBLAS, // and may need to be retuned for other architectures constexpr size_C workPerTeam = 4096; // Amount of work per team @@ -143,11 +143,12 @@ struct DotBasedGEMM { // Multiply alpha*A^TB and add it to beta*C if (conjugateTranspose) { - Kokkos::TeamPolicy policyMult(numTeams, + Kokkos::TeamPolicy policyMult(space, numTeams, Kokkos::AUTO); Kokkos::parallel_for("Perform Dot Product Based GEMM", policyMult, *this); } else { - Kokkos::TeamPolicy policyMult(numTeams, Kokkos::AUTO); + Kokkos::TeamPolicy policyMult(space, numTeams, + Kokkos::AUTO); Kokkos::parallel_for("Perform Dot Product Based GEMM", policyMult, *this); } } diff --git a/src/blas/impl/KokkosBlas3_gemm_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_impl.hpp index 56e6ede229..2057c8d8ad 100644 --- a/src/blas/impl/KokkosBlas3_gemm_impl.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_impl.hpp @@ -631,7 +631,8 @@ struct GEMMImpl { beta = beta_; } - void run(int team_size, int vector_length, int scr_level) { + void run(const ExecSpace& space, int team_size, int vector_length, + int scr_level) { scratch_level = scr_level; int scratch_memory_size = ViewTypeAScratch::shmem_size() + ViewTypeBScratch::shmem_size() + @@ -645,10 +646,10 @@ struct GEMMImpl { // that problem but I'm not sure if that it a good perf // parameter or why it is set to 2 for Cuda? Kokkos::TeamPolicy> policy( - num_blocks_0 * num_blocks_1, team_size, vector_length); + space, num_blocks_0 * num_blocks_1, team_size, vector_length); #else Kokkos::TeamPolicy> policy( - num_blocks_0 * num_blocks_1, team_size, vector_length); + space, num_blocks_0 * num_blocks_1, team_size, vector_length); #endif Kokkos::parallel_for( diff --git a/src/blas/impl/KokkosBlas3_gemm_spec.hpp b/src/blas/impl/KokkosBlas3_gemm_spec.hpp index f311a5ce0f..9494207d29 100644 --- a/src/blas/impl/KokkosBlas3_gemm_spec.hpp +++ b/src/blas/impl/KokkosBlas3_gemm_spec.hpp @@ -118,7 +118,8 @@ template ::value> struct GEMM { - static void gemm(const char transA[], const char transB[], + static void gemm(const typename CViewType::execution_space& space, + const char transA[], const char transB[], typename AViewType::const_value_type& alpha, const AViewType& A, const BViewType& B, typename CViewType::const_value_type& beta, @@ -174,7 +175,7 @@ struct GEMM { bool A_is_conj = ((transA[0] == 'C') || (transA[0] == 'c')); DotBasedGEMM dotBasedGemm( alpha, A, B, beta, C); - dotBasedGemm.run(A_is_conj); + dotBasedGemm.run(space, A_is_conj); } else { // Define Blocking sizes (this will be used for scratch spaces) @@ -233,7 +234,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 0, 0> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'N' || transB[0] == 'n')) { @@ -241,7 +242,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 1, 0> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'N' || transB[0] == 'n')) { @@ -249,7 +250,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 2, 0> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'T' || transB[0] == 't')) { @@ -257,7 +258,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 0, 1> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'T' || transB[0] == 't')) { @@ -265,7 +266,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 1, 1> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'T' || transB[0] == 't')) { @@ -273,7 +274,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 2, 1> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'N' || transA[0] == 'n') && (transB[0] == 'C' || transB[0] == 'c')) { @@ -281,7 +282,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 0, 2> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'T' || transA[0] == 't') && (transB[0] == 'C' || transB[0] == 'c')) { @@ -289,7 +290,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 1, 2> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } if ((transA[0] == 'C' || transA[0] == 'c') && (transB[0] == 'C' || transB[0] == 'c')) { @@ -297,7 +298,7 @@ struct GEMM { AViewType, BViewType, CViewType, blockA0, blockA1, blockB1, 2, 2> gemm(alpha, A, B, beta, C); - gemm.run(team_size, vector_length, scratch_level); + gemm.run(space, team_size, vector_length, scratch_level); } } Kokkos::Profiling::popRegion(); diff --git a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp index 3db1f3a763..1eb85e7f85 100644 --- a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp @@ -71,44 +71,45 @@ namespace Impl { transa = 'C'; \ } -#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \ - one, beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const typename AViewType::execution_space& /* space */, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + HostBlas::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \ + one, beta, Y.data(), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; #define KOKKOSBLAS2_SGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ @@ -138,7 +139,8 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const char trans[], \ + static void gemv(const typename AViewType::execution_space& /* space */, \ + const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -178,7 +180,8 @@ namespace Impl { Kokkos::MemoryTraits > \ YViewType; \ \ - static void gemv(const char trans[], \ + static void gemv(const typename AViewType::execution_space& /* space */, \ + const char trans[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const XViewType& X, \ typename YViewType::const_value_type& beta, \ @@ -196,49 +199,50 @@ namespace Impl { } \ }; -#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_BLAS,complex]"); \ - KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ - const std::complex alpha_val = alpha, beta_val = beta; \ - HostBlas >::gemv( \ - transa, M, N, alpha_val, \ - reinterpret_cast*>(A.data()), LDA, \ - reinterpret_cast*>(X.data()), one, \ - beta_val, reinterpret_cast*>(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const typename AViewType::execution_space& /* space */, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_BLAS,complex]"); \ + KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA); \ + const std::complex alpha_val = alpha, beta_val = beta; \ + HostBlas >::gemv( \ + transa, M, N, alpha_val, \ + reinterpret_cast*>(A.data()), LDA, \ + reinterpret_cast*>(X.data()), one, \ + beta_val, reinterpret_cast*>(Y.data()), one); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, @@ -308,182 +312,201 @@ namespace Impl { transa = CUBLAS_OP_C; \ } -#define KOKKOSBLAS2_DGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasDgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), \ - one, &beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_DGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDgemv(s.handle, transa, M, N, &alpha, \ + A.data(), LDA, X.data(), one, \ + &beta, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_SGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasSgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), \ - one, &beta, Y.data(), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_SGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSgemv(s.handle, transa, M, N, &alpha, \ + A.data(), LDA, X.data(), one, \ + &beta, Y.data(), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasZgemv(s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasZgemv(s.handle, transa, M, N, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMV**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTX, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View*, LAYOUTY, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - XViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - YViewType; \ - \ - static void gemv(const char trans[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const XViewType& X, \ - typename YViewType::const_value_type& beta, \ - const YViewType& Y) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ - KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - cublasCgemv(s.handle, transa, M, N, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(X.data()), one, \ - reinterpret_cast(&beta), \ - reinterpret_cast(Y.data()), one); \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMV**, LAYOUTA, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTX, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View*, LAYOUTY, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + XViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + YViewType; \ + \ + static void gemv(const typename AViewType::execution_space& space, \ + const char trans[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const XViewType& X, \ + typename YViewType::const_value_type& beta, \ + const YViewType& Y) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemv[TPL_CUBLAS,complex]"); \ + KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA); \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgemv( \ + s.handle, transa, M, N, reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(X.data()), one, \ + reinterpret_cast(&beta), \ + reinterpret_cast(Y.data()), one)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + Kokkos::Profiling::popRegion(); \ + } \ }; KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, diff --git a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp index 4b2ae4a45d..be550d3a18 100644 --- a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp @@ -78,7 +78,8 @@ namespace Impl { Kokkos::MemoryTraits > \ CViewType; \ \ - static void gemm(const char transA[], const char transB[], \ + static void gemm(const typename CViewType::execution_space& /* space*/, \ + const char transA[], const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ @@ -137,7 +138,8 @@ namespace Impl { Kokkos::MemoryTraits > \ CViewType; \ \ - static void gemm(const char transA[], const char transB[], \ + static void gemm(const typename CViewType::execution_space& /* space*/, \ + const char transA[], const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ @@ -196,7 +198,8 @@ namespace Impl { Kokkos::MemoryTraits > \ CViewType; \ \ - static void gemm(const char transA[], const char transB[], \ + static void gemm(const typename CViewType::execution_space& /* space*/, \ + const char transA[], const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ @@ -263,7 +266,8 @@ namespace Impl { Kokkos::MemoryTraits > \ CViewType; \ \ - static void gemm(const char transA[], const char transB[], \ + static void gemm(const typename CViewType::execution_space& /* space*/, \ + const char transA[], const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ @@ -351,192 +355,112 @@ KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight, namespace KokkosBlas { namespace Impl { -#define KOKKOSBLAS3_DGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMM< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef double SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const char transA[], const char transB[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, \ - const CViewType& C) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,double]"); \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = static_cast(C.extent(0)); \ - const int N = static_cast(C.extent(1)); \ - const int K = static_cast(A.extent(A_t ? 0 : 1)); \ - \ - bool A_is_lr = std::is_same::value; \ - bool B_is_lr = std::is_same::value; \ - bool C_is_lr = std::is_same::value; \ - \ - const int AST = A_is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const int BST = B_is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const int CST = C_is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 1 : CST; \ - \ - cublasOperation_t transa, transb; \ - if ((transA[0] == 'N') || (transA[0] == 'n')) \ - transa = CUBLAS_OP_N; \ - else if ((transA[0] == 'T') || (transA[0] == 't')) \ - transa = CUBLAS_OP_T; \ - else \ - transa = CUBLAS_OP_C; \ - if ((transB[0] == 'N') || (transB[0] == 'n')) \ - transb = CUBLAS_OP_N; \ - else if ((transB[0] == 'T') || (transB[0] == 't')) \ - transb = CUBLAS_OP_T; \ - else \ - transb = CUBLAS_OP_C; \ - \ - constexpr int numDotsLayoutLeftThreshold = 1600; \ - constexpr int numDotsLayoutRightThreshold = 100; \ - if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ - M * N < numDotsLayoutLeftThreshold) || \ - (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ - M * N < numDotsLayoutRightThreshold)) { \ - DotBasedGEMM gemm( \ - alpha, A, B, beta, C); \ - gemm.run(false); \ - } else { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (!A_is_lr && !B_is_lr && !C_is_lr) \ - cublasDgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), \ - LDA, B.data(), LDB, &beta, C.data(), LDC); \ - if (A_is_lr && B_is_lr && C_is_lr) \ - cublasDgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), \ - LDB, A.data(), LDA, &beta, C.data(), LDC); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ - }; - -#define KOKKOSBLAS3_SGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ - ETI_SPEC_AVAIL) \ - template \ - struct GEMM< \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - Kokkos::View, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef float SCALAR; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - AViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - BViewType; \ - typedef Kokkos::View, \ - Kokkos::MemoryTraits > \ - CViewType; \ - \ - static void gemm(const char transA[], const char transB[], \ - typename AViewType::const_value_type& alpha, \ - const AViewType& A, const BViewType& B, \ - typename CViewType::const_value_type& beta, \ - const CViewType& C) { \ - Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,float]"); \ - const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ - const int M = static_cast(C.extent(0)); \ - const int N = static_cast(C.extent(1)); \ - const int K = static_cast(A.extent(A_t ? 0 : 1)); \ - \ - bool A_is_lr = std::is_same::value; \ - bool B_is_lr = std::is_same::value; \ - bool C_is_lr = std::is_same::value; \ - \ - const int AST = A_is_lr ? A.stride(0) : A.stride(1), \ - LDA = AST == 0 ? 1 : AST; \ - const int BST = B_is_lr ? B.stride(0) : B.stride(1), \ - LDB = BST == 0 ? 1 : BST; \ - const int CST = C_is_lr ? C.stride(0) : C.stride(1), \ - LDC = CST == 0 ? 1 : CST; \ - \ - cublasOperation_t transa, transb; \ - if ((transA[0] == 'N') || (transA[0] == 'n')) \ - transa = CUBLAS_OP_N; \ - else if ((transA[0] == 'T') || (transA[0] == 't')) \ - transa = CUBLAS_OP_T; \ - else \ - transa = CUBLAS_OP_C; \ - if ((transB[0] == 'N') || (transB[0] == 'n')) \ - transb = CUBLAS_OP_N; \ - else if ((transB[0] == 'T') || (transB[0] == 't')) \ - transb = CUBLAS_OP_T; \ - else \ - transb = CUBLAS_OP_C; \ - \ - constexpr int numDotsLayoutLeftThreshold = 1600; \ - constexpr int numDotsLayoutRightThreshold = 100; \ - if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ - M * N < numDotsLayoutLeftThreshold) || \ - (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ - M * N < numDotsLayoutRightThreshold)) { \ - DotBasedGEMM gemm( \ - alpha, A, B, beta, C); \ - gemm.run(false); \ - } else { \ - KokkosBlas::Impl::CudaBlasSingleton& s = \ - KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ - if (!A_is_lr && !B_is_lr && !C_is_lr) \ - cublasSgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), \ - LDA, B.data(), LDB, &beta, C.data(), LDC); \ - if (A_is_lr && B_is_lr && C_is_lr) \ - cublasSgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), \ - LDB, A.data(), LDA, &beta, C.data(), LDC); \ - } \ - Kokkos::Profiling::popRegion(); \ - } \ +#define KOKKOSBLAS3_DGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMM< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef double SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const typename CViewType::execution_space& space, \ + const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, \ + const CViewType& C) { \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,double]"); \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const int M = static_cast(C.extent(0)); \ + const int N = static_cast(C.extent(1)); \ + const int K = static_cast(A.extent(A_t ? 0 : 1)); \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + const int AST = A_is_lr ? A.stride(0) : A.stride(1), \ + LDA = AST == 0 ? 1 : AST; \ + const int BST = B_is_lr ? B.stride(0) : B.stride(1), \ + LDB = BST == 0 ? 1 : BST; \ + const int CST = C_is_lr ? C.stride(0) : C.stride(1), \ + LDC = CST == 0 ? 1 : CST; \ + \ + cublasOperation_t transa, transb; \ + if ((transA[0] == 'N') || (transA[0] == 'n')) \ + transa = CUBLAS_OP_N; \ + else if ((transA[0] == 'T') || (transA[0] == 't')) \ + transa = CUBLAS_OP_T; \ + else \ + transa = CUBLAS_OP_C; \ + if ((transB[0] == 'N') || (transB[0] == 'n')) \ + transb = CUBLAS_OP_N; \ + else if ((transB[0] == 'T') || (transB[0] == 't')) \ + transb = CUBLAS_OP_T; \ + else \ + transb = CUBLAS_OP_C; \ + \ + constexpr int numDotsLayoutLeftThreshold = 1600; \ + constexpr int numDotsLayoutRightThreshold = 100; \ + if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ + M * N < numDotsLayoutLeftThreshold) || \ + (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ + M * N < numDotsLayoutRightThreshold)) { \ + DotBasedGEMM gemm( \ + alpha, A, B, beta, C); \ + gemm.run(space, false); \ + } else { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (!A_is_lr && !B_is_lr && !C_is_lr) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), \ + LDA, B.data(), LDB, &beta, C.data(), LDC)); \ + if (A_is_lr && B_is_lr && C_is_lr) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasDgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), \ + LDB, A.data(), LDA, &beta, C.data(), LDC)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ }; -#define KOKKOSBLAS3_ZGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ +#define KOKKOSBLAS3_SGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMM**, LAYOUTA, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTC, \ - Kokkos::Device, \ - Kokkos::MemoryTraits >, \ - true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ + struct GEMM< \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + Kokkos::View, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef float SCALAR; \ typedef Kokkos::View, \ Kokkos::MemoryTraits > \ @@ -550,13 +474,13 @@ namespace Impl { Kokkos::MemoryTraits > \ CViewType; \ \ - static void gemm(const char transA[], const char transB[], \ + static void gemm(const typename CViewType::execution_space& space, \ + const char transA[], const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ const CViewType& C) { \ - Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemm[TPL_BLAS,complex]"); \ + Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,float]"); \ const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ const int M = static_cast(C.extent(0)); \ const int N = static_cast(C.extent(1)); \ @@ -595,43 +519,40 @@ namespace Impl { M * N < numDotsLayoutRightThreshold)) { \ DotBasedGEMM gemm( \ alpha, A, B, beta, C); \ - gemm.run(transa == CUBLAS_OP_C ? true : false); \ + gemm.run(space, false); \ } else { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ if (!A_is_lr && !B_is_lr && !C_is_lr) \ - cublasZgemm(s.handle, transa, transb, M, N, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), \ + LDA, B.data(), LDB, &beta, C.data(), LDC)); \ if (A_is_lr && B_is_lr && C_is_lr) \ - cublasZgemm(s.handle, transb, transa, N, M, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), \ + LDB, A.data(), LDA, &beta, C.data(), LDC)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; -#define KOKKOSBLAS3_CGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ +#define KOKKOSBLAS3_ZGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ ETI_SPEC_AVAIL) \ template \ - struct GEMM**, LAYOUTA, \ + struct GEMM**, LAYOUTA, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTB, \ + Kokkos::View**, LAYOUTB, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ - Kokkos::View**, LAYOUTC, \ + Kokkos::View**, LAYOUTC, \ Kokkos::Device, \ Kokkos::MemoryTraits >, \ true, ETI_SPEC_AVAIL> { \ - typedef Kokkos::complex SCALAR; \ + typedef Kokkos::complex SCALAR; \ typedef Kokkos::View, \ Kokkos::MemoryTraits > \ @@ -645,13 +566,14 @@ namespace Impl { Kokkos::MemoryTraits > \ CViewType; \ \ - static void gemm(const char transA[], const char transB[], \ + static void gemm(const typename CViewType::execution_space& space, \ + const char transA[], const char transB[], \ typename AViewType::const_value_type& alpha, \ const AViewType& A, const BViewType& B, \ typename CViewType::const_value_type& beta, \ const CViewType& C) { \ Kokkos::Profiling::pushRegion( \ - "KokkosBlas::gemm[TPL_BLAS,complex]"); \ + "KokkosBlas::gemm[TPL_BLAS,complex]"); \ const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ const int M = static_cast(C.extent(0)); \ const int N = static_cast(C.extent(1)); \ @@ -690,29 +612,135 @@ namespace Impl { M * N < numDotsLayoutRightThreshold)) { \ DotBasedGEMM gemm( \ alpha, A, B, beta, C); \ - gemm.run(transa == CUBLAS_OP_C ? true : false); \ + gemm.run(space, transa == CUBLAS_OP_C ? true : false); \ } else { \ KokkosBlas::Impl::CudaBlasSingleton& s = \ KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ if (!A_is_lr && !B_is_lr && !C_is_lr) \ - cublasCgemm(s.handle, transa, transb, M, N, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgemm( \ + s.handle, transa, transb, M, N, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ if (A_is_lr && B_is_lr && C_is_lr) \ - cublasCgemm(s.handle, transb, transa, N, M, K, \ - reinterpret_cast(&alpha), \ - reinterpret_cast(B.data()), LDB, \ - reinterpret_cast(A.data()), LDA, \ - reinterpret_cast(&beta), \ - reinterpret_cast(C.data()), LDC); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgemm( \ + s.handle, transb, transa, N, M, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ } \ Kokkos::Profiling::popRegion(); \ } \ }; +#define KOKKOSBLAS3_CGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE, \ + ETI_SPEC_AVAIL) \ + template \ + struct GEMM**, LAYOUTA, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTB, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + Kokkos::View**, LAYOUTC, \ + Kokkos::Device, \ + Kokkos::MemoryTraits >, \ + true, ETI_SPEC_AVAIL> { \ + typedef Kokkos::complex SCALAR; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + AViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + BViewType; \ + typedef Kokkos::View, \ + Kokkos::MemoryTraits > \ + CViewType; \ + \ + static void gemm(const typename CViewType::execution_space& space, \ + const char transA[], const char transB[], \ + typename AViewType::const_value_type& alpha, \ + const AViewType& A, const BViewType& B, \ + typename CViewType::const_value_type& beta, \ + const CViewType& C) { \ + Kokkos::Profiling::pushRegion( \ + "KokkosBlas::gemm[TPL_BLAS,complex]"); \ + const bool A_t = (transA[0] != 'N') && (transA[0] != 'n'); \ + const int M = static_cast(C.extent(0)); \ + const int N = static_cast(C.extent(1)); \ + const int K = static_cast(A.extent(A_t ? 0 : 1)); \ + \ + bool A_is_lr = std::is_same::value; \ + bool B_is_lr = std::is_same::value; \ + bool C_is_lr = std::is_same::value; \ + \ + const int AST = A_is_lr ? A.stride(0) : A.stride(1), \ + LDA = AST == 0 ? 1 : AST; \ + const int BST = B_is_lr ? B.stride(0) : B.stride(1), \ + LDB = BST == 0 ? 1 : BST; \ + const int CST = C_is_lr ? C.stride(0) : C.stride(1), \ + LDC = CST == 0 ? 1 : CST; \ + \ + cublasOperation_t transa, transb; \ + if ((transA[0] == 'N') || (transA[0] == 'n')) \ + transa = CUBLAS_OP_N; \ + else if ((transA[0] == 'T') || (transA[0] == 't')) \ + transa = CUBLAS_OP_T; \ + else \ + transa = CUBLAS_OP_C; \ + if ((transB[0] == 'N') || (transB[0] == 'n')) \ + transb = CUBLAS_OP_N; \ + else if ((transB[0] == 'T') || (transB[0] == 't')) \ + transb = CUBLAS_OP_T; \ + else \ + transb = CUBLAS_OP_C; \ + \ + constexpr int numDotsLayoutLeftThreshold = 1600; \ + constexpr int numDotsLayoutRightThreshold = 100; \ + if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ + M * N < numDotsLayoutLeftThreshold) || \ + (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N && \ + M * N < numDotsLayoutRightThreshold)) { \ + DotBasedGEMM gemm( \ + alpha, A, B, beta, C); \ + gemm.run(space, transa == CUBLAS_OP_C ? true : false); \ + } else { \ + KokkosBlas::Impl::CudaBlasSingleton& s = \ + KokkosBlas::Impl::CudaBlasSingleton::singleton(); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasSetStream(s.handle, space.cuda_stream())); \ + if (!A_is_lr && !B_is_lr && !C_is_lr) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCgemm(s.handle, transa, transb, M, N, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + if (A_is_lr && B_is_lr && C_is_lr) \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL( \ + cublasCgemm(s.handle, transb, transa, N, M, K, \ + reinterpret_cast(&alpha), \ + reinterpret_cast(B.data()), LDB, \ + reinterpret_cast(A.data()), LDA, \ + reinterpret_cast(&beta), \ + reinterpret_cast(C.data()), LDC)); \ + KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL)); \ + } \ + Kokkos::Profiling::popRegion(); \ + } \ + }; + KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::LayoutLeft, Kokkos::CudaSpace, true) KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft, diff --git a/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp b/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp index 256b5a17a3..9e0bff4c55 100644 --- a/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp +++ b/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp @@ -22,7 +22,7 @@ CudaBlasSingleton& CudaBlasSingleton::singleton() { } // namespace Impl } // namespace KokkosBlas -#endif +#endif // defined (KOKKOSKERNELS_ENABLE_TPL_CUBLAS) #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) #include @@ -44,6 +44,6 @@ MagmaSingleton& MagmaSingleton::singleton() { } // namespace Impl } // namespace KokkosBlas -#endif +#endif // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA) -#endif +#endif // KOKKOSBLAS_CUDA_TPL_HPP_ diff --git a/src/impl/tpls/KokkosBlas_tpl_spec.hpp b/src/impl/tpls/KokkosBlas_tpl_spec.hpp index bcb1344279..f4ab92c3a6 100644 --- a/src/impl/tpls/KokkosBlas_tpl_spec.hpp +++ b/src/impl/tpls/KokkosBlas_tpl_spec.hpp @@ -60,6 +60,66 @@ struct CudaBlasSingleton { static CudaBlasSingleton& singleton(); }; +inline void cublas_internal_error_throw(cublasStatus_t cublasState, + const char* name, const char* file, + const int line) { + std::ostringstream out; + // out << name << " error( " << cublasGetStatusName(cublasState) + // << "): " << cublasGetStatusString(cublasState); + out << name << " error( "; + switch (cublasState) { + case CUBLAS_STATUS_NOT_INITIALIZED: + out << "CUBLAS_STATUS_NOT_INITIALIZED): the library was not initialized."; + break; + case CUBLAS_STATUS_ALLOC_FAILED: + out << "CUBLAS_STATUS_ALLOC_FAILED): the resource allocation failed."; + break; + case CUBLAS_STATUS_INVALID_VALUE: + out << "CUBLAS_STATUS_INVALID_VALUE): an invalid numerical value was " + "used as an argument."; + break; + case CUBLAS_STATUS_ARCH_MISMATCH: + out << "CUBLAS_STATUS_ARCH_MISMATCH): an absent device architectural " + "feature is required."; + break; + case CUBLAS_STATUS_MAPPING_ERROR: + out << "CUBLAS_STATUS_MAPPING_ERROR): an access to GPU memory space " + "failed."; + break; + case CUBLAS_STATUS_EXECUTION_FAILED: + out << "CUBLAS_STATUS_EXECUTION_FAILED): the GPU program failed to " + "execute."; + break; + case CUBLAS_STATUS_INTERNAL_ERROR: + out << "CUBLAS_STATUS_INTERNAL_ERROR): an internal operation failed."; + break; + case CUBLAS_STATUS_NOT_SUPPORTED: + out << "CUBLAS_STATUS_NOT_SUPPORTED): the feature required is not " + "supported."; + break; + default: out << "unrecognized error code): this is bad!"; break; + } + if (file) { + out << " " << file << ":" << line; + } + throw std::runtime_error(out.str()); +} + +inline void cublas_internal_safe_call(cublasStatus_t cublasState, + const char* name, + const char* file = nullptr, + const int line = 0) { + if (CUBLAS_STATUS_SUCCESS != cublasState) { + cublas_internal_error_throw(cublasState, name, file, line); + } +} + +// The macro below defines the interface for the safe cublas calls. +// The functions themselves are protected by impl namespace and this +// is not meant to be used by external application or libraries. +#define KOKKOS_CUBLAS_SAFE_CALL_IMPL(call) \ + KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__) + } // namespace Impl } // namespace KokkosBlas #endif // KOKKOSKERNELS_ENABLE_TPL_CUBLAS diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp index 70c7dd35e1..8ee7bd1915 100644 --- a/unit_test/blas/Test_Blas3_gemm.hpp +++ b/unit_test/blas/Test_Blas3_gemm.hpp @@ -59,6 +59,62 @@ struct gemm_VanillaGEMM { } }; +template +void build_matrices(const int M, const int N, const int K, + const typename ViewTypeA::value_type alpha, ViewTypeA& A, + ViewTypeB& B, const typename ViewTypeA::value_type beta, + ViewTypeC& C, ViewTypeC& Cref) { + using execution_space = TestExecSpace; + using ScalarA = typename ViewTypeA::non_const_value_type; + using ScalarB = typename ViewTypeB::non_const_value_type; + using ScalarC = typename ViewTypeC::non_const_value_type; + + A = ViewTypeA("A", M, K); + B = ViewTypeB("B", K, N); + C = ViewTypeC("C", M, N); + Cref = ViewTypeC("Cref", M, N); + + // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in + // Trilinos issue #6418 + const uint64_t seed = Kokkos::Impl::clock_tic(); + Kokkos::Random_XorShift64_Pool rand_pool(seed); + Kokkos::fill_random(A, rand_pool, + Kokkos::rand::generator_type, + ScalarA>::max()); + Kokkos::fill_random(B, rand_pool, + Kokkos::rand::generator_type, + ScalarB>::max()); + Kokkos::fill_random(C, rand_pool, + Kokkos::rand::generator_type, + ScalarC>::max()); + + Kokkos::deep_copy(Cref, C); + Kokkos::fence(); + + struct Test::gemm_VanillaGEMM + vgemm; + vgemm.A_t = false; + vgemm.B_t = false; + vgemm.A_c = false; + vgemm.B_c = false; + vgemm.N = N; + vgemm.K = K; + vgemm.A = A; + vgemm.B = B; + vgemm.C = Cref; + vgemm.alpha = alpha; + vgemm.beta = beta; + + Kokkos::parallel_for("KokkosBlas::Test::gemm_VanillaGEMM", + Kokkos::TeamPolicy(M, Kokkos::AUTO, 16), + vgemm); + Kokkos::fence(); +} + template struct DiffGEMM { int N; @@ -110,7 +166,7 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, ViewTypeC C("C", M, N); ViewTypeC C2("C", M, N); - uint64_t seed = Kokkos::Impl::clock_tic(); + const uint64_t seed = Kokkos::Impl::clock_tic(); Kokkos::Random_XorShift64_Pool rand_pool(seed); // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in @@ -176,6 +232,79 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K, EXPECT_TRUE((diff_C_average < 1.05 * diff_C_expected)); } } + +template +void impl_test_stream_gemm(const int M, const int N, const int K, + const Scalar alpha, const Scalar beta) { + using execution_space = TestExecSpace; + using ViewTypeA = Kokkos::View; + using ViewTypeB = Kokkos::View; + using ViewTypeC = Kokkos::View; + using ScalarC = typename ViewTypeC::value_type; + using APT = Kokkos::Details::ArithTraits; + using mag_type = typename APT::mag_type; + + const char tA[] = {"N"}; + const char tB[] = {"N"}; + const double machine_eps = APT::epsilon(); + + ViewTypeA A1, A2; + ViewTypeB B1, B2; + ViewTypeC C1, C1ref, C2, C2ref; + + Test::build_matrices(M, N, K, alpha, A1, B1, beta, C1, C1ref); + Test::build_matrices(N, M, K, alpha, A2, B2, beta, C2, C2ref); + + auto instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1); + KokkosBlas::gemm(instances[0], tA, tB, alpha, A1, B1, beta, C1); + KokkosBlas::gemm(instances[1], tA, tB, alpha, A2, B2, beta, C2); + Kokkos::fence(); + + mag_type diff_C1 = 0; + struct Test::DiffGEMM diffgemm1; + diffgemm1.N = N; + diffgemm1.C = C1; + diffgemm1.C2 = C1ref; + + Kokkos::parallel_reduce( + "KokkosBlas::Test::DiffGEMM1", + Kokkos::TeamPolicy(M, Kokkos::AUTO, 16), diffgemm1, + diff_C1); + + mag_type diff_C2 = 0; + struct Test::DiffGEMM diffgemm2; + diffgemm2.N = M; + diffgemm2.C = C2; + diffgemm2.C2 = C2ref; + + Kokkos::parallel_reduce( + "KokkosBlas::Test::DiffGEMM2", + Kokkos::TeamPolicy(N, Kokkos::AUTO, 16), diffgemm2, + diff_C2); + Kokkos::fence(); + + if (N != 0 && M != 0) { + int K_eff = (K == 0) ? 1 : K; + // Expected Result: Random Walk in the least significant bit (i.e. ~ + // sqrt(K)*eps eps scales with the total sum and has a factor in it for the + // accuracy of the operations -> eps = K * 75 * machine_eps * 7 + const double diff_C_expected = + 1.0 * sqrt(K_eff) * K_eff * 75 * machine_eps * 7; + + const double diff_C1_average = diff_C1 / (N * M); + if ((diff_C1_average >= 1.05 * diff_C_expected)) { + printf("Result: %e %e\n", diff_C1_average, diff_C_expected); + } + EXPECT_TRUE((diff_C1_average < 1.05 * diff_C_expected)); + + const double diff_C2_average = diff_C2 / (N * M); + if ((diff_C2_average >= 1.05 * diff_C_expected)) { + printf("Result: %e %e\n", diff_C2_average, diff_C_expected); + } + EXPECT_TRUE((diff_C2_average < 1.05 * diff_C_expected)); + } +} } // namespace Test template @@ -215,6 +344,12 @@ void test_gemm() { } } } + Test::impl_test_stream_gemm(53, 42, 17, 4.5, + 3.0); // General code path + Test::impl_test_stream_gemm( + 13, 1, 17, 4.5, 3.0); // gemv based gemm code path + Test::impl_test_stream_gemm(7, 13, 17, 4.5, + 3.0); // dot based gemm code path } template