diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp
index 6022b1b607..3e97c97d72 100644
--- a/src/batched/KokkosBatched_Util.hpp
+++ b/src/batched/KokkosBatched_Util.hpp
@@ -283,19 +283,8 @@ struct Direct {
   struct Backward {};
 };
 
-struct Mode {
-  struct Serial {
-    static const char *name() { return "Serial"; }
-  };
-  struct Team {
-    static const char *name() { return "Team"; }
-  };
-  struct TeamVector {
-    static const char *name() { return "TeamVector"; }
-  };
-};
-
 using KokkosBlas::Algo;
+using KokkosBlas::Mode;
 
 struct Util {
   template <typename T>
diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp
index 0cad2c6c80..73e146ab01 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Impl.hpp
@@ -30,17 +30,15 @@ struct TeamVectorGemv<MemberType, Trans::NoTranspose, Algo::Gemv::Unblocked> {
   KOKKOS_INLINE_FUNCTION static int invoke(
       const MemberType &member, const ScalarType alpha, const AViewType &A,
       const xViewType &x, const ScalarType beta, const yViewType &y) {
-    if (AViewType::Rank == 2)
-      return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
-          member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(),
-          A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
-    else
-      return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::template invoke<
-          MemberType, ScalarType, typename AViewType::array_layout,
-          typename AViewType::non_const_value_type>(
-          member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(),
-          A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(),
-          x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1());
+    static_assert(AViewType::Rank == 3,
+                  "Batched TeamVectorGemv requires rank-3 A matrix (use "
+                  "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)");
+    return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::template invoke<
+        MemberType, ScalarType, typename AViewType::array_layout,
+        typename AViewType::non_const_value_type>(
+        member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(),
+        A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(),
+        x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1());
   }
 };
 
@@ -48,12 +46,18 @@ template <typename MemberType>
 struct TeamVectorGemv<MemberType, Trans::NoTranspose, Algo::Gemv::Blocked> {
   template <typename ScalarType, typename AViewType, typename xViewType,
             typename yViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType &member, const ScalarType alpha, const AViewType &A,
-      const xViewType &x, const ScalarType beta, const yViewType &y) {
-    return TeamVectorGemvInternal<Algo::Gemv::Blocked>::invoke(
-        member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(),
-        A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/,
+                                           const ScalarType /*alpha*/,
+                                           const AViewType & /*A*/,
+                                           const xViewType & /*x*/,
+                                           const ScalarType /*beta*/,
+                                           const yViewType & /*y*/) {
+    static_assert(AViewType::Rank == 3,
+                  "Batched TeamVectorGemv requires rank-3 A matrix (use "
+                  "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)");
+    Kokkos::abort(
+        "KokkosBatched::TeamVectorGemv for rank-3 matrix "
+        "is NOT implemented");
   }
 };
 
@@ -68,17 +72,15 @@ struct TeamVectorGemv<MemberType, Trans::Transpose, Algo::Gemv::Unblocked> {
   KOKKOS_INLINE_FUNCTION static int invoke(
       const MemberType &member, const ScalarType alpha, const AViewType &A,
       const xViewType &x, const ScalarType beta, const yViewType &y) {
-    if (AViewType::Rank == 2)
-      return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
-          member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(),
-          A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
-    else
-      return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::template invoke<
-          MemberType, ScalarType, typename AViewType::array_layout,
-          typename AViewType::non_const_value_type>(
-          member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(),
-          A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(),
-          x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1());
+    static_assert(AViewType::Rank == 3,
+                  "Batched TeamVectorGemv requires rank-3 A matrix (use "
+                  "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)");
+    return TeamVectorGemvInternal<Algo::Gemv::Unblocked>::template invoke<
+        MemberType, ScalarType, typename AViewType::array_layout,
+        typename AViewType::non_const_value_type>(
+        member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(),
+        A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(),
+        x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1());
   }
 };
 
@@ -86,12 +88,18 @@ template <typename MemberType>
 struct TeamVectorGemv<MemberType, Trans::Transpose, Algo::Gemv::Blocked> {
   template <typename ScalarType, typename AViewType, typename xViewType,
            typename yViewType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType &member, const ScalarType alpha, const AViewType &A,
-      const xViewType &x, const ScalarType beta, const yViewType &y) {
-    return TeamVectorGemvInternal<Algo::Gemv::Blocked>::invoke(
-        member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(),
-        A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0());
+  KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/,
+                                           const ScalarType /*alpha*/,
+                                           const AViewType & /*A*/,
+                                           const xViewType & /*x*/,
+                                           const ScalarType /*beta*/,
+                                           const yViewType & /*y*/) {
+    static_assert(AViewType::Rank == 3,
+                  "Batched TeamVectorGemv requires rank-3 A matrix (use "
+                  "KokkosBlas::TeamVectorGemv for regular rank-2 matrix)");
+    Kokkos::abort(
+        "KokkosBatched::TeamVectorGemv for rank-3 matrix "
+        "is NOT implemented");
   }
 };
 
diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp
index 4f47212b94..f3b71196f1 100644
--- a/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp
+++ b/src/batched/dense/impl/KokkosBatched_Gemv_TeamVector_Internal.hpp
@@ -4,10 +4,9 @@
 /// \author Kyungjoo Kim (kyukim@sandia.gov)
 
 #include "KokkosBatched_Util.hpp"
-
-#include "KokkosBlas1_set_impl.hpp"
-#include "KokkosBlas1_team_scal_impl.hpp"
-#include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp"
+// #include "KokkosBlas1_set_impl.hpp"
+// #include "KokkosBlas1_team_scal_impl.hpp"
+// #include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp"
 
 namespace KokkosBatched {
 
@@ -16,17 +15,6 @@ namespace KokkosBatched {
 /// ====================
 template <typename ArgAlgo>
 struct TeamVectorGemvInternal {
-  template <typename MemberType, typename ScalarType, typename ValueType>
-  KOKKOS_INLINE_FUNCTION static int invoke(
-      const MemberType & /*member*/, const int /*m*/, const int /*n*/,
-      const ScalarType /*alpha*/, const ValueType *KOKKOS_RESTRICT /*A*/,
-      const int /*as0*/, const int /*as1*/,
-      const ValueType *KOKKOS_RESTRICT /*x*/, const int /*xs0*/,
-      const ScalarType /*beta*/,
-      /**/ ValueType *KOKKOS_RESTRICT /*y*/, const int /*ys0*/) {
-    assert(false && "Error: encounter dummy impl");
-    return 0;
-  }
   template <typename MemberType, typename ScalarType, typename layout,
             typename ValueType>
   KOKKOS_INLINE_FUNCTION static int invoke(
@@ -43,45 +31,6 @@ struct TeamVectorGemvInternal {
   }
 };
 
-template <>
-template <typename MemberType, typename ScalarType, typename ValueType>
-KOKKOS_INLINE_FUNCTION int
-TeamVectorGemvInternal<Algo::Gemv::Unblocked>::invoke(
-    const MemberType &member, const int m, const int n, const ScalarType alpha,
-    const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1,
-    const ValueType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta,
-    /**/ ValueType *KOKKOS_RESTRICT y, const int ys0) {
-  const ScalarType one(1.0), zero(0.0);
- - // y = beta y + alpha A x - // y (m), A(m x n), B(n) - - if (beta == zero) - KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, y, ys0); - else if (beta != one) - KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, y, ys0); - - if (alpha != zero) { - if (m <= 0 || n <= 0) return 0; - - if (beta != one) member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { - ValueType t(0); - const ValueType *KOKKOS_RESTRICT tA = (A + i * as0); - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(member, n), - [&](const int &j, ValueType &update) { - update += tA[j * as1] * x[j * xs0]; - }, - t); - Kokkos::single(Kokkos::PerThread(member), - [&]() { y[i * ys0] += alpha * t; }); - }); - } - return 0; -} - template <> template @@ -98,6 +47,8 @@ TeamVectorGemvInternal::invoke( // y_l (m), A_l(m x n), B_l(n) if (beta == zero) + // TODO: KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, y, + // ys0); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), [&](const int &iTemp) { int iRow, iMatrix; @@ -105,6 +56,8 @@ TeamVectorGemvInternal::invoke( Y[ys0 * iMatrix + ys1 * iRow] = zero; }); else if (beta != one) + // TODO: KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, + // y, ys0); Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, N * m), [&](const int &iTemp) { int iRow, iMatrix; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp index d32232524a..274cfab523 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Impl.hpp @@ -30,17 +30,15 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - if (AViewType::Rank == 2) - return TeamGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); - else - return TeamGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), - A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + static_assert(AViewType::Rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + return TeamGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(1), A.extent(2), alpha, A.data(), + A.stride_0(), A.stride_1(), A.stride_2(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; @@ -48,12 +46,18 @@ template struct TeamGemv { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamGemvInternal::invoke( - member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), - A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, + const ScalarType /*alpha*/, + const AViewType & /*A*/, + const xViewType & /*x*/, + const ScalarType 
/*beta*/, + const yViewType & /*y*/) { + static_assert(AViewType::Rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + Kokkos::abort( + "KokkosBlas::TeamGemv for rank-3 matrix is NOT " + "implemented"); } }; @@ -68,17 +72,15 @@ struct TeamGemv { KOKKOS_INLINE_FUNCTION static int invoke( const MemberType &member, const ScalarType alpha, const AViewType &A, const xViewType &x, const ScalarType beta, const yViewType &y) { - if (AViewType::Rank == 2) - return TeamGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); - else - return TeamGemvInternal::template invoke< - MemberType, ScalarType, typename AViewType::array_layout, - typename AViewType::non_const_value_type>( - member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), - A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), - x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); + static_assert(AViewType::Rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + return TeamGemvInternal::template invoke< + MemberType, ScalarType, typename AViewType::array_layout, + typename AViewType::non_const_value_type>( + member, A.extent(0), A.extent(2), A.extent(1), alpha, A.data(), + A.stride_0(), A.stride_2(), A.stride_1(), x.data(), x.stride_0(), + x.stride_1(), beta, y.data(), y.stride_0(), y.stride_1()); } }; @@ -86,12 +88,18 @@ template struct TeamGemv { template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const ScalarType alpha, const AViewType &A, - const xViewType &x, const ScalarType beta, const yViewType &y) { - return TeamGemvInternal::invoke( - member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), - A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType & /*member*/, + const ScalarType /*alpha*/, + const AViewType & /*A*/, + const xViewType & /*x*/, + const ScalarType /*beta*/, + const yViewType & /*y*/) { + static_assert(AViewType::Rank == 3, + "Batched TeamGemv requires rank-3 A matrix (use " + "KokkosBlas::TeamGemv for regular rank-2 matrix)"); + Kokkos::abort( + "KokkosBlas::TeamGemv for rank-3 matrix is NOT " + "implemented"); } }; diff --git a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp index c48a2cd866..5ee01069d5 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemv_Team_Internal.hpp @@ -5,9 +5,9 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBlas1_set_impl.hpp" -#include "KokkosBlas1_team_scal_impl.hpp" -#include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp" +// #include "KokkosBlas1_set_impl.hpp" +// #include "KokkosBlas1_team_scal_impl.hpp" +// #include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp" namespace KokkosBatched { @@ -16,14 +16,6 @@ namespace KokkosBatched { /// ==================== template struct TeamGemvInternal { - template - KOKKOS_INLINE_FUNCTION static int invoke( - const MemberType &member, const int m, const int n, - const ScalarType alpha, const ValueType *KOKKOS_RESTRICT A, const int as0, - const int as1, const ValueType *KOKKOS_RESTRICT x, const int xs0, - const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT y, const int ys0); - template KOKKOS_INLINE_FUNCTION static int invoke( 
@@ -34,86 +26,6 @@ struct TeamGemvInternal { /**/ ValueType *KOKKOS_RESTRICT y, const int ys0, const int ys1); }; -template <> -template -KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( - const MemberType &member, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT y, const int ys0) { - const ScalarType one(1.0), zero(0.0); - - // y = beta y + alpha A x - // y (m), A(m x n), B(n) - - if (beta == zero) - KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0); - else if (beta != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); - - if (alpha != zero) { - if (m <= 0 || n <= 0) return 0; - - if (beta != one) member.team_barrier(); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m), - [&](const int &i) { - ValueType t(0); - const ValueType *KOKKOS_RESTRICT tA = (A + i * as0); -#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) -#pragma unroll -#endif - for (int j = 0; j < n; ++j) - t += tA[j * as1] * x[j * xs0]; - y[i * ys0] += alpha * t; - }); - } - return 0; -} - -template <> -template -KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( - const MemberType &member, const int m, const int n, const ScalarType alpha, - const ValueType *KOKKOS_RESTRICT A, const int as0, const int as1, - const ValueType *KOKKOS_RESTRICT x, const int xs0, const ScalarType beta, - /**/ ValueType *KOKKOS_RESTRICT y, const int ys0) { - const ScalarType one(1.0), zero(0.0); - - // y = beta y + alpha A x - // y (m), A(m x n), B(n) - - constexpr int mbAlgo = Algo::Gemv::Blocked::mb(); - - if (beta == zero) - KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0); - else if (beta != one) - KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); - - if (alpha != zero) { - if (m <= 0 || n <= 0) return 0; - - if (beta != one) member.team_barrier(); - - KokkosBlas::Impl::InnerMultipleDotProduct inner(as0, as1, xs0, ys0); - const int tsize = member.team_size(); - const int mb_a = m / tsize + (m % tsize > 0), mb_b = mbAlgo; - // Made this non-const in order to WORKAROUND issue #349 - int mb = mb_a < mb_b ? mb_a : mb_b, mp = m % mb; - - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, (m / mb) + (mp > 0)), - [&](const int &ii) { - const int i = ii * mb; - inner.serial_invoke( - alpha, A + i * as0, x, - (i + mb) > m ? 
(m - i) : mb, n, y + i * ys0); - }); - member.team_barrier(); - } - - return 0; -} - template <> template diff --git a/src/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp b/src/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp index fa0d4c2a31..88d0bfe561 100644 --- a/src/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_SolveUTV_TeamVector_Internal.hpp @@ -5,7 +5,7 @@ #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Gemv_TeamVector_Internal.hpp" +#include "KokkosBlas2_team_gemv_impl.hpp" #include "KokkosBatched_Trsv_TeamVector_Internal.hpp" #include "KokkosBatched_Gemm_TeamVector_Internal.hpp" @@ -34,7 +34,7 @@ struct TeamVectorSolveUTV_Internal { if (matrix_rank < m) { /// w = U^T b - TeamVectorGemvInternal::invoke( + KokkosBlas::Impl::TeamVectorGemvInternal::invoke( member, matrix_rank, m, one, U, us1, us0, b, bs0, zero, w, ws0); /// w = T^{-1} w @@ -42,10 +42,10 @@ struct TeamVectorSolveUTV_Internal { member, false, matrix_rank, one, T, ts0, ts1, w, ws0); /// x = V^T w - TeamVectorGemvInternal::invoke( + KokkosBlas::Impl::TeamVectorGemvInternal::invoke( member, m, matrix_rank, one, V, vs1, vs0, w, ws0, zero, x, xs0); } else { - TeamVectorGemvInternal::invoke( + KokkosBlas::Impl::TeamVectorGemvInternal::invoke( member, matrix_rank, m, one, U, us1, us0, b, bs0, zero, x, xs0); TeamVectorTrsvInternalUpper::invoke( diff --git a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp index aaf72e9876..5583b58537 100644 --- a/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp +++ b/src/batched/dense/impl/KokkosBatched_Trsv_Team_Internal.hpp @@ -8,7 +8,7 @@ #include "KokkosBlas1_set_impl.hpp" #include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosBatched_InnerTrsm_Serial_Impl.hpp" -#include "KokkosBatched_Gemv_Team_Internal.hpp" +#include "KokkosBlas2_team_gemv_spec.hpp" namespace KokkosBatched { @@ -119,7 +119,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalLower::invoke( // gemv update member.team_barrier(); - TeamGemvInternal::invoke( + KokkosBlas::Impl::TeamGemvInternal::invoke( member, m - p - pb, pb, minus_one, Ap + pb * as0, as0, as1, bp, 1, one, bp + pb * bs0, bs0); } @@ -227,7 +227,7 @@ KOKKOS_INLINE_FUNCTION int TeamTrsvInternalUpper::invoke( // gemv update member.team_barrier(); - TeamGemvInternal::invoke( + KokkosBlas::Impl::TeamGemvInternal::invoke( member, p, pb, minus_one, Ap - p * as0, as0, as1, bp, 1, one, b, bs0); } } diff --git a/src/blas/KokkosBlas2_gemv.hpp b/src/blas/KokkosBlas2_gemv.hpp index 5c37b74c9b..fe8418cc40 100644 --- a/src/blas/KokkosBlas2_gemv.hpp +++ b/src/blas/KokkosBlas2_gemv.hpp @@ -49,6 +49,8 @@ /// Tpetra::MultiVector use cases. 
#include +#include +#include #include #include #include @@ -206,6 +208,57 @@ void gemv(const char trans[], typename AViewType::const_value_type& alpha, gemv(space, trans, alpha, A, x, beta, y); } +namespace Experimental { +/// +/// Selective Interface +/// +template +struct Gemv { + template + static void KOKKOS_INLINE_FUNCTION + invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y); +}; + +template +struct Gemv { + template + static void KOKKOS_INLINE_FUNCTION + invoke(const MemberType& /*member*/, const char trans, + const ScalarType& alpha, const MatrixType& A, const XVector& x, + const ScalarType& beta, const YVector& y) { + serial_gemv(trans, alpha, A, x, beta, y); + } +}; + +template +struct Gemv { + template + static void KOKKOS_INLINE_FUNCTION + invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { + team_gemv(member, trans, alpha, A, x, beta, y); + } +}; + +template +struct Gemv { + template + static void KOKKOS_INLINE_FUNCTION + invoke(const MemberType& member, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { + teamvector_gemv(member, trans, alpha, A, x, beta, y); + } +}; + +} // namespace Experimental } // namespace KokkosBlas #endif // KOKKOS_BLAS2_MV_HPP_ diff --git a/src/blas/KokkosBlas2_serial_gemv.hpp b/src/blas/KokkosBlas2_serial_gemv.hpp index 7b26fdeeb7..cb568095b2 100644 --- a/src/blas/KokkosBlas2_serial_gemv.hpp +++ b/src/blas/KokkosBlas2_serial_gemv.hpp @@ -53,9 +53,11 @@ namespace Experimental { template -void KOKKOS_INLINE_FUNCTION gemv(const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, const YVector& y) { +void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, + const ScalarType& alpha, + const MatrixType& A, const XVector& x, + const ScalarType& beta, + const YVector& y) { if (trans == 'N' || trans == 'n') { using mode = KokkosBlas::Trans::NoTranspose; KokkosBlas::SerialGemv::invoke(alpha, A, x, beta, y); @@ -70,11 +72,14 @@ void KOKKOS_INLINE_FUNCTION gemv(const char trans, const ScalarType& alpha, } } +// default AlgoTag template -void KOKKOS_INLINE_FUNCTION gemv(const char trans, const ScalarType& alpha, - const MatrixType& A, const XVector& x, - const ScalarType& beta, const YVector& y) { - gemv(trans, alpha, A, x, beta, y); +void KOKKOS_INLINE_FUNCTION serial_gemv(const char trans, + const ScalarType& alpha, + const MatrixType& A, const XVector& x, + const ScalarType& beta, + const YVector& y) { + serial_gemv(trans, alpha, A, x, beta, y); } } // namespace Experimental diff --git a/src/blas/KokkosBlas2_team_gemv.hpp b/src/blas/KokkosBlas2_team_gemv.hpp index 874f8919df..ddc216b8af 100644 --- a/src/blas/KokkosBlas2_team_gemv.hpp +++ b/src/blas/KokkosBlas2_team_gemv.hpp @@ -50,21 +50,67 @@ namespace KokkosBlas { namespace Experimental { -template -void KOKKOS_INLINE_FUNCTION -gemv(const TeamType& team, const char trans, - const typename MatrixType::non_const_value_type& alpha, - const MatrixType& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { +template +void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, + const ScalarType& alpha, + const MatrixType& A, const XVector& x, + const ScalarType& beta, + const YVector& y) { if (trans == 'N' || 
trans == 'n') - return Impl::TeamGEMV::team_gemv( + TeamGemv::invoke(team, alpha, A, x, + beta, y); + else if (trans == 'T' || trans == 't') + TeamGemv::invoke(team, alpha, A, x, + beta, y); + else if (trans == 'C' || trans == 'c') + TeamGemv::invoke(team, alpha, A, x, + beta, y); + else { + Kokkos::abort("Matrix mode not supported"); + } +} + +// default AlgoTag +template +void KOKKOS_INLINE_FUNCTION team_gemv(const TeamType& team, const char trans, + const ScalarType& alpha, + const MatrixType& A, const XVector& x, + const ScalarType& beta, + const YVector& y) { + team_gemv(team, trans, alpha, A, x, beta, y); +} + +template +void KOKKOS_INLINE_FUNCTION +teamvector_gemv(const TeamType& team, const char trans, const ScalarType& alpha, + const MatrixType& A, const XVector& x, const ScalarType& beta, + const YVector& y) { + if (trans == 'N' || trans == 'n') { + KokkosBlas::TeamVectorGemv::invoke( team, alpha, A, x, beta, y); - if (trans == 'T' || trans == 't') - return Impl::TeamGEMV::team_gemv( + } else if (trans == 'T' || trans == 't') { + KokkosBlas::TeamVectorGemv::invoke( team, alpha, A, x, beta, y); - if (trans == 'C' || trans == 'c') - return Impl::TeamGEMV::team_gemv( + } else if (trans == 'C' || trans == 'c') { + KokkosBlas::TeamVectorGemv::invoke( team, alpha, A, x, beta, y); + } else { + Kokkos::abort("Matrix mode not supported"); + } +} + +// default AlgoTag +template +void KOKKOS_INLINE_FUNCTION +team_vector_gemv(const TeamType& team, const char trans, + const ScalarType& alpha, const MatrixType& A, const XVector& x, + const ScalarType& beta, const YVector& y) { + teamvector_gemv(team, trans, alpha, A, x, + beta, y); } } // namespace Experimental diff --git a/src/blas/impl/KokkosBlas2_serial_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_serial_gemv_impl.hpp index 75164f87ab..0d7f52702b 100644 --- a/src/blas/impl/KokkosBlas2_serial_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_serial_gemv_impl.hpp @@ -99,6 +99,7 @@ SerialGemv::invoke( A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); } + /// /// T /// diff --git a/src/blas/impl/KokkosBlas2_team_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_team_gemv_impl.hpp new file mode 100644 index 0000000000..a4cf662cc9 --- /dev/null +++ b/src/blas/impl/KokkosBlas2_team_gemv_impl.hpp @@ -0,0 +1,241 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOSBLAS2_TEAM_GEMV_IMPL_HPP_ +#define KOKKOSBLAS2_TEAM_GEMV_IMPL_HPP_ + +#include "KokkosBlas1_set_impl.hpp" +#include "KokkosBlas1_team_scal_impl.hpp" +#include "KokkosBlas2_serial_gemv_inner_multiple_dot.hpp" + +namespace KokkosBlas { +namespace Impl { + +template +struct TeamGemvInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, OpA op, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, + const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); + + // default OpA = OpID + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, + const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, + ys0); + } +}; + +template +struct TeamVectorGemvInternal { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, OpA op, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, + const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0); + + // default OpA = OpID + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType &member, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, + const int as0, const int as1, const ValueXType *KOKKOS_RESTRICT x, + const int xs0, const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + return invoke(member, OpID{}, m, n, alpha, A, as0, as1, x, xs0, beta, y, + ys0); + } +}; + +/// +/// Team Internal Impl +/// ==================== + +template <> +template +KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( + const MemberType &member, OpA op, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, + const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + const ScalarType one(1.0), zero(0.0); + + // y = beta y + alpha A x + // y (m), A(m x n), B(n) + + if (beta == zero) + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0); + else if (beta != one) + 
KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); + + if (alpha != zero) { + if (m <= 0 || n <= 0) return 0; + + if (beta != one) member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, m), + [&](const int &i) { + ValueYType t(0); + const ValueAType *KOKKOS_RESTRICT tA = (A + i * as0); +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) +#pragma unroll +#endif + for (int j = 0; j < n; ++j) + t += op(tA[j * as1]) * x[j * xs0]; + y[i * ys0] += alpha * t; + }); + } + return 0; +} + +template <> +template +KOKKOS_INLINE_FUNCTION int TeamGemvInternal::invoke( + const MemberType &member, OpA /* op */, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, + const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + const ScalarType one(1.0), zero(0.0); + + // y = beta y + alpha A x + // y (m), A(m x n), B(n) + + constexpr int mbAlgo = Algo::Gemv::Blocked::mb(); + + if (beta == zero) + KokkosBlas::Impl::TeamSetInternal::invoke(member, m, zero, y, ys0); + else if (beta != one) + KokkosBlas::Impl::TeamScaleInternal::invoke(member, m, beta, y, ys0); + + if (alpha != zero) { + if (m <= 0 || n <= 0) return 0; + + if (beta != one) member.team_barrier(); + + KokkosBlas::Impl::InnerMultipleDotProduct inner(as0, as1, xs0, ys0); + const int tsize = member.team_size(); + const int mb_a = m / tsize + (m % tsize > 0), mb_b = mbAlgo; + // Made this non-const in order to WORKAROUND issue #349 + int mb = mb_a < mb_b ? mb_a : mb_b, mp = m % mb; + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, (m / mb) + (mp > 0)), + [&](const int &ii) { + const int i = ii * mb; + inner.serial_invoke(alpha, A + i * as0, x, + (i + mb) > m ? 
(m - i) : mb, + n, y + i * ys0); + }); + member.team_barrier(); + } + + return 0; +} + +/// +/// TeamVector Internal Impl +/// ==================== + +template <> +template +KOKKOS_INLINE_FUNCTION int +TeamVectorGemvInternal::invoke( + const MemberType &member, OpA op, const int m, const int n, + const ScalarType alpha, const ValueAType *KOKKOS_RESTRICT A, const int as0, + const int as1, const ValueXType *KOKKOS_RESTRICT x, const int xs0, + const ScalarType beta, + /**/ ValueYType *KOKKOS_RESTRICT y, const int ys0) { + const ScalarType one(1.0), zero(0.0); + + // y = beta y + alpha A x + // y (m), A(m x n), B(n) + + if (beta == zero) + KokkosBlas::Impl::TeamVectorSetInternal::invoke(member, m, zero, y, ys0); + else if (beta != one) + KokkosBlas::Impl::TeamVectorScaleInternal::invoke(member, m, beta, y, ys0); + + if (alpha != zero) { + if (m <= 0 || n <= 0) return 0; + + if (beta != one) member.team_barrier(); + + Kokkos::parallel_for(Kokkos::TeamThreadRange(member, m), [&](const int &i) { + ValueYType t(0); + const ValueAType *KOKKOS_RESTRICT tA = (A + i * as0); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(member, n), + [&](const int &j, ValueYType &update) { + update += op(tA[j * as1]) * x[j * xs0]; + }, + t); + Kokkos::single(Kokkos::PerThread(member), + [&]() { y[i * ys0] += alpha * t; }); + }); + } + return 0; +} + +} // namespace Impl +} // namespace KokkosBlas + +#endif diff --git a/src/blas/impl/KokkosBlas2_team_gemv_spec.hpp b/src/blas/impl/KokkosBlas2_team_gemv_spec.hpp index b6cc7bf125..92aac23f26 100644 --- a/src/blas/impl/KokkosBlas2_team_gemv_spec.hpp +++ b/src/blas/impl/KokkosBlas2_team_gemv_spec.hpp @@ -49,119 +49,197 @@ #include #include #include +#include namespace KokkosBlas { -namespace Experimental { -namespace Impl { -template -struct team_gemv_tpl_spec_avail { - constexpr static bool value = false; +template +struct TeamGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, + const ScalarType /*alpha*/, + const AViewType& /*A*/, + const xViewType& /*x*/, + const ScalarType /*beta*/, + const yViewType& /*y*/); }; -// Unification and Specialization layer -template ::value> -struct TeamGEMV { - static KOKKOS_INLINE_FUNCTION void team_gemv( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const MatrixType& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y); +template +struct TeamVectorGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke(const MemberType& /*member*/, + const ScalarType /*alpha*/, + const AViewType& /*A*/, + const xViewType& /*x*/, + const ScalarType /*beta*/, + const yViewType& /*y*/); }; -template -struct TeamGEMV { - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename MatrixType::non_const_value_type>::dot_type dot_type; - static KOKKOS_INLINE_FUNCTION void team_gemv( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const MatrixType& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { - const int N = A.extent(0); - const int M = A.extent(1); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { - dot_type Ax_i; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, M), - [&](const int& j, dot_type& val) { val += A(i, j) * x(j); }, Ax_i); - y(i) = beta * y(i) + alpha * Ax_i; - }); +/// +/// NT +/// + +template +struct TeamGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, 
const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "KokkosBlas::TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke( + member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + } +}; + +template +struct TeamGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "KokkosBlas::TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke( + member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + } +}; + +/// +/// T +/// + +template +struct TeamGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke( + member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); + } +}; + +template +struct TeamGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke( + member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); } }; -template -struct TeamGEMV { - typedef typename MatrixType::non_const_value_type value_type; - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename MatrixType::non_const_value_type>::dot_type dot_type; - static KOKKOS_INLINE_FUNCTION void team_gemv( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const MatrixType& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { - const int N = A.extent(0); - const int M = A.extent(1); - - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { - dot_type Ax_i; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, M), - [&](const int& j, dot_type& val) { - val += Kokkos::ArithTraits::conj(A(i, j)) * x(j); - }, - Ax_i); - y(i) = beta * y(i) + alpha * Ax_i; - }); +/// +/// CT +/// + +template +struct TeamGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "BLAS TeamGemv requires rank-2 A matrix"); + return Impl::TeamGemvInternal::invoke( + member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); + } +}; + +template +struct TeamGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "BLAS TeamGemv requires rank-2 A 
matrix"); + return Impl::TeamGemvInternal::invoke( + member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); } }; -template -struct TeamGEMV { - typedef typename Kokkos::Details::InnerProductSpaceTraits< - typename MatrixType::non_const_value_type>::dot_type dot_type; - static KOKKOS_INLINE_FUNCTION void team_gemv( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const MatrixType& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { - const int N = A.extent(1); - const int M = A.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { - dot_type Ax_i; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, M), - [&](const int& j, dot_type& val) { val += A(j, i) * x(j); }, Ax_i); - y(i) = beta * y(i) + alpha * Ax_i; - }); +/// +/// NT +/// + +template +struct TeamVectorGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "Batched TeamVectorGemv requires rank-2 A matrix"); + return Impl::TeamVectorGemvInternal::invoke( + member, A.extent(0), A.extent(1), alpha, A.data(), A.stride_0(), + A.stride_1(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); } }; -template -struct TeamGEMV { - typedef typename MatrixType::non_const_value_type value_type; - typedef - typename Kokkos::Details::InnerProductSpaceTraits::dot_type - dot_type; - static KOKKOS_INLINE_FUNCTION void team_gemv( - const TeamType& team, const typename XVector::non_const_value_type& alpha, - const MatrixType& A, const XVector& x, - const typename YVector::non_const_value_type& beta, const YVector& y) { - const int N = A.extent(1); - const int M = A.extent(0); - Kokkos::parallel_for(Kokkos::TeamThreadRange(team, N), [&](const int& i) { - dot_type Ax_i; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, M), - [&](const int& j, dot_type& val) { - val += Kokkos::ArithTraits::conj(A(j, i)) * x(j); - }, - Ax_i); - y(i) = beta * y(i) + alpha * Ax_i; - }); +/// +/// T +/// + +template +struct TeamVectorGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "Batched TeamVectorGemv requires rank-2 A matrix"); + return Impl::TeamVectorGemvInternal::invoke( + member, A.extent(1), A.extent(0), alpha, A.data(), A.stride_1(), + A.stride_0(), x.data(), x.stride_0(), beta, y.data(), y.stride_0()); } }; -} // namespace Impl -} // namespace Experimental + +/// +/// CT +/// + +template +struct TeamVectorGemv { + template + KOKKOS_INLINE_FUNCTION static int invoke( + const MemberType& member, const ScalarType alpha, const AViewType& A, + const xViewType& x, const ScalarType beta, const yViewType& y) { + static_assert(AViewType::Rank == 2, + "Batched TeamVectorGemv requires rank-2 A matrix"); + return Impl::TeamVectorGemvInternal::invoke( + member, Impl::OpConj{}, A.extent(1), A.extent(0), alpha, A.data(), + A.stride_1(), A.stride_0(), x.data(), x.stride_0(), beta, y.data(), + y.stride_0()); + } +}; + } // namespace KokkosBlas #endif diff --git a/src/blas/impl/KokkosBlas_util.hpp b/src/blas/impl/KokkosBlas_util.hpp index 3ddf0afbd5..dcee8283d6 100644 --- 
a/src/blas/impl/KokkosBlas_util.hpp +++ b/src/blas/impl/KokkosBlas_util.hpp @@ -50,6 +50,19 @@ namespace KokkosBlas { //////// Tags for BLAS //////// + +struct Mode { + struct Serial { + static const char *name() { return "Serial"; } + }; + struct Team { + static const char *name() { return "Team"; } + }; + struct TeamVector { + static const char *name() { return "TeamVector"; } + }; +}; + struct Trans { struct Transpose {}; struct NoTranspose {}; diff --git a/src/impl/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp index c994929b5b..77aa5a6713 100644 --- a/src/impl/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp +++ b/src/impl/tpls/KokkosBlas2_serial_gemv_tpl_spec_decl.hpp @@ -64,7 +64,7 @@ namespace Impl { // Note: using GEMM because there is no GEMV in MKL compact routines #define __IMPL_KK_MKL_DGEMM_COMPACT(SCALAR, MKL_ROUTINE) \ - void kk_mkl_gemm_compact( \ + inline void kk_mkl_gemm_compact( \ MKL_LAYOUT layout, MKL_TRANSPOSE transa, MKL_TRANSPOSE transb, \ MKL_INT m, MKL_INT n, MKL_INT k, SCALAR alpha, const SCALAR *a, \ MKL_INT ldap, const SCALAR *b, MKL_INT ldbp, SCALAR beta, SCALAR *c, \ @@ -81,31 +81,31 @@ __IMPL_KK_MKL_DGEMM_COMPACT(float, mkl_sgemm_compact) #undef __IMPL_KK_MKL_DGEMM_COMPACT template -MKL_COMPACT_PACK mkl_compact_format() { +inline MKL_COMPACT_PACK mkl_compact_format() { Kokkos::abort("vector size not supported"); } template <> -MKL_COMPACT_PACK mkl_compact_format() { +inline MKL_COMPACT_PACK mkl_compact_format() { return MKL_COMPACT_SSE; } template <> -MKL_COMPACT_PACK mkl_compact_format() { +inline MKL_COMPACT_PACK mkl_compact_format() { return MKL_COMPACT_SSE; } template <> -MKL_COMPACT_PACK mkl_compact_format() { +inline MKL_COMPACT_PACK mkl_compact_format() { return MKL_COMPACT_AVX; } template <> -MKL_COMPACT_PACK mkl_compact_format() { +inline MKL_COMPACT_PACK mkl_compact_format() { return MKL_COMPACT_AVX; } template <> -MKL_COMPACT_PACK mkl_compact_format() { +inline MKL_COMPACT_PACK mkl_compact_format() { return MKL_COMPACT_AVX512; } template <> -MKL_COMPACT_PACK mkl_compact_format() { +inline MKL_COMPACT_PACK mkl_compact_format() { return MKL_COMPACT_AVX512; } diff --git a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp index 959982ca30..a0bf8c96ec 100644 --- a/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp +++ b/src/sparse/impl/KokkosSparse_spmv_bsrmatrix_impl.hpp @@ -539,14 +539,12 @@ struct BsrMatrixSpMVTensorCoreDispatcher { #include "KokkosBlas.hpp" #include "KokkosBlas2_serial_gemv_internal.hpp" -#include "KokkosBatched_Gemv_TeamVector_Internal.hpp" +#include "KokkosBlas2_team_gemv_impl.hpp" #include "KokkosBatched_Gemm_Serial_Internal.hpp" #include "KokkosBatched_Gemm_TeamVector_Internal.hpp" #include "KokkosBlas1_team_scal_impl.hpp" #include "KokkosKernels_ExecSpaceUtils.hpp" -#include "KokkosBlas2_team_gemv_spec.hpp" - namespace KokkosSparse { namespace Experimental { namespace Impl { @@ -650,20 +648,18 @@ struct BSR_GEMV_Functor { const auto count = myRow.length; if (conjugate) { - typedef Kokkos::View - block_values_type; - for (ordinal_type jBlock = 0; jBlock < count; ++jBlock) { const auto A_cur = myRow.block(jBlock); const auto X_blkCol = myRow.block_colidx(jBlock); const auto X_ptBeg = X_blkCol * block_dim; const auto X_cur = Kokkos::subview( m_x, ::Kokkos::make_pair(X_ptBeg, X_ptBeg + block_dim)); - KokkosBlas::Experimental::Impl::TeamGEMV< - team_member, block_values_type, XVector, YVector, -1, - 
false>::team_gemv(dev, alpha, A_cur, X_cur, val_one, Y_cur); + KokkosBlas::Impl:: + TeamVectorGemvInternal::invoke( + dev, KokkosBlas::Impl::OpConj{}, A_cur.extent(0), + A_cur.extent(1), alpha, A_cur.data(), A_cur.stride_0(), + A_cur.stride_1(), X_cur.data(), X_cur.stride_0(), val_one, + Y_cur.data(), Y_cur.stride_0()); } } else { for (ordinal_type jBlock = 0; jBlock < count; ++jBlock) { @@ -672,13 +668,13 @@ struct BSR_GEMV_Functor { const auto X_ptBeg = X_blkCol * block_dim; const auto X_cur = Kokkos::subview( m_x, ::Kokkos::make_pair(X_ptBeg, X_ptBeg + block_dim)); - KokkosBatched::TeamVectorGemvInternal< - KokkosBatched::Algo::Gemv::Unblocked>:: - invoke(dev, block_dim, block_dim, alpha, A_cur.data(), - static_cast(A_cur.stride_0()), - static_cast(A_cur.stride_1()), X_cur.data(), - static_cast(X_cur.stride_0()), val_one, Y_cur.data(), - static_cast(Y_cur.stride_0())); + KokkosBlas::Impl:: + TeamVectorGemvInternal::invoke( + dev, block_dim, block_dim, alpha, A_cur.data(), + static_cast(A_cur.stride_0()), + static_cast(A_cur.stride_1()), X_cur.data(), + static_cast(X_cur.stride_0()), val_one, Y_cur.data(), + static_cast(Y_cur.stride_0())); } } } @@ -944,19 +940,16 @@ struct BSR_GEMV_Transpose_Functor { block_dim * sizeof(y_value_type)); if (conjugate) { - typedef Kokkos::View - block_values_type; Kokkos::View shared_view(shared_y, block_dim); for (ordinal_type jBlock = 0; jBlock < count; ++jBlock) { const auto A_cur = myRow.block(jBlock); // - KokkosBlas::Experimental::Impl::TeamGEMV< - team_member, block_values_type, XVector, YVector, 2, - false>::team_gemv(dev, alpha, A_cur, X_cur, val_zero, shared_view); + KokkosBlas::TeamVectorGemv< + team_member, KokkosBlas::Trans::ConjTranspose, + KokkosBlas::Algo::Gemv::Default>::invoke(dev, alpha, A_cur, X_cur, + val_zero, shared_view); // dev.team_barrier(); // @@ -976,19 +969,12 @@ struct BSR_GEMV_Transpose_Functor { for (ordinal_type jBlock = 0; jBlock < count; ++jBlock) { const auto A_cur = myRow.block(jBlock); // - KokkosBatched::TeamVectorGemvInternal< - KokkosBatched::Algo::Gemv::Unblocked>::invoke(dev, block_dim, - block_dim, alpha, - A_cur.data(), - static_cast( - A_cur.stride_1()), - static_cast( - A_cur.stride_0()), - X_cur.data(), - static_cast( - X_cur.stride_0()), - val_zero, shared_y, - 1); + KokkosBlas::Impl:: + TeamVectorGemvInternal::invoke( + dev, block_dim, block_dim, alpha, A_cur.data(), + static_cast(A_cur.stride_1()), + static_cast(A_cur.stride_0()), X_cur.data(), + static_cast(X_cur.stride_0()), val_zero, shared_y, 1); // dev.team_barrier(); // diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 49b9d69b24..0e3ed0235a 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -57,15 +57,12 @@ #ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV // Enable supernodal sptrsv -#include "KokkosBlas2_gemv.hpp" -#include "KokkosBlas2_team_gemv.hpp" #include "KokkosBlas3_trsm.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosBatched_Util.hpp" -#include "KokkosBatched_Gemv_Decl.hpp" -#include "KokkosBatched_Gemv_Team_Impl.hpp" +#include "KokkosBlas2_team_gemv_spec.hpp" #include "KokkosBatched_Trsm_Team_Impl.hpp" #endif @@ -897,10 +894,12 @@ struct LowerTriSupernodalFunctor { workoffset, workoffset + nsrow)); // needed for gemv instead of trmv/trsv auto Ljj = Kokkos::subview(viewL, range_type(0, nsrow), Kokkos::ALL()); - KokkosBatched::TeamGemv< - member_type, KokkosBatched::Trans::NoTranspose, - 
KokkosBatched::Algo::Gemv::Unblocked>::invoke(team, one, Ljj, Xj, - zero, Y); + KokkosBlas::TeamGemv::invoke(team, + one, + Ljj, Xj, + zero, + Y); team.team_barrier(); for (int ii = team_rank; ii < nscol; ii += team_size) { Xj(ii) = Y(ii); @@ -923,10 +922,13 @@ struct LowerTriSupernodalFunctor { team.team_barrier(); // calling team-level "Unblocked" gemv on small-size diagonal in // KokkosBatched - KokkosBatched::TeamGemv< - member_type, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemv::Unblocked>::invoke(team, one, Ljj, Y, - zero, Xj); + KokkosBlas::TeamGemv::invoke(team, + one, + Ljj, + Y, + zero, + Xj); } else { // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View::invoke(team, one, Lij, Xj, - zero, Z); + KokkosBlas::TeamGemv::invoke(team, + one, + Lij, Xj, + zero, + Z); team.team_barrier(); } } @@ -1103,8 +1107,8 @@ struct UpperTriSupernodalFunctor { auto Uij = Kokkos::subview(viewU, range_type(nscol, nsrow), Kokkos::ALL()); using Uij_type = decltype(Uij); - KokkosBatched::TeamGemv:: + KokkosBlas::TeamGemv:: template invoke( team, -one, Uij, Z, one, Xj); team.team_barrier(); @@ -1128,8 +1132,8 @@ struct UpperTriSupernodalFunctor { team.team_barrier(); // caling team-level kernel in KokkosBatched on a small-size diagonal - KokkosBatched::TeamGemv:: + KokkosBlas::TeamGemv:: template invoke( team, one, Ujj, Y, zero, Xj); } else { @@ -1267,10 +1271,12 @@ struct UpperTriTranSupernodalFunctor { workoffset, workoffset + nsrow)); // needed with gemv for update&scatter auto Uij = Kokkos::subview(viewU, range_type(0, nsrow), Kokkos::ALL()); - KokkosBatched::TeamGemv< - member_type, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemv::Unblocked>::invoke(team, one, Uij, Xj, - zero, Y); + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, Xj, + zero, + Y); team.team_barrier(); // copy the diagonal back to output for (int ii = team_rank; ii < nscol; ii += team_size) { @@ -1289,10 +1295,13 @@ struct UpperTriTranSupernodalFunctor { Y(ii) = Xj(ii); } team.team_barrier(); - KokkosBatched::TeamGemv< - member_type, KokkosBatched::Trans::NoTranspose, - KokkosBatched::Algo::Gemv::Unblocked>::invoke(team, one, Ujj, Y, - zero, Xj); + KokkosBlas::TeamGemv::invoke(team, + one, + Ujj, + Y, + zero, + Xj); } else { // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View::invoke(team, one, Uij, Xj, - zero, Z); + KokkosBlas::TeamGemv::invoke(team, + one, + Uij, Xj, + zero, + Z); team.team_barrier(); } diff --git a/unit_test/batched/dense/Test_Batched_Dense.hpp b/unit_test/batched/dense/Test_Batched_Dense.hpp index a771bcada1..a154e9e14f 100644 --- a/unit_test/batched/dense/Test_Batched_Dense.hpp +++ b/unit_test/batched/dense/Test_Batched_Dense.hpp @@ -45,9 +45,6 @@ #include "Test_Batched_TeamGemm.hpp" #include "Test_Batched_TeamGemm_Real.hpp" #include "Test_Batched_TeamGemm_Complex.hpp" -#include "Test_Batched_TeamGemv.hpp" -#include "Test_Batched_TeamGemv_Real.hpp" -#include "Test_Batched_TeamGemv_Complex.hpp" #include "Test_Batched_TeamGesv.hpp" #include "Test_Batched_TeamGesv_Real.hpp" #include "Test_Batched_TeamInverseLU.hpp" diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv.hpp deleted file mode 100644 index 103aa069ce..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamGemv.hpp +++ /dev/null @@ -1,155 +0,0 @@ -/// \author Kyungjoo Kim (kyukim@sandia.gov) - -#include "gtest/gtest.h" -#include "Kokkos_Core.hpp" -#include "Kokkos_Random.hpp" - -//#include 
"KokkosBatched_Vector.hpp" - -#include "KokkosBatched_Gemv_Decl.hpp" - -#include "KokkosKernels_TestUtils.hpp" - -using namespace KokkosBatched; - -namespace Test { -namespace TeamGemv { - -template -struct ParamTag { - typedef T trans; -}; - -template -struct Functor_TestBatchedTeamGemv { - ViewType _a, _b, _c; - - ScalarType _alpha, _beta; - - KOKKOS_INLINE_FUNCTION - Functor_TestBatchedTeamGemv(const ScalarType alpha, const ViewType &a, - const ViewType &b, const ScalarType beta, - const ViewType &c) - : _a(a), _b(b), _c(c), _alpha(alpha), _beta(beta) {} - - template - KOKKOS_INLINE_FUNCTION void operator()(const ParamTagType &, - const MemberType &member) const { - const int k = member.league_rank(); - - auto aa = Kokkos::subview(_a, k, Kokkos::ALL(), Kokkos::ALL()); - auto bb = Kokkos::subview(_b, k, Kokkos::ALL(), 0); - auto cc = Kokkos::subview(_c, k, Kokkos::ALL(), 0); - - KokkosBatched::TeamGemv::invoke(member, _alpha, aa, bb, _beta, - cc); - } - - inline void run() { - typedef typename ViewType::value_type value_type; - std::string name_region("KokkosBatched::Test::SerialGemm"); - const std::string name_value_type = Test::value_type_name(); - std::string name = name_region + name_value_type; - Kokkos::Profiling::pushRegion(name.c_str()); - const int league_size = _c.extent(0); - Kokkos::TeamPolicy policy(league_size, - Kokkos::AUTO); - Kokkos::parallel_for(name.c_str(), policy, *this); - Kokkos::Profiling::popRegion(); - } -}; - -template -void impl_test_batched_gemv(const int N, const int BlkSize) { - typedef typename ViewType::value_type value_type; - typedef Kokkos::Details::ArithTraits ats; - - /// randomized input testing views - ScalarType alpha = 1.5, beta = 3.0; - - ViewType a0("a0", N, BlkSize, BlkSize), a1("a1", N, BlkSize, BlkSize), - b0("b0", N, BlkSize, 1), b1("b1", N, BlkSize, 1), c0("c0", N, BlkSize, 1), - c1("c1", N, BlkSize, 1); - - Kokkos::Random_XorShift64_Pool random( - 13718); - Kokkos::fill_random(a0, random, value_type(1.0)); - Kokkos::fill_random(b0, random, value_type(1.0)); - Kokkos::fill_random(c0, random, value_type(1.0)); - - Kokkos::fence(); - - Kokkos::deep_copy(a1, a0); - Kokkos::deep_copy(b1, b0); - Kokkos::deep_copy(c1, c0); - - /// test body - Functor_TestBatchedTeamGemv(alpha, a0, b0, beta, c0) - .run(); - Functor_TestBatchedTeamGemv(alpha, a1, b1, beta, c1) - .run(); - - Kokkos::fence(); - - /// for comparison send it to host - typename ViewType::HostMirror c0_host = Kokkos::create_mirror_view(c0); - typename ViewType::HostMirror c1_host = Kokkos::create_mirror_view(c1); - - Kokkos::deep_copy(c0_host, c0); - Kokkos::deep_copy(c1_host, c1); - - /// check c0 = c1 ; this eps is about 10^-14 - typedef typename ats::mag_type mag_type; - mag_type sum(1), diff(0); - const mag_type eps = 1.0e3 * ats::epsilon(); - - for (int k = 0; k < N; ++k) - for (int i = 0; i < BlkSize; ++i) - for (int j = 0; j < 1; ++j) { - sum += ats::abs(c0_host(k, i, j)); - diff += ats::abs(c0_host(k, i, j) - c1_host(k, i, j)); - } - EXPECT_NEAR_KK(diff / sum, 0, eps); -} -} // namespace TeamGemv -} // namespace Test - -template -int test_batched_team_gemv() { -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) - { - typedef Kokkos::View - ViewType; - Test::TeamGemv::impl_test_batched_gemv(0, 10); - for (int i = 0; i < 10; ++i) { - // printf("Testing: LayoutLeft, Blksize %d\n", i); - Test::TeamGemv::impl_test_batched_gemv(1024, - i); - } - } -#endif -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) - { - typedef Kokkos::View - ViewType; - Test::TeamGemv::impl_test_batched_gemv(0, 10); - for 
(int i = 0; i < 10; ++i) { - // printf("Testing: LayoutRight, Blksize %d\n", i); - Test::TeamGemv::impl_test_batched_gemv(1024, - i); - } - } -#endif - - return 0; -} diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp deleted file mode 100644 index 3ffc34db23..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamGemv_Complex.hpp +++ /dev/null @@ -1,45 +0,0 @@ -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) - -/// dcomplex, dcomplex - -TEST_F(TestCategory, batched_scalar_team_gemv_nt_dcomplex_dcomplex) { - typedef ::Test::TeamGemv::ParamTag param_tag_type; - typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_team_gemv, - Kokkos::complex, param_tag_type, - algo_tag_type>(); -} -TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_dcomplex) { - typedef ::Test::TeamGemv::ParamTag param_tag_type; - typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_team_gemv, - Kokkos::complex, param_tag_type, - algo_tag_type>(); -} -// TEST_F( TestCategory, batched_scalar_team_gemv_ct_dcomplex_dcomplex ) { -// typedef ::Test::TeamGemv::ParamTag param_tag_type; -// typedef Algo::Gemv::Blocked algo_tag_type; -// test_batched_team_gemv,Kokkos::complex,param_tag_type,algo_tag_type>(); -// } - -/// dcomplex, double - -TEST_F(TestCategory, batched_scalar_team_gemv_nt_dcomplex_double) { - typedef ::Test::TeamGemv::ParamTag param_tag_type; - typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_team_gemv, double, - param_tag_type, algo_tag_type>(); -} -TEST_F(TestCategory, batched_scalar_team_gemv_t_dcomplex_double) { - typedef ::Test::TeamGemv::ParamTag param_tag_type; - typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_team_gemv, double, - param_tag_type, algo_tag_type>(); -} -// TEST_F( TestCategory, batched_scalar_team_gemv_ct_dcomplex_double ) { -// typedef ::Test::TeamGemv::ParamTag param_tag_type; -// typedef Algo::Gemv::Blocked algo_tag_type; -// test_batched_team_gemv,double,param_tag_type,algo_tag_type>(); -// } - -#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp b/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp deleted file mode 100644 index 2c4db11b2d..0000000000 --- a/unit_test/batched/dense/Test_Batched_TeamGemv_Real.hpp +++ /dev/null @@ -1,30 +0,0 @@ - -#if defined(KOKKOSKERNELS_INST_FLOAT) -TEST_F(TestCategory, batched_scalar_team_gemv_nt_float_float) { - typedef ::Test::TeamGemv::ParamTag param_tag_type; - typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_team_gemv(); -} -TEST_F(TestCategory, batched_scalar_team_gemv_t_float_float) { - typedef ::Test::TeamGemv::ParamTag param_tag_type; - typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_team_gemv(); -} -#endif - -#if defined(KOKKOSKERNELS_INST_DOUBLE) -TEST_F(TestCategory, batched_scalar_team_gemv_nt_double_double) { - typedef ::Test::TeamGemv::ParamTag param_tag_type; - typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_team_gemv(); -} -TEST_F(TestCategory, batched_scalar_team_gemv_t_double_double) { - typedef ::Test::TeamGemv::ParamTag param_tag_type; - typedef Algo::Gemv::Blocked algo_tag_type; - test_batched_team_gemv(); -} -#endif diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp index 80bc7b246a..bb5cd89c9b 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorQR.hpp @@ -6,7 +6,7 @@ #include "KokkosBlas1_set.hpp" #include 
"KokkosBatched_Copy_Decl.hpp" -#include "KokkosBatched_Gemv_Decl.hpp" +#include "KokkosBlas2_team_gemv_spec.hpp" #include "KokkosBatched_Trsv_Decl.hpp" #include "KokkosBatched_QR_Decl.hpp" #include "KokkosBatched_ApplyQ_Decl.hpp" @@ -53,9 +53,9 @@ struct Functor_TestBatchedTeamVectorQR { member.team_barrier(); /// bb = AA*xx - TeamVectorGemv::invoke(member, one, aa, xx, zero, - bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, + xx, zero, bb); member.team_barrier(); /// AA = QR diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp index 72754a5e00..743810d4ce 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorQR_WithColumnPivoting.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Copy_Decl.hpp" #include "KokkosBatched_ApplyPivot_Decl.hpp" -#include "KokkosBatched_Gemv_Decl.hpp" +#include "KokkosBlas2_team_gemv_spec.hpp" #include "KokkosBatched_Trsv_Decl.hpp" #include "KokkosBatched_QR_WithColumnPivoting_Decl.hpp" #include "KokkosBatched_ApplyQ_Decl.hpp" @@ -53,9 +53,9 @@ struct Functor_TestBatchedTeamVectorQR_WithColumnPivoting { member.team_barrier(); /// bb = AA*xx - TeamVectorGemv::invoke(member, one, aa, xx, zero, - bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, + xx, zero, bb); member.team_barrier(); /// AA P^T = QR diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorSolveUTV.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorSolveUTV.hpp index 6610383d12..08375a95f5 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorSolveUTV.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorSolveUTV.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Copy_Decl.hpp" #include "KokkosBatched_ApplyPivot_Decl.hpp" -#include "KokkosBatched_Gemv_Decl.hpp" +#include "KokkosBlas2_team_gemv_spec.hpp" #include "KokkosBatched_Trsv_Decl.hpp" #include "KokkosBatched_UTV_Decl.hpp" #include "KokkosBatched_SolveUTV_Decl.hpp" @@ -79,9 +79,9 @@ struct Functor_TestBatchedTeamVectorSolveUTV { TeamVectorCopy::invoke(member, aa, ac); /// bb = AA*xx - TeamVectorGemv::invoke(member, one, aa, xx, zero, - bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, + xx, zero, bb); member.team_barrier(); /// Solving Ax = b using UTV transformation diff --git a/unit_test/batched/dense/Test_Batched_TeamVectorUTV.hpp b/unit_test/batched/dense/Test_Batched_TeamVectorUTV.hpp index 0a49db7dce..06ca4b2fb8 100644 --- a/unit_test/batched/dense/Test_Batched_TeamVectorUTV.hpp +++ b/unit_test/batched/dense/Test_Batched_TeamVectorUTV.hpp @@ -6,7 +6,7 @@ #include "KokkosBatched_Copy_Decl.hpp" #include "KokkosBatched_ApplyPivot_Decl.hpp" -#include "KokkosBatched_Gemv_Decl.hpp" +#include "KokkosBlas2_team_gemv_spec.hpp" #include "KokkosBatched_Trsv_Decl.hpp" #include "KokkosBatched_UTV_Decl.hpp" @@ -78,9 +78,9 @@ struct Functor_TestBatchedTeamVectorUTV { TeamVectorCopy::invoke(member, aa, ac); /// bb = AA*xx - TeamVectorGemv::invoke(member, one, aa, xx, zero, - bb); + KokkosBlas::TeamVectorGemv::invoke(member, one, aa, + xx, zero, bb); member.team_barrier(); /// Solving Ax = b using UTV transformation @@ -98,9 +98,9 @@ struct Functor_TestBatchedTeamVectorUTV { auto vm = Kokkos::subview(vv, range_upto_rank, Kokkos::ALL()); if (matrix_rank < m) { /// w = U^T b - TeamVectorGemv::invoke(member, one, um, bb, zero, - ww); + KokkosBlas::TeamVectorGemv::invoke(member, one, um, + bb, zero, ww); member.team_barrier(); /// w = 
T^{-1} w @@ -109,15 +109,15 @@ struct Functor_TestBatchedTeamVectorUTV { member.team_barrier(); /// x = V^T w - TeamVectorGemv::invoke(member, one, vm, ww, zero, - xx); + KokkosBlas::TeamVectorGemv::invoke(member, one, vm, + ww, zero, xx); member.team_barrier(); } else { /// x = U^T b - TeamVectorGemv::invoke(member, one, um, bb, zero, - xx); + KokkosBlas::TeamVectorGemv::invoke(member, one, um, + bb, zero, xx); member.team_barrier(); /// x = T^{-1} x diff --git a/unit_test/blas/Test_Blas.hpp b/unit_test/blas/Test_Blas.hpp index 3b4f4355c3..b6d4f88314 100644 --- a/unit_test/blas/Test_Blas.hpp +++ b/unit_test/blas/Test_Blas.hpp @@ -44,8 +44,10 @@ // Serial Blas 2 #include "Test_Blas2_serial_gemv.hpp" + // Team Blas 2 #include "Test_Blas2_team_gemv.hpp" +#include "Test_Blas2_teamvector_gemv.hpp" // Blas 3 #include "Test_Blas3_gemm.hpp" diff --git a/unit_test/blas/Test_Blas2_gemv_util.hpp b/unit_test/blas/Test_Blas2_gemv_util.hpp new file mode 100644 index 0000000000..635f02c558 --- /dev/null +++ b/unit_test/blas/Test_Blas2_gemv_util.hpp @@ -0,0 +1,322 @@ +#ifndef TEST_BLAS2_GEMV_UTIL_HPP +#define TEST_BLAS2_GEMV_UTIL_HPP + +#include +#include +#include +#include + +namespace Test { + +template ::value> +using simd_vector = + KokkosBatched::Vector, length>; + +template +struct GemvOpBase { + GemvOpBase(char trans_, ScalarType alpha_, AType A_, XType x_, + ScalarType beta_, YType y_) + : trans(trans_), alpha(alpha_), beta(beta_), A(A_), x(x_), y(y_) {} + + protected: + // parameters + char trans; + ScalarType alpha; + ScalarType beta; + // data + AType A; + XType x; + YType y; +}; + +// Note: vanillaGEMV is called on device here - alternatively one can move +// _strided_ data using safe_device_to_host_deep_copy() etc. +template +struct RefGEMVOp : public GemvOpBase { + using params = GemvOpBase; + + RefGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, + ScalarType beta_, YType y_) + : params(trans_, alpha_, A_, x_, beta_, y_) {} + + template + KOKKOS_INLINE_FUNCTION void operator()( + const TeamMember & /* member */) const { + vanillaGEMV(params::trans, params::alpha, params::A, params::x, + params::beta, params::y); + } +}; // RefGEMVOp + +// fill regular view with random values +template +typename std::enable_if::value>::type +fill_random_view(ViewType A, PoolType &rand_pool, + const ScalarType max_val = 10.0) { + Kokkos::fill_random(A, rand_pool, max_val); + Kokkos::fence(); +} + +// fill rank-1 view of SIMD vectors with random values +template +void fill_random_view( + Kokkos::View< + KokkosBatched::Vector, VecLength> *, + Layout, Props...> + x, + PoolType &rand_pool, const ValueType max_val = 10.0) { + // the view can be strided and have Vector values, so randoms + // are generated in a plain, linear view first and then copied + using device_type = typename decltype(x)::device_type; + Kokkos::View rnd("random_vals", + x.extent(0) * VecLength); + Kokkos::fill_random(rnd, rand_pool, max_val); + using size_type = decltype(x.extent(0)); + for (size_type i = 0; i < x.extent(0); ++i) { + x(i).loadUnaligned(&rnd(i * VecLength)); + } +} + +// fill rank-2 view of SIMD vectors with random values +template +static void fill_random_view( + Kokkos::View< + KokkosBatched::Vector, VecLength> **, + Layout, Props...> + A, + PoolType &rand_pool, const ValueType max_val = 10.0) { + // the view can be strided and have Vector values, so randoms + // are generated in a plain, linear view first and then copied + using device_type = typename decltype(A)::device_type; + Kokkos::View rnd( + "random_vals", 
A.extent(0) * A.extent(1) * VecLength); + Kokkos::fill_random(rnd, rand_pool, max_val); + using size_type = decltype(A.extent(0)); + size_type idx = 0; + for (size_type i = 0; i < A.extent(0); ++i) { + for (size_type j = 0; j < A.extent(1); ++j) { + A(i, j).loadUnaligned(&rnd(idx)); + idx += VecLength; + } + } +} + +template +struct GEMVTest { + static void run(const char *mode) { + run_algorithms<0, typename GemvFunc::algorithms>(mode); + } + + private: + // ScalarCoef==void default behavior is to derive alpha/beta scalar types + // from A and X scalar types + using ScalarType = typename std::conditional< + !std::is_void::value, ScalarCoef, + typename std::common_type::type>::type; + + template + static std::enable_if_t::value> + run_algorithms(const char * /*mode*/) {} + + template + static + typename std::enable_if<(Idx < + std::tuple_size::value)>::type + run_algorithms(const char *mode) { + run_layouts::type>(mode); + run_algorithms(mode); + } + + // Note: all layouts listed here are subview'ed to test Kokkos::LayoutStride + template + static void run_layouts(const char *mode) { +#ifdef KOKKOSKERNELS_TEST_LAYOUTLEFT + run_view_types(mode); +#endif +#ifdef KOKKOSKERNELS_TEST_LAYOUTRIGHT + run_view_types(mode); +#endif +#if defined(KOKKOSKERNELS_TEST_LAYOUTLEFT) && \ + defined(KOKKOSKERNELS_TEST_LAYOUTRIGHT) + using A_t = typename Kokkos::View; + using x_t = typename Kokkos::View; + using y_t = typename Kokkos::View; + run_sizes(mode); +#endif + } + + template + static void run_view_types(const char *mode) { + typedef Kokkos::View view_type_A; + typedef Kokkos::View view_type_x; + typedef Kokkos::View view_type_y; + run_sizes(mode); + } + + template + static void run_sizes(const char *mode) { + // zero cases + run_size(mode, 0, 0); + run_size(mode, 0, 4); + run_size(mode, 4, 0); + // small block sizes + for (int n = 1; n <= 16; ++n) { + run_size(mode, n, n); + } + // other cases + run_size(mode, 1024, 1); + run_size(mode, 1024, 13); + run_size(mode, 1024, 124); + } + + template + static void run_size(const char *mode, int N, int M) { + using A_layout = typename ViewTypeA::array_layout; + using x_layout = typename ViewTypeX::array_layout; + using y_layout = typename ViewTypeY::array_layout; + static_assert(!std::is_same::value, ""); + static_assert(!std::is_same::value, ""); + static_assert(!std::is_same::value, ""); + + const auto trans = mode[0]; + const bool transposed = trans == (char)'T' || trans == (char)'C'; + const auto Nt = transposed ? M : N; + const auto Mt = transposed ? N : M; + + // 1. run on regular (non-strided) views + ViewTypeA A1("A1", Nt, Mt); + ViewTypeX x1("X1", M); + ViewTypeY y1("Y1", N); + run_views(trans, A1, x1, y1); + + // 2. 
run on strided subviews (enforced by adding extra rank on both sides) + // Note: strided views are not supported by MKL routines + if (!std::is_same::value) { + typedef Kokkos::View BaseTypeA; + typedef Kokkos::View BaseTypeX; + typedef Kokkos::View BaseTypeY; + + BaseTypeA b_A("A", 2, Nt, Mt, 2); + BaseTypeX b_x("X", 2, M, 2); + BaseTypeY b_y("Y", 2, N, 2); + auto A = Kokkos::subview(b_A, 0, Kokkos::ALL(), Kokkos::ALL(), 0); + auto x = Kokkos::subview(b_x, 0, Kokkos::ALL(), 0); + auto y = Kokkos::subview(b_y, 0, Kokkos::ALL(), 0); + + // make sure it's actually LayoutStride there + static_assert(std::is_same::value, + ""); + static_assert(std::is_same::value, + ""); + static_assert(std::is_same::value, + ""); + run_views(trans, A, x, y); + } + } + + template + static void run_views(const char trans, ViewTypeA A, ViewTypeX x, + ViewTypeY y) { + Kokkos::TeamPolicy teams(1, 1); // just run on device + fill_inputs(A, x, y); + ScalarType alpha = 3; // TODO: test also with zero alpha/beta ? + ScalarType beta = 5; + + // get reference results + Kokkos::View y_ref("Y_ref", y.extent(0)); + Kokkos::deep_copy(y_ref, y); + RefGEMVOp gemv_ref( + trans, alpha, A, x, beta, y_ref); + Kokkos::parallel_for(teams, gemv_ref); + + // 1. check non-consts + run_case(trans, alpha, A, x, beta, y, y_ref); + + // 2. check const x + typename ViewTypeX::const_type c_x = x; + run_case(trans, alpha, A, c_x, beta, y, y_ref); + + // 3. check const A and x + typename ViewTypeA::const_type c_A = A; + run_case(trans, alpha, c_A, c_x, beta, y, y_ref); + } + + template + static void run_case(const char trans, ScalarType alpha, ViewTypeA A, + ViewTypeX x, ScalarType beta, ViewTypeY y, + ViewTypeYRef y_ref) { + // run on original y view (not to alter the test) + // but backup it and restore, so it can be reused + Kokkos::View y_backup("Y2", y.extent(0)); + Kokkos::deep_copy(y_backup, y); + + // fetch GEMV functor from the factory + using op_type = + typename GemvFunc::template functor_type; + + op_type gemv_op(trans, alpha, A, x, beta, y); + Kokkos::parallel_for(Kokkos::TeamPolicy(1, 1), gemv_op); + + const double eps = epsilon(ScalarY{}); + EXPECT_NEAR_KK_REL_1DVIEW(y, y_ref, eps); + Kokkos::deep_copy(y, y_backup); + } + + //----- utilities -----// + + // GEMV tolerance for scalar types + static double epsilon(float) { return 2 * 1e-5; } + static double epsilon(double) { return 1e-7; } + static double epsilon(int) { return 0; } + // tolerance for derived types + template + static double epsilon(Kokkos::complex) { + return epsilon(ScalarType{}); + } + template + static double epsilon(simd_vector) { + return epsilon(ScalarType{}); + } + + template + static void fill_inputs(ViewTypeA A, ViewTypeX x, ViewTypeY y) { + using exec_space = typename Device::execution_space; + Kokkos::Random_XorShift64_Pool rand_pool(13718); + fill_random_view(A, rand_pool); + fill_random_view(x, rand_pool); + fill_random_view(y, rand_pool); + } +}; // struct GEMVTest + +} // namespace Test + +#define TEST_CASE4(PREFIX, FACTORY, NAME, SCALAR_A, SCALAR_X, SCALAR_Y, \ + SCALAR_COEF) \ + using PREFIX##_##NAME##_gemv_test = \ + ::Test::GEMVTest<::Test::FACTORY, SCALAR_A, SCALAR_X, SCALAR_Y, \ + TestExecSpace, SCALAR_COEF>; \ + TEST_F(TestCategory, PREFIX##_gemv_nt_##NAME) { \ + PREFIX##_##NAME##_gemv_test::run("N"); \ + } \ + TEST_F(TestCategory, PREFIX##_gemv_t_##NAME) { \ + PREFIX##_##NAME##_gemv_test::run("T"); \ + } \ + TEST_F(TestCategory, PREFIX##_gemv_ct_##NAME) { \ + PREFIX##_##NAME##_gemv_test::run("C"); \ + } + +#define TEST_CASE2(PREFIX, FACTORY, 
NAME, SCALAR, SCALAR_COEF) \ + TEST_CASE4(PREFIX, FACTORY, NAME, SCALAR, SCALAR, SCALAR, SCALAR_COEF) +#define TEST_CASE(PREFIX, FACTORY, NAME, SCALAR) \ + TEST_CASE2(PREFIX, FACTORY, NAME, SCALAR, SCALAR) + +#endif // TEST_BLAS2_GEMV_UTIL_HPP diff --git a/unit_test/blas/Test_Blas2_serial_gemv.hpp b/unit_test/blas/Test_Blas2_serial_gemv.hpp index e97f8e71a9..fd73707c9a 100644 --- a/unit_test/blas/Test_Blas2_serial_gemv.hpp +++ b/unit_test/blas/Test_Blas2_serial_gemv.hpp @@ -1,388 +1,107 @@ -#include -#include -#include -#include -#include #include -#include +#include // for ETI test guards +// Note: include serial gemv before util so it knows if CompactMKL is available +#include +#include namespace Test { -template ::value> -using simd_vector = - KokkosBatched::Vector, length>; - -// Note: vanillaGEMV is called on device here - alternatively one can move -// _strided_ data using safe_device_to_host_deep_copy() etc. -template -struct RefGEMVOp { - RefGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, - ScalarType beta_, YType y_) - : trans(trans_), alpha(alpha_), beta(beta_), A(A_), x(x_), y(y_) {} - - template - KOKKOS_INLINE_FUNCTION void operator()( - const TeamMember & /* member */) const { - vanillaGEMV(trans, alpha, A, x, beta, y); - } - - private: - // parameters - char trans; - ScalarType alpha; - ScalarType beta; - // data - AType A; - XType x; - YType y; -}; - template -struct SerialGEMVOp { +struct SerialGEMVOp : public GemvOpBase { + using params = GemvOpBase; + SerialGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, ScalarType beta_, YType y_) - : trans(trans_), alpha(alpha_), beta(beta_), A(A_), x(x_), y(y_) {} + : params(trans_, alpha_, A_, x_, beta_, y_) {} template - KOKKOS_INLINE_FUNCTION void operator()( - const TeamMember & /* member */) const { - KokkosBlas::Experimental::gemv(trans, alpha, A, x, beta, y); + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember& member) const { + KokkosBlas::Experimental::Gemv::invoke( + member, params::trans, params::alpha, params::A, params::x, + params::beta, params::y); } - - private: - // parameters - char trans; - ScalarType alpha; - ScalarType beta; - // data - AType A; - XType x; - YType y; }; -// fill regular view with random values -template -typename std::enable_if::value>::type -fill_random_view(ViewType A, PoolType &rand_pool, - const ScalarType max_val = 10.0) { - Kokkos::fill_random(A, rand_pool, max_val); - Kokkos::fence(); -} - -// fill rank-1 view of SIMD vectors with random values -template -void fill_random_view( - Kokkos::View< - KokkosBatched::Vector, VecLength> *, - Layout, Props...> - x, - PoolType &rand_pool, const ValueType max_val = 10.0) { - // the view can be strided and have Vector values, so randoms - // are generated in a plain, linear view first and then copied - using device_type = typename decltype(x)::device_type; - Kokkos::View rnd("random_vals", - x.extent(0) * VecLength); - Kokkos::fill_random(rnd, rand_pool, max_val); - using size_type = decltype(x.extent(0)); - for (size_type i = 0; i < x.extent(0); ++i) { - x(i).loadUnaligned(&rnd(i * VecLength)); - } -} - -// fill rank-2 view of SIMD vectors with random values -template -static void fill_random_view( - Kokkos::View< - KokkosBatched::Vector, VecLength> **, - Layout, Props...> - A, - PoolType &rand_pool, const ValueType max_val = 10.0) { - // the view can be strided and have Vector values, so randoms - // are generated in a plain, linear view first and then copied - using device_type = typename decltype(A)::device_type; - 
Kokkos::View rnd( - "random_vals", A.extent(0) * A.extent(1) * VecLength); - Kokkos::fill_random(rnd, rand_pool, max_val); - using size_type = decltype(A.extent(0)); - size_type idx = 0; - for (size_type i = 0; i < A.extent(0); ++i) { - for (size_type j = 0; j < A.extent(1); ++j) { - A(i, j).loadUnaligned(&rnd(idx)); - idx += VecLength; - } - } -} - -// -template -struct SerialGEMVTestBase { - // ScalarCoef==void default behavior is to derive alpha/beta scalar types - // from A and X scalar types - using ScalarType = typename std::conditional< - !std::is_void::value, ScalarCoef, - typename std::common_type::type>::type; - - template - static void run_layouts(const char *mode) { - // Note: all layouts listed here are subview'ed to test Kokkos::LayoutStride -#ifdef KOKKOSKERNELS_TEST_LAYOUTLEFT - run_view_types(mode); -#endif -#ifdef KOKKOSKERNELS_TEST_LAYOUTRIGHT - run_view_types(mode); -#endif -#if defined(KOKKOSKERNELS_TEST_LAYOUTLEFT) && \ - defined(KOKKOSKERNELS_TEST_LAYOUTRIGHT) - using A_t = typename Kokkos::View; - using x_t = typename Kokkos::View; - using y_t = typename Kokkos::View; - run_sizes(mode); -#endif - } - - template - static void run_view_types(const char *mode) { - typedef Kokkos::View view_type_A; - typedef Kokkos::View view_type_x; - typedef Kokkos::View view_type_y; - run_sizes(mode); - } - - template - static void run_sizes(const char *mode) { - // zero cases - run_size(mode, 0, 0); - run_size(mode, 0, 4); - run_size(mode, 4, 0); - // small block sizes - for (int n = 1; n <= 16; ++n) { - run_size(mode, n, n); - } - // other cases - run_size(mode, 1024, 1); - run_size(mode, 1024, 13); - run_size(mode, 1024, 124); - } - - template - static void run_size(const char *mode, int N, int M) { - using A_layout = typename ViewTypeA::array_layout; - using x_layout = typename ViewTypeX::array_layout; - using y_layout = typename ViewTypeY::array_layout; - static_assert(!std::is_same::value, ""); - static_assert(!std::is_same::value, ""); - static_assert(!std::is_same::value, ""); - - const auto trans = mode[0]; - const bool transposed = trans == (char)'T' || trans == (char)'C'; - const auto Nt = transposed ? M : N; - const auto Mt = transposed ? N : M; - - // 1. run on regular (non-strided) views - ViewTypeA A1("A1", Nt, Mt); - ViewTypeX x1("X1", M); - ViewTypeY y1("Y1", N); - run_views(trans, A1, x1, y1); - - // 2. run on strided subviews (enforced by adding extra rank on both sides) - // Note: strided views are not supported by MKL routines - if (!std::is_same::value) { - typedef Kokkos::View BaseTypeA; - typedef Kokkos::View BaseTypeX; - typedef Kokkos::View BaseTypeY; - - BaseTypeA b_A("A", 2, Nt, Mt, 2); - BaseTypeX b_x("X", 2, M, 2); - BaseTypeY b_y("Y", 2, N, 2); - auto A = Kokkos::subview(b_A, 0, Kokkos::ALL(), Kokkos::ALL(), 0); - auto x = Kokkos::subview(b_x, 0, Kokkos::ALL(), 0); - auto y = Kokkos::subview(b_y, 0, Kokkos::ALL(), 0); - - // make sure it's actually LayoutStride there - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - static_assert(std::is_same::value, - ""); - run_views(trans, A, x, y); - } - } - - template - static void run_views(const char trans, ViewTypeA A, ViewTypeX x, - ViewTypeY y) { - Kokkos::TeamPolicy teams(1, 1); // just run on device - fill_inputs(A, x, y); - ScalarType alpha = 3; // TODO: test also with zero alpha/beta ? 
- ScalarType beta = 5; - - // get reference results - Kokkos::View y_ref("Y_ref", y.extent(0)); - Kokkos::deep_copy(y_ref, y); - RefGEMVOp gemv_ref( - trans, alpha, A, x, beta, y_ref); - Kokkos::parallel_for(teams, gemv_ref); - - // 1. check non-consts - run_case(trans, alpha, A, x, beta, y, y_ref); - - // 2. check const x - typename ViewTypeX::const_type c_x = x; - run_case(trans, alpha, A, c_x, beta, y, y_ref); - - // 3. check const A and x - typename ViewTypeA::const_type c_A = A; - run_case(trans, alpha, c_A, c_x, beta, y, y_ref); - } - +struct SerialGemvFactory { template - static void run_case(const char trans, ScalarType alpha, ViewTypeA A, - ViewTypeX x, ScalarType beta, ViewTypeY y, - ViewTypeYRef y_ref) { - // run on original y view (not to alter the test) - // but backup it and restore, so it can be reused - Kokkos::View y_backup("Y2", y.extent(0)); - Kokkos::deep_copy(y_backup, y); - - SerialGEMVOp gemv_op( - trans, alpha, A, x, beta, y); - Kokkos::parallel_for(Kokkos::TeamPolicy(1, 1), gemv_op); - - const double eps = epsilon(ScalarY{}); - EXPECT_NEAR_KK_REL_1DVIEW(y, y_ref, eps); - Kokkos::deep_copy(y, y_backup); - } - - //----- utilities -----// - - // GEMV tolerance for scalar types - static double epsilon(float) { return 2 * 1e-5; } - static double epsilon(double) { return 1e-7; } - static double epsilon(int) { return 0; } - // tolerance for derived types - template - static double epsilon(Kokkos::complex) { - return epsilon(ScalarType{}); - } - template - static double epsilon(simd_vector) { - return epsilon(ScalarType{}); - } + class Device, class ScalarType> + using functor_type = + SerialGEMVOp; - template - static void fill_inputs(ViewTypeA A, ViewTypeX x, ViewTypeY y) { - using exec_space = typename Device::execution_space; - Kokkos::Random_XorShift64_Pool rand_pool(13718); - fill_random_view(A, rand_pool); - fill_random_view(x, rand_pool); - fill_random_view(y, rand_pool); - } + using algorithms = std::tuple; }; -template -struct SerialGEMVTest { - static void run(const char *mode) { - using base = - SerialGEMVTestBase; - base::template run_layouts(mode); - base::template run_layouts(mode); - } -}; - -// Special handling of Vector> (instead of plain scalars) -// Note: MKL compact routines don't allow mixed scalar types -template -struct SerialGEMVTest, - simd_vector, - simd_vector, Device, ScalarCoef> { - static void run(const char *mode) { - using vector_type = simd_vector; - using base = SerialGEMVTestBase; - // run all usual, non-vector tests - base::template run_layouts(mode); - base::template run_layouts(mode); - // run vector tests #ifdef __KOKKOSBLAS_ENABLE_INTEL_MKL_COMPACT__ - base::template run_layouts(mode); -#endif - } +struct SerialMKLGemvFactory { + template + using functor_type = + SerialGEMVOp; + + using algorithms = std::tuple; }; +#endif } // namespace Test -#define TEST_CASE4(NAME, SCALAR_A, SCALAR_X, SCALAR_Y, SCALAR_COEF) \ - TEST_F(TestCategory, serial_gemv_nt_##NAME) { \ - ::Test::SerialGEMVTest::run("N"); \ - } \ - TEST_F(TestCategory, serial_gemv_t_##NAME) { \ - ::Test::SerialGEMVTest::run("T"); \ - } \ - TEST_F(TestCategory, serial_gemv_ct_##NAME) { \ - ::Test::SerialGEMVTest::run("C"); \ - } - -#define TEST_CASE2(NAME, SCALAR, SCALAR_COEF) \ - TEST_CASE4(NAME, SCALAR, SCALAR, SCALAR, SCALAR_COEF) -#define TEST_CASE(NAME, SCALAR) TEST_CASE2(NAME, SCALAR, SCALAR) +#define TEST_SERIAL_CASE4(N, A, X, Y, SC) \ + TEST_CASE4(serial, SerialGemvFactory, N, A, X, Y, SC) +#define TEST_SERIAL_CASE2(N, S, SC) \ + TEST_CASE2(serial, SerialGemvFactory, 
N, S, SC) +#define TEST_SERIAL_CASE(N, S) TEST_CASE(serial, SerialGemvFactory, N, S) #ifdef KOKKOSKERNELS_TEST_FLOAT -TEST_CASE(float, float) +TEST_SERIAL_CASE(float, float) // MKL vector types #ifdef __KOKKOSBLAS_ENABLE_INTEL_MKL_COMPACT__ using simd_float_sse = ::Test::simd_vector; using simd_float_avx = ::Test::simd_vector; using simd_float_avx512 = ::Test::simd_vector; -TEST_CASE2(mkl_float_sse, simd_float_sse, float) -TEST_CASE2(mkl_float_avx, simd_float_avx, float) -TEST_CASE2(mkl_float_avx512, simd_float_avx512, float) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_sse, simd_float_sse, float) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_avx, simd_float_avx, float) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_float_avx512, simd_float_avx512, + float) #endif #endif #ifdef KOKKOSKERNELS_TEST_DOUBLE -TEST_CASE(double, double) +TEST_SERIAL_CASE(double, double) // MKL vector types #ifdef __KOKKOSBLAS_ENABLE_INTEL_MKL_COMPACT__ using simd_double_sse = ::Test::simd_vector; using simd_double_avx = ::Test::simd_vector; using simd_double_avx512 = ::Test::simd_vector; -TEST_CASE2(mkl_double_sse, simd_double_sse, double) -TEST_CASE2(mkl_double_avx, simd_double_avx, double) -TEST_CASE2(mkl_double_avx512, simd_double_avx512, double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_sse, simd_double_sse, + double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx, simd_double_avx, + double) +TEST_CASE2(serial, SerialMKLGemvFactory, mkl_double_avx512, simd_double_avx512, + double) #endif #endif #ifdef KOKKOSKERNELS_TEST_COMPLEX_DOUBLE -TEST_CASE(complex_double, Kokkos::complex) +TEST_SERIAL_CASE(complex_double, Kokkos::complex) #endif #ifdef KOKKOSKERNELS_TEST_COMPLEX_FLOAT -TEST_CASE(complex_float, Kokkos::complex) +TEST_SERIAL_CASE(complex_float, Kokkos::complex) #endif #ifdef KOKKOSKERNELS_TEST_INT -TEST_CASE(int, int) +TEST_SERIAL_CASE(int, int) #endif #ifdef KOKKOSKERNELS_TEST_ALL_TYPES // test mixed scalar types (void -> default alpha/beta) -TEST_CASE4(mixed, double, int, float, void) +TEST_SERIAL_CASE4(mixed, double, int, float, void) // test arbitrary double alpha/beta with complex values -TEST_CASE2(alphabeta, Kokkos::complex, double) +TEST_SERIAL_CASE2(alphabeta, Kokkos::complex, double) #endif + +#undef TEST_SERIAL_CASE4 +#undef TEST_SERIAL_CASE2 +#undef TEST_SERIAL_CASE diff --git a/unit_test/blas/Test_Blas2_team_gemv.hpp b/unit_test/blas/Test_Blas2_team_gemv.hpp index dc2d158a4c..722aca1938 100644 --- a/unit_test/blas/Test_Blas2_team_gemv.hpp +++ b/unit_test/blas/Test_Blas2_team_gemv.hpp @@ -4,271 +4,78 @@ // the CUDA backend before including this test. #if !defined(TEST_CUDA_BLAS_CPP) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) -#include -#include -#include -#include -#include -#include +#include +#include // for test/inst guards +// Note: include serial gemv before util so it knows if CompactMKL is available +#include +#include namespace Test { -template -void impl_test_team_gemv(const char *mode, int N, int M) { - typedef Kokkos::TeamPolicy team_policy; - typedef typename team_policy::member_type team_member; - // Launch K teams of the maximum number of threads per team - int K = 4; - const team_policy policy(K, Kokkos::AUTO); - const int team_data_siz = (N % K == 0) ? 
(N / K) : (N / K + 1); +template +struct TeamGEMVOp : public GemvOpBase { + using params = GemvOpBase; - typedef typename ViewTypeA::value_type ScalarA; - typedef typename ViewTypeX::value_type ScalarX; - typedef typename ViewTypeY::value_type ScalarY; + TeamGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_, + ScalarType beta_, YType y_) + : params(trans_, alpha_, A_, x_, beta_, y_) {} - typedef multivector_layout_adapter vfA_type; - typedef Kokkos::View< - ScalarX * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeX; - typedef Kokkos::View< - ScalarY * [2], - typename std::conditional::value, - Kokkos::LayoutRight, Kokkos::LayoutLeft>::type, - Device> - BaseTypeY; - - ScalarA a = 3; - ScalarX b = 5; - double eps = std::is_same::value ? 2 * 1e-5 : 1e-7; - - typename vfA_type::BaseType b_A("A", N, M); - BaseTypeX b_x("X", M); - BaseTypeY b_y("Y", N); - BaseTypeY b_org_y("Org_Y", N); - - ViewTypeA A = vfA_type::view(b_A); - ViewTypeX x = Kokkos::subview(b_x, Kokkos::ALL(), 0); - ViewTypeY y = Kokkos::subview(b_y, Kokkos::ALL(), 0); - typename ViewTypeX::const_type c_x = x; - typename ViewTypeA::const_type c_A = A; - - typedef multivector_layout_adapter h_vfA_type; - - typename h_vfA_type::BaseType h_b_A = Kokkos::create_mirror_view(b_A); - typename BaseTypeX::HostMirror h_b_x = Kokkos::create_mirror_view(b_x); - typename BaseTypeY::HostMirror h_b_y = Kokkos::create_mirror_view(b_y); - - typename ViewTypeA::HostMirror h_A = h_vfA_type::view(h_b_A); - typename ViewTypeX::HostMirror h_x = Kokkos::subview(h_b_x, Kokkos::ALL(), 0); - typename ViewTypeY::HostMirror h_y = Kokkos::subview(h_b_y, Kokkos::ALL(), 0); - - Kokkos::Random_XorShift64_Pool rand_pool( - 13718); - - Kokkos::fill_random(b_x, rand_pool, ScalarX(10)); - Kokkos::fill_random(b_y, rand_pool, ScalarY(10)); - Kokkos::fill_random(b_A, rand_pool, ScalarA(10)); - - Kokkos::deep_copy(b_org_y, b_y); - - Kokkos::deep_copy(h_b_x, b_x); - Kokkos::deep_copy(h_b_y, b_y); - Kokkos::deep_copy(h_b_A, b_A); - - ScalarY expected_result = 0; - - if (mode[0] == 'N') { - for (int i = 0; i < N; i++) { - ScalarY y_i = ScalarY(); - for (int j = 0; j < M; j++) { - y_i += h_A(i, j) * h_x(j); - } - expected_result += (b * h_y(i) + a * y_i) * (b * h_y(i) + a * y_i); - } + template + KOKKOS_INLINE_FUNCTION void operator()(const TeamMember& member) const { + KokkosBlas::Experimental::Gemv::invoke( + member, params::trans, params::alpha, params::A, params::x, + params::beta, params::y); } +}; - char trans = mode[0]; - - // KokkosBlas::gemv(mode,a,A,x,b,y); - Kokkos::parallel_for( - "KokkosBlas::Test::TeamGemm", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { - const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::gemv( - teamMember, trans, a, - Kokkos::subview( - A, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < K - 1) ? (teamId + 1) * team_data_siz : N), - Kokkos::ALL()), - x, b, - Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < K - 1) ? 
(teamId + 1) * team_data_siz : N))); - }); - - ScalarY nonconst_nonconst_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK(nonconst_nonconst_result, expected_result, - eps * expected_result); - - Kokkos::deep_copy(b_y, b_org_y); - - // KokkosBlas::gemv(mode,a,A,c_x,b,y); - Kokkos::parallel_for( - "KokkosBlas::Test::TeamGemm", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { - const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::gemv( - teamMember, trans, a, - Kokkos::subview( - A, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < K - 1) ? (teamId + 1) * team_data_siz : N), - Kokkos::ALL()), - c_x, b, - Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < K - 1) ? (teamId + 1) * team_data_siz : N))); - }); - - ScalarY const_nonconst_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK(const_nonconst_result, expected_result, eps * expected_result); +struct TeamGemvFactory { + template + using functor_type = + TeamGEMVOp; - Kokkos::deep_copy(b_y, b_org_y); + using algorithms = std::tuple; +}; - // KokkosBlas::gemv(mode,a,c_A,c_x,b,y); - Kokkos::parallel_for( - "KokkosBlas::Test::TeamGemm", policy, - KOKKOS_LAMBDA(const team_member &teamMember) { - const int teamId = teamMember.league_rank(); - KokkosBlas::Experimental::gemv( - teamMember, trans, a, - Kokkos::subview( - c_A, - Kokkos::make_pair( - teamId * team_data_siz, - (teamId < K - 1) ? (teamId + 1) * team_data_siz : N), - Kokkos::ALL()), - c_x, b, - Kokkos::subview( - y, Kokkos::make_pair( - teamId * team_data_siz, - (teamId < K - 1) ? (teamId + 1) * team_data_siz : N))); - }); - - ScalarY const_const_result = KokkosBlas::dot(y, y); - EXPECT_NEAR_KK(const_const_result, expected_result, eps * expected_result); -} } // namespace Test -template -int test_team_gemv(const char *mode) { -#if defined(KOKKOSKERNELS_INST_LAYOUTLEFT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ll; - typedef Kokkos::View view_type_b_ll; - typedef Kokkos::View view_type_c_ll; - Test::impl_test_team_gemv(mode, 0, 1024); - Test::impl_test_team_gemv(mode, 13, 1024); - Test::impl_test_team_gemv(mode, 124, 124); - // Test::impl_test_team_gemv(mode,132231,1024); -#endif +#define TEST_TEAM_CASE4(N, A, X, Y, SC) \ + TEST_CASE4(team, TeamGemvFactory, N, A, X, Y, SC) +#define TEST_TEAM_CASE2(N, S, SC) TEST_CASE2(team, TeamGemvFactory, N, S, SC) +#define TEST_TEAM_CASE(N, S) TEST_CASE(team, TeamGemvFactory, N, S) -#if defined(KOKKOSKERNELS_INST_LAYOUTRIGHT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_lr; - typedef Kokkos::View view_type_b_lr; - typedef Kokkos::View view_type_c_lr; - Test::impl_test_team_gemv(mode, 0, 1024); - Test::impl_test_team_gemv(mode, 13, 1024); - Test::impl_test_team_gemv(mode, 124, 124); - // Test::impl_test_team_gemv(mode,132231,1024); +#ifdef KOKKOSKERNELS_TEST_FLOAT +TEST_TEAM_CASE(float, float) #endif -#if defined(KOKKOSKERNELS_INST_LAYOUTSTRIDE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) - typedef Kokkos::View view_type_a_ls; - typedef Kokkos::View view_type_b_ls; - typedef Kokkos::View view_type_c_ls; - Test::impl_test_team_gemv(mode, 0, 1024); - Test::impl_test_team_gemv(mode, 13, 1024); - Test::impl_test_team_gemv(mode, 124, 124); - // Test::impl_test_team_gemv(mode,132231,1024); +#ifdef KOKKOSKERNELS_TEST_DOUBLE +TEST_TEAM_CASE(double, double) #endif -#if 
!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) - Test::impl_test_team_gemv(mode, 124, 124); - Test::impl_test_team_gemv(mode, 124, 124); +#ifdef KOKKOSKERNELS_TEST_COMPLEX_DOUBLE +TEST_TEAM_CASE(complex_double, Kokkos::complex) #endif - return 1; -} - -#if defined(KOKKOSKERNELS_INST_FLOAT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_gemv_float) { - test_team_gemv("N"); -} +#ifdef KOKKOSKERNELS_TEST_COMPLEX_FLOAT +TEST_TEAM_CASE(complex_float, Kokkos::complex) #endif -#if defined(KOKKOSKERNELS_INST_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_gemv_double) { - test_team_gemv("N"); -} +#ifdef KOKKOSKERNELS_TEST_INT +TEST_TEAM_CASE(int, int) #endif -#if defined(KOKKOSKERNELS_INST_COMPLEX_DOUBLE) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_gemv_complex_double) { - test_team_gemv, Kokkos::complex, - Kokkos::complex, TestExecSpace>("N"); -} -#endif +#ifdef KOKKOSKERNELS_TEST_ALL_TYPES +// test mixed scalar types (void -> default alpha/beta) +TEST_TEAM_CASE4(mixed, double, int, float, void) -#if defined(KOKKOSKERNELS_INST_INT) || \ - (!defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS)) -TEST_F(TestCategory, team_gemv_int) { - test_team_gemv("N"); -} +// test arbitrary double alpha/beta with complex values +TEST_TEAM_CASE2(alphabeta, Kokkos::complex, double) #endif -#if !defined(KOKKOSKERNELS_ETI_ONLY) && \ - !defined(KOKKOSKERNELS_IMPL_CHECK_ETI_CALLS) -TEST_F(TestCategory, team_gemv_double_int) { - test_team_gemv("N"); -} -#endif +#undef TEST_TEAM_CASE4 +#undef TEST_TEAM_CASE2 +#undef TEST_TEAM_CASE #endif // Check for lambda availability on CUDA backend diff --git a/unit_test/blas/Test_Blas2_teamvector_gemv.hpp b/unit_test/blas/Test_Blas2_teamvector_gemv.hpp new file mode 100644 index 0000000000..5814541bb2 --- /dev/null +++ b/unit_test/blas/Test_Blas2_teamvector_gemv.hpp @@ -0,0 +1,85 @@ +// Note: Luc Berger-Vergiat 04/14/21 +// This tests uses KOKKOS_LAMBDA so we need +// to make sure that these are enabled in +// the CUDA backend before including this test. 
+#if !defined(TEST_CUDA_BLAS_CPP) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)
+
+#include
+#include  // for test/inst guards
+// Note: include serial gemv before util so it knows if CompactMKL is available
+#include
+#include
+
+namespace Test {
+
+template
+struct TeamVectorGEMVOp : public GemvOpBase {
+  using params = GemvOpBase;
+
+  TeamVectorGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_,
+                   ScalarType beta_, YType y_)
+      : params(trans_, alpha_, A_, x_, beta_, y_) {}
+
+  template
+  KOKKOS_INLINE_FUNCTION void operator()(const TeamMember& member) const {
+    KokkosBlas::Experimental::Gemv::invoke(member, params::trans,
+                                           params::alpha, params::A,
+                                           params::x, params::beta,
+                                           params::y);
+  }
+};
+
+struct TeamVectorGemvFactory {
+  template
+  using functor_type =
+      TeamVectorGEMVOp;
+
+  // no Blocked implementation
+  using algorithms = std::tuple;
+};
+
+}  // namespace Test
+
+#define TEST_TEAMVECTOR_CASE4(N, A, X, Y, SC) \
+  TEST_CASE4(teamvector, TeamVectorGemvFactory, N, A, X, Y, SC)
+#define TEST_TEAMVECTOR_CASE2(N, S, SC) \
+  TEST_CASE2(teamvector, TeamVectorGemvFactory, N, S, SC)
+#define TEST_TEAMVECTOR_CASE(N, S) \
+  TEST_CASE(teamvector, TeamVectorGemvFactory, N, S)
+
+#ifdef KOKKOSKERNELS_TEST_FLOAT
+TEST_TEAMVECTOR_CASE(float, float)
+#endif
+
+#ifdef KOKKOSKERNELS_TEST_DOUBLE
+TEST_TEAMVECTOR_CASE(double, double)
+#endif
+
+#ifdef KOKKOSKERNELS_TEST_COMPLEX_DOUBLE
+TEST_TEAMVECTOR_CASE(complex_double, Kokkos::complex)
+#endif
+
+#ifdef KOKKOSKERNELS_TEST_COMPLEX_FLOAT
+TEST_TEAMVECTOR_CASE(complex_float, Kokkos::complex)
+#endif
+
+#ifdef KOKKOSKERNELS_TEST_INT
+TEST_TEAMVECTOR_CASE(int, int)
+#endif
+
+#ifdef KOKKOSKERNELS_TEST_ALL_TYPES
+// test mixed scalar types (void -> default alpha/beta)
+TEST_TEAMVECTOR_CASE4(mixed, double, int, float, void)
+
+// test arbitrary double alpha/beta with complex values
+TEST_TEAMVECTOR_CASE2(alphabeta, Kokkos::complex, double)
+#endif
+
+#undef TEST_TEAMVECTOR_CASE4
+#undef TEST_TEAMVECTOR_CASE2
+#undef TEST_TEAMVECTOR_CASE
+
+#endif  // Check for lambda availability on CUDA backend
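
The QR, QR-with-column-pivoting, UTV and SolveUTV test functors above all apply the same mechanical change: the include of KokkosBatched_Gemv_Decl.hpp becomes KokkosBlas2_team_gemv_spec.hpp, and the rank-2 matrix-vector products inside the team functors go through KokkosBlas::TeamVectorGemv instead of the batched entry point, with an unchanged argument list. A condensed sketch of that call pattern follows; the template arguments and the Trans/Algo tags are assumptions here (the hunks above elide them), written the way the surrounding batched test files already spell them:

#include "KokkosBlas2_team_gemv_spec.hpp"

// inside a team-level test functor: aa is a rank-2 subview, xx/bb are rank-1
template <typename MemberType, typename MatrixView, typename VectorView>
KOKKOS_INLINE_FUNCTION void apply_gemv(const MemberType &member,
                                       const MatrixView &aa,
                                       const VectorView &xx,
                                       const VectorView &bb) {
  using value_type = typename MatrixView::non_const_value_type;
  const value_type one(1), zero(0);
  // bb = aa * xx; previously dispatched to KokkosBatched::TeamVectorGemv
  KokkosBlas::TeamVectorGemv<MemberType, Trans::NoTranspose,
                             Algo::Gemv::Unblocked>::invoke(member, one, aa,
                                                            xx, zero, bb);
  member.team_barrier();
}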
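
The TEST_CASE* macros added in Test_Blas2_gemv_util.hpp generate one GoogleTest fixture per transpose mode, all backed by the shared ::Test::GEMVTest harness. As an illustration (not part of the patch), TEST_SERIAL_CASE(double, double) boils down to the following after macro expansion, with TestCategory and TestExecSpace supplied by the usual per-backend test headers:

using serial_double_gemv_test =
    ::Test::GEMVTest<::Test::SerialGemvFactory, double, double, double,
                     TestExecSpace, double>;

TEST_F(TestCategory, serial_gemv_nt_double) {
  serial_double_gemv_test::run("N");
}
TEST_F(TestCategory, serial_gemv_t_double) {
  serial_double_gemv_test::run("T");
}
TEST_F(TestCategory, serial_gemv_ct_double) {
  serial_double_gemv_test::run("C");
}

run() then walks run_algorithms, run_layouts, run_view_types and run_sizes, so each registered case sweeps every algorithm listed in the factory's algorithms tuple together with every enabled layout, view type and problem size, including the strided-subview and zero-extent cases.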
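
Adding another GEMV flavor to this framework only requires a functor plus a factory; the GEMVTest harness never names a concrete kernel. The sketch below shows the shape of such an extension with hypothetical names (MyGEMVOp, MyGemvFactory). The template-parameter lists of GemvOpBase, functor_type and KokkosBlas::Experimental::Gemv are assumptions, since the exact lists are elided in the hunks above:

namespace Test {

// hypothetical functor: wraps the kernel under test, state lives in GemvOpBase
template <class AType, class XType, class YType, class Device,
          class ScalarType, class AlgoTag>
struct MyGEMVOp : public GemvOpBase<AType, XType, YType, ScalarType> {
  using params = GemvOpBase<AType, XType, YType, ScalarType>;

  MyGEMVOp(char trans_, ScalarType alpha_, AType A_, XType x_,
           ScalarType beta_, YType y_)
      : params(trans_, alpha_, A_, x_, beta_, y_) {}

  template <typename TeamMember>
  KOKKOS_INLINE_FUNCTION void operator()(const TeamMember &member) const {
    // assumed dispatcher interface: Mode and Algo tags select the kernel
    KokkosBlas::Experimental::Gemv<KokkosBlas::Mode::Team, AlgoTag>::invoke(
        member, params::trans, params::alpha, params::A, params::x,
        params::beta, params::y);
  }
};

// hypothetical factory: names the functor and the algorithm tags to sweep
struct MyGemvFactory {
  template <class AType, class XType, class YType, class Device,
            class ScalarType, class AlgoTag>
  using functor_type =
      MyGEMVOp<AType, XType, YType, Device, ScalarType, AlgoTag>;

  using algorithms = std::tuple<KokkosBlas::Algo::Gemv::Unblocked>;
};

}  // namespace Test

// registers my_gemv_nt_double / my_gemv_t_double / my_gemv_ct_double
TEST_CASE(my, MyGemvFactory, double, double)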
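
The SIMD-vector overloads of fill_random_view exist because Kokkos::fill_random cannot write KokkosBatched::Vector elements directly: the randoms are generated in a flat scalar view and then loaded lane by lane via loadUnaligned. A minimal host-side usage sketch, assuming the new util header name and a host-accessible view (the view and pool names are arbitrary):

#include <Kokkos_Core.hpp>
#include <Kokkos_Random.hpp>
#include <Test_Blas2_gemv_util.hpp>

void fill_simd_example() {
  using vec_t = ::Test::simd_vector<double, 4>;  // Vector<SIMD<double>, 4>
  Kokkos::View<vec_t **, Kokkos::LayoutLeft, Kokkos::HostSpace> A("A", 8, 8);
  Kokkos::Random_XorShift64_Pool<Kokkos::DefaultHostExecutionSpace> pool(13718);

  // generates 8 * 8 * 4 scalars internally and loads them into the SIMD lanes
  Test::fill_random_view(A, pool);
}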