From 718fe40db84fb6619fcff400a43f5321aa083eb9 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 17 Aug 2021 10:28:31 -0600 Subject: [PATCH] Use float as accumulator for GEMV on half_t This avoid atomics with half_t, which are currently very slow on CUDA (use lock tables, not native atomics). This also improves the numerical accuracy of the output for all modes (not just TwoLevel/LayoutLeft, which was using the atomics). --- src/blas/impl/KokkosBlas2_gemv_impl.hpp | 88 +++++++++++-------------- 1 file changed, 39 insertions(+), 49 deletions(-) diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp index 1708089dfe..c3f93e3315 100644 --- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp +++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp @@ -63,6 +63,8 @@ template::value, float, y_value_type>::type; SingleLevelNontransposeGEMV (const AlphaCoeffType& alpha, const AViewType& A, @@ -96,17 +98,15 @@ struct SingleLevelNontransposeGEMV { KOKKOS_INLINE_FUNCTION void operator () (const IndexType& i) const { - using y_value_type = typename YViewType::non_const_value_type; - - y_value_type y_i; + AccumScalar y_i; if (betaPreset == 0) { - y_i = Kokkos::ArithTraits::zero (); + y_i = Kokkos::ArithTraits::zero (); } else if (betaPreset == 1) { - y_i = y_(i); + y_i = AccumScalar(y_(i)); } else { // beta_ != 0 and beta != 1 - y_i = beta_ * y_(i); + y_i = beta_ * AccumScalar(y_(i)); } const IndexType numCols = A_.extent(1); @@ -115,12 +115,12 @@ struct SingleLevelNontransposeGEMV { } else if (alphaPreset == 1) { for (IndexType j = 0; j < numCols; ++j) { - y_i += A_(i,j) * x_(j); + y_i += AccumScalar(A_(i,j)) * x_(j); } } else { // alpha_ != 0 and alpha_ != 1 for (IndexType j = 0; j < numCols; ++j) { - y_i += alpha_ * A_(i,j) * x_(j); + y_i += alpha_ * AccumScalar(A_(i,j)) * x_(j); } } @@ -155,8 +155,9 @@ struct SingleLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; + using AccumScalar = typename std::conditional::value, float, y_value_type>::type; - typedef y_value_type value_type[]; + typedef AccumScalar value_type[]; IndexType value_count; // Kokkos needs this for reductions w/ array results SingleLevelTransposeGEMV (const AlphaCoeffType& alpha, @@ -192,7 +193,7 @@ struct SingleLevelTransposeGEMV { init (value_type y_cur) const { for (IndexType j = 0; j < value_count; ++j) { - y_cur[j] = Kokkos::Details::ArithTraits::zero (); + y_cur[j] = Kokkos::ArithTraits::zero (); } } @@ -208,14 +209,12 @@ struct SingleLevelTransposeGEMV { KOKKOS_INLINE_FUNCTION void final (value_type y_result) const { - using Kokkos::Details::ArithTraits; - for (IndexType j = 0; j < value_count; ++j) { // Sum into initial y_ values; use beta as a pre-multiplier if nonzero. if(betaPreset == 0) - y_(j) = y_result[j]; + y_(j) = y_value_type(alpha_ * y_result[j]); else - y_(j) = beta_ * y_(j) + y_result[j]; + y_(j) = y_value_type(beta_ * AccumScalar(y_(j)) + alpha_ * y_result[j]); } } @@ -226,17 +225,9 @@ struct SingleLevelTransposeGEMV { using KAT = ArithTraits; const auto x_i = x_(i); - if (alphaPreset == 1) { - for (IndexType j = 0; j < value_count; ++j) { - const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j); - y_cur[j] += A_ij * x_i; - } - } - else if (alphaPreset != 0) { // alpha_ != 0 and alpha_ != 1 - for (IndexType j = 0; j < value_count; ++j) { - const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j); - y_cur[j] += alpha_ * A_ij * x_i; - } + for (IndexType j = 0; j < value_count; ++j) { + const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j); + y_cur[j] += AccumScalar(A_ij) * x_i; } } @@ -494,7 +485,7 @@ struct TwoLevelGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - + using AccumScalar = typename std::conditional::value, float, y_value_type>::type; using execution_space = typename AViewType::execution_space; using policy_type = Kokkos::TeamPolicy; @@ -531,15 +522,14 @@ struct TwoLevelGEMV { KOKKOS_INLINE_FUNCTION void operator () (TwoLevelGEMV_LayoutLeftTag, const member_type& team) const { - using Kokkos::Details::ArithTraits; - using Scalar = typename YViewType::non_const_value_type; - using KAT = ArithTraits; + using KAT = Kokkos::ArithTraits; + using AKAT = Kokkos::ArithTraits; //Allocate a Scalar in shared for each thread - Scalar* blockResult = (Scalar*) team.team_shmem().get_shmem(32 * sizeof(Scalar)); + AccumScalar* blockResult = (AccumScalar*) team.team_shmem().get_shmem(32 * sizeof(AccumScalar)); Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32), [&](int i) { - blockResult[i] = KAT::zero(); + blockResult[i] = AKAT::zero(); }); team.team_barrier(); //Which block this thread will work on @@ -547,14 +537,14 @@ struct TwoLevelGEMV { //Which row in the block this thread will work on IndexType row = team.league_rank() * 32 + team.team_rank() % 32; IndexType blockColStart = columnsPerThread * block; - Scalar localSum = KAT::zero(); + AccumScalar localSum = AKAT::zero(); //compute local sum if(row < (IndexType) A_.extent(0)) { for(IndexType col = blockColStart; col < blockColStart + columnsPerThread && col < A_.extent(1); col++) { //A access is coalesced, x access is a broadcast - localSum += A_(row, col) * x_(col); + localSum += AccumScalar(A_(row, col)) * AccumScalar(x_(col)); } } //atomically combine local result into shared @@ -567,9 +557,9 @@ struct TwoLevelGEMV { if(yrow < (IndexType) A_.extent(0)) { if(beta_ == KAT::zero()) - y_(yrow) = alpha_ * blockResult[i]; + y_(yrow) = y_value_type(alpha_ * blockResult[i]); else - y_(yrow) = beta_ * y_(yrow) + alpha_ * blockResult[i]; + y_(yrow) = y_value_type(beta_ * AccumScalar(y_(yrow)) + alpha_ * blockResult[i]); } }); } @@ -578,16 +568,15 @@ struct TwoLevelGEMV { KOKKOS_INLINE_FUNCTION void operator () (TwoLevelGEMV_LayoutRightTag, const member_type& team) const { - using Kokkos::Details::ArithTraits; - using KAT = ArithTraits; + using KAT = Kokkos::ArithTraits; const IndexType N = A_.extent(1); const int i = team.league_rank(); // batch id // parallel-reduce to compute val += A(:,j)' * x - y_value_type val = KAT::zero(); - Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, y_value_type &update ) { - update += A_(i, j) * x_(j); + AccumScalar val; + Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, AccumScalar& update ) { + update += AccumScalar(A_(i, j)) * x_(j); }, val); // compute yj = beta*yj + alpha*val @@ -595,9 +584,9 @@ struct TwoLevelGEMV { [&]() { if(beta_ == KAT::zero()) - y_(i) = alpha_ * val; + y_(i) = y_value_type(alpha_ * val); else - y_(i) = beta_ * y_(i) + alpha_ * val; + y_(i) = y_value_type(beta_ * AccumScalar(y_(i)) + alpha_ * val); }); } @@ -625,7 +614,7 @@ struct TwoLevelTransposeGEMV { using y_value_type = typename YViewType::non_const_value_type; using AlphaCoeffType = typename AViewType::non_const_value_type; using BetaCoeffType = typename YViewType::non_const_value_type; - + using AccumScalar = typename std::conditional::value, float, y_value_type>::type; using execution_space = typename AViewType::execution_space; using policy_type = Kokkos::TeamPolicy; @@ -666,11 +655,11 @@ struct TwoLevelTransposeGEMV { const int j = team.league_rank(); // batch id // parallel-reduce to compute val += A(:,j)' * x - y_value_type val = KAT_Y::zero(); - Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, M ), [&] ( const int i, y_value_type &update ) { + AccumScalar val; + Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, M ), [&] ( const int i, AccumScalar& update ) { const auto x_i = x_(i); const auto A_ij = conj ? KAT_A::conj (A_(i,j)) : A_(i,j); - update += A_ij * x_i; + update += AccumScalar(A_ij) * x_i; }, val); // compute yj = beta*yj + alpha*val @@ -678,9 +667,9 @@ struct TwoLevelTransposeGEMV { [&]() { if(beta_ == KAT_Y::zero()) - y_(j) = alpha_ * val; + y_(j) = y_value_type(alpha_ * val); else - y_(j) = beta_ * y_(j) + alpha_ * val; + y_(j) = y_value_type(beta_ * AccumScalar(y_(j)) + alpha_ * val); }); } @@ -768,7 +757,8 @@ twoLevelGemv (const char trans[], tagged_policy team; if(isLayoutLeft) { - size_t sharedPerTeam = 32 * sizeof(y_value_type); + using AccumScalar = typename std::conditional::value, float, y_value_type>::type; + size_t sharedPerTeam = 32 * sizeof(AccumScalar); IndexType numTeams = (A.extent(0) + 31) / 32; tagged_policy temp(1, 1); int teamSize = temp.team_size_max(functor, Kokkos::ParallelForTag());