From 718fe40db84fb6619fcff400a43f5321aa083eb9 Mon Sep 17 00:00:00 2001
From: Brian Kelley <bmkelle@sandia.gov>
Date: Tue, 17 Aug 2021 10:28:31 -0600
Subject: [PATCH] Use float as accumulator for GEMV on half_t

This avoid atomics with half_t, which are currently very slow on CUDA
(use lock tables, not native atomics).
This also improves the numerical accuracy of the output for all modes
(not just TwoLevel/LayoutLeft, which was using the atomics).
---
 src/blas/impl/KokkosBlas2_gemv_impl.hpp | 88 +++++++++++--------------
 1 file changed, 39 insertions(+), 49 deletions(-)
diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp
index 1708089dfe..c3f93e3315 100644
--- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp
+++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp
@@ -63,6 +63,8 @@ template<class AViewType,
 struct SingleLevelNontransposeGEMV {
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType  = typename YViewType::non_const_value_type;
+  using y_value_type = typename YViewType::non_const_value_type;
+  using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
 
   SingleLevelNontransposeGEMV (const AlphaCoeffType& alpha,
                                const AViewType& A,
@@ -96,17 +98,15 @@ struct SingleLevelNontransposeGEMV {
   KOKKOS_INLINE_FUNCTION void
   operator () (const IndexType& i) const
   {
-    using y_value_type = typename YViewType::non_const_value_type;
-
-    y_value_type y_i;
+    AccumScalar y_i;
     if (betaPreset == 0) {
-      y_i = Kokkos::ArithTraits<y_value_type>::zero ();
+      y_i = Kokkos::ArithTraits<AccumScalar>::zero ();
     }
     else if (betaPreset == 1) {
-      y_i = y_(i);
+      y_i = AccumScalar(y_(i));
     }
     else { // beta_ != 0 and beta != 1
-      y_i = beta_ * y_(i);
+      y_i = beta_ * AccumScalar(y_(i));
     }
 
     const IndexType numCols = A_.extent(1);
@@ -115,12 +115,12 @@ struct SingleLevelNontransposeGEMV {
     }
     else if (alphaPreset == 1) {
       for (IndexType j = 0; j < numCols; ++j) {
-        y_i += A_(i,j) * x_(j);
+        y_i += AccumScalar(A_(i,j)) * x_(j);
       }
     }
     else { // alpha_ != 0 and alpha_ != 1
       for (IndexType j = 0; j < numCols; ++j) {
-        y_i += alpha_ * A_(i,j) * x_(j);
+        y_i += alpha_ * AccumScalar(A_(i,j)) * x_(j);
       }
     }
 
@@ -155,8 +155,9 @@ struct SingleLevelTransposeGEMV {
   using y_value_type   = typename YViewType::non_const_value_type;
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType  = typename YViewType::non_const_value_type;
+  using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
 
-  typedef y_value_type value_type[];
+  typedef AccumScalar value_type[];
   IndexType value_count; // Kokkos needs this for reductions w/ array results
 
   SingleLevelTransposeGEMV (const AlphaCoeffType& alpha,
@@ -192,7 +193,7 @@ struct SingleLevelTransposeGEMV {
   init (value_type y_cur) const
   {
     for (IndexType j = 0; j < value_count; ++j) {
-      y_cur[j] = Kokkos::Details::ArithTraits<y_value_type>::zero ();
+      y_cur[j] = Kokkos::ArithTraits<AccumScalar>::zero ();
     }
   }
 
@@ -208,14 +209,12 @@ struct SingleLevelTransposeGEMV {
   KOKKOS_INLINE_FUNCTION void
   final (value_type y_result) const
   {
-    using Kokkos::Details::ArithTraits;
-
     for (IndexType j = 0; j < value_count; ++j) {
       // Sum into initial y_ values; use beta as a pre-multiplier if nonzero.
       if(betaPreset == 0)
-        y_(j) = y_result[j];
+        y_(j) = y_value_type(alpha_ * y_result[j]);
       else
-        y_(j) = beta_ * y_(j) + y_result[j];
+        y_(j) = y_value_type(beta_ * AccumScalar(y_(j)) + alpha_ * y_result[j]);
     }
   }
 
@@ -226,17 +225,9 @@ struct SingleLevelTransposeGEMV {
     using KAT = ArithTraits<typename AViewType::non_const_value_type>;
 
     const auto x_i = x_(i);
-    if (alphaPreset == 1) {
-      for (IndexType j = 0; j < value_count; ++j) {
-        const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j);
-        y_cur[j] += A_ij * x_i;
-      }
-    }
-    else if (alphaPreset != 0) { // alpha_ != 0 and alpha_ != 1
-      for (IndexType j = 0; j < value_count; ++j) {
-        const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j);
-        y_cur[j] += alpha_ * A_ij * x_i;
-      }
+    for (IndexType j = 0; j < value_count; ++j) {
+      const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j);
+      y_cur[j] += AccumScalar(A_ij) * x_i;
     }
   }
 
@@ -494,7 +485,7 @@ struct TwoLevelGEMV {
   using y_value_type   = typename YViewType::non_const_value_type;
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType  = typename YViewType::non_const_value_type;
-
+  using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
 
   using execution_space = typename AViewType::execution_space;
   using policy_type = Kokkos::TeamPolicy<execution_space>;
@@ -531,15 +522,14 @@ struct TwoLevelGEMV {
   KOKKOS_INLINE_FUNCTION void
   operator () (TwoLevelGEMV_LayoutLeftTag, const member_type& team) const
   {
-    using Kokkos::Details::ArithTraits;
-    using Scalar = typename YViewType::non_const_value_type;
-    using KAT = ArithTraits<Scalar>;
+    using KAT = Kokkos::ArithTraits<y_value_type>;
+    using AKAT = Kokkos::ArithTraits<AccumScalar>;
     //Allocate a Scalar in shared for each thread
-    Scalar* blockResult = (Scalar*) team.team_shmem().get_shmem(32 * sizeof(Scalar));
+    AccumScalar* blockResult = (AccumScalar*) team.team_shmem().get_shmem(32 * sizeof(AccumScalar));
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32),
     [&](int i)
     {
-      blockResult[i] = KAT::zero();
+      blockResult[i] = AKAT::zero();
     });
     team.team_barrier();
     //Which block this thread will work on
@@ -547,14 +537,14 @@ struct TwoLevelGEMV {
     //Which row in the block this thread will work on
     IndexType row = team.league_rank() * 32 + team.team_rank() % 32;
     IndexType blockColStart = columnsPerThread * block;
-    Scalar localSum = KAT::zero();
+    AccumScalar localSum = AKAT::zero();
     //compute local sum
     if(row < (IndexType) A_.extent(0))
     {
       for(IndexType col = blockColStart; col < blockColStart + columnsPerThread && col < A_.extent(1); col++)
       {
         //A access is coalesced, x access is a broadcast
-        localSum += A_(row, col) * x_(col);
+        localSum += AccumScalar(A_(row, col)) * AccumScalar(x_(col));
       }
     }
     //atomically combine local result into shared
@@ -567,9 +557,9 @@ struct TwoLevelGEMV {
       if(yrow < (IndexType) A_.extent(0))
       {
         if(beta_ == KAT::zero())
-          y_(yrow) = alpha_ * blockResult[i];
+          y_(yrow) = y_value_type(alpha_ * blockResult[i]);
         else
-          y_(yrow) = beta_ * y_(yrow) + alpha_ * blockResult[i];
+          y_(yrow) = y_value_type(beta_ * AccumScalar(y_(yrow)) + alpha_ * blockResult[i]);
       }
     });
   }
@@ -578,16 +568,15 @@ struct TwoLevelGEMV {
   KOKKOS_INLINE_FUNCTION void
   operator () (TwoLevelGEMV_LayoutRightTag, const member_type& team) const
   {
-    using Kokkos::Details::ArithTraits;
-    using KAT = ArithTraits<typename YViewType::non_const_value_type>;
+    using KAT = Kokkos::ArithTraits<y_value_type>;
 
     const IndexType N = A_.extent(1);
     const int i = team.league_rank(); // batch id
 
     // parallel-reduce to compute val += A(:,j)' * x
-    y_value_type val = KAT::zero();
-    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, y_value_type &update ) {
-      update += A_(i, j) * x_(j);
+    AccumScalar val;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, AccumScalar& update ) {
+      update += AccumScalar(A_(i, j)) * x_(j);
     }, val);
 
     // compute yj = beta*yj + alpha*val
@@ -595,9 +584,9 @@ struct TwoLevelGEMV {
     [&]()
     {
       if(beta_ == KAT::zero())
-        y_(i) = alpha_ * val;
+        y_(i) = y_value_type(alpha_ * val);
       else
-        y_(i) = beta_ * y_(i) + alpha_ * val;
+        y_(i) = y_value_type(beta_ * AccumScalar(y_(i)) + alpha_ * val);
     });
   }
 
@@ -625,7 +614,7 @@ struct TwoLevelTransposeGEMV {
   using y_value_type   = typename YViewType::non_const_value_type;
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType  = typename YViewType::non_const_value_type;
-
+  using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
 
   using execution_space = typename AViewType::execution_space;
   using policy_type = Kokkos::TeamPolicy<execution_space>;
@@ -666,11 +655,11 @@ struct TwoLevelTransposeGEMV {
     const int j = team.league_rank(); // batch id
 
     // parallel-reduce to compute val += A(:,j)' * x
-    y_value_type val = KAT_Y::zero();
-    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, M ), [&] ( const int i, y_value_type &update ) {
+    AccumScalar val;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, M ), [&] ( const int i, AccumScalar& update ) {
       const auto x_i = x_(i);
       const auto A_ij = conj ? KAT_A::conj (A_(i,j)) : A_(i,j);
-      update += A_ij * x_i;
+      update += AccumScalar(A_ij) * x_i;
     }, val);
 
     // compute yj = beta*yj + alpha*val
@@ -678,9 +667,9 @@ struct TwoLevelTransposeGEMV {
     [&]()
     {
       if(beta_ == KAT_Y::zero())
-        y_(j) = alpha_ * val;
+        y_(j) = y_value_type(alpha_ * val);
       else
-        y_(j) = beta_ * y_(j) + alpha_ * val;
+        y_(j) = y_value_type(beta_ * AccumScalar(y_(j)) + alpha_ * val);
     });
   }
 
@@ -768,7 +757,8 @@ twoLevelGemv (const char trans[],
     tagged_policy team;
     if(isLayoutLeft)
     {
-      size_t sharedPerTeam = 32 * sizeof(y_value_type);
+      using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
+      size_t sharedPerTeam = 32 * sizeof(AccumScalar);
       IndexType numTeams = (A.extent(0) + 31) / 32;
       tagged_policy temp(1, 1);
       int teamSize = temp.team_size_max(functor, Kokkos::ParallelForTag());