Use float as accumulator for GEMV on half_t
This avoids atomics on half_t, which are currently very slow on CUDA
(they use lock tables, not native atomics).
This also improves the numerical accuracy of the output for all modes
(not just TwoLevel/LayoutLeft, which was using the atomics).
brian-kelley committed Aug 17, 2021
1 parent c151fbb commit 718fe40
Showing 1 changed file with 39 additions and 49 deletions.
src/blas/impl/KokkosBlas2_gemv_impl.hpp (39 additions, 49 deletions)
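Every hunk below revolves around one compile-time type choice, the AccumScalar alias. As a standalone illustration of the pattern (not part of the diff; accum_t and dot_widened are hypothetical names):

#include <type_traits>
#include <Kokkos_Core.hpp>

// Accumulate in float when the value type is half_t, otherwise in the
// value type itself -- the same std::conditional the diff uses for AccumScalar.
template <typename T>
using accum_t = typename std::conditional<
    std::is_same<T, Kokkos::Experimental::half_t>::value, float, T>::type;

// Hypothetical helper: sum products in accum_t<T>, so half_t inputs are
// widened on every multiply-add and narrowed back exactly once.
template <typename T>
KOKKOS_INLINE_FUNCTION T dot_widened(const T* a, const T* b, int n) {
  accum_t<T> sum = 0;
  for (int i = 0; i < n; ++i) {
    sum += accum_t<T>(a[i]) * accum_t<T>(b[i]);
  }
  return T(sum);  // single rounding at the final store
}

The point is that half_t values are widened on every multiply-add, and the result is rounded back to half_t exactly once, at the final store.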
@@ -63,6 +63,8 @@ template<class AViewType,
 struct SingleLevelNontransposeGEMV {
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType = typename YViewType::non_const_value_type;
+  using y_value_type = typename YViewType::non_const_value_type;
+  using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
 
   SingleLevelNontransposeGEMV (const AlphaCoeffType& alpha,
                                const AViewType& A,
@@ -96,17 +98,15 @@ struct SingleLevelNontransposeGEMV {
   KOKKOS_INLINE_FUNCTION void
   operator () (const IndexType& i) const
   {
-    using y_value_type = typename YViewType::non_const_value_type;
-
-    y_value_type y_i;
+    AccumScalar y_i;
     if (betaPreset == 0) {
-      y_i = Kokkos::ArithTraits<y_value_type>::zero ();
+      y_i = Kokkos::ArithTraits<AccumScalar>::zero ();
     }
     else if (betaPreset == 1) {
-      y_i = y_(i);
+      y_i = AccumScalar(y_(i));
     }
     else { // beta_ != 0 and beta != 1
-      y_i = beta_ * y_(i);
+      y_i = beta_ * AccumScalar(y_(i));
     }
 
     const IndexType numCols = A_.extent(1);
@@ -115,12 +115,12 @@
     }
     else if (alphaPreset == 1) {
       for (IndexType j = 0; j < numCols; ++j) {
-        y_i += A_(i,j) * x_(j);
+        y_i += AccumScalar(A_(i,j)) * x_(j);
       }
     }
     else { // alpha_ != 0 and alpha_ != 1
       for (IndexType j = 0; j < numCols; ++j) {
-        y_i += alpha_ * A_(i,j) * x_(j);
+        y_i += alpha_ * AccumScalar(A_(i,j)) * x_(j);
      }
    }
 
@@ -155,8 +155,9 @@ struct SingleLevelTransposeGEMV {
   using y_value_type = typename YViewType::non_const_value_type;
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType = typename YViewType::non_const_value_type;
+  using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
 
-  typedef y_value_type value_type[];
+  typedef AccumScalar value_type[];
   IndexType value_count; // Kokkos needs this for reductions w/ array results
 
   SingleLevelTransposeGEMV (const AlphaCoeffType& alpha,
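The value_type typedef is what makes this pay off for the transpose kernels: Kokkos array-valued reductions take their scratch element type from the functor's value_type and their length from value_count, driving them through init/join/final. A minimal sketch of that protocol with the same widening (ColSums is a hypothetical functor; non-volatile join as in recent Kokkos):

// Sketch only: column sums of a half_t matrix, reduced in float.
struct ColSums {
  using value_type = float[];   // array reduction in the widened type
  int value_count;              // one slot per column; Kokkos reads this
  Kokkos::View<const Kokkos::Experimental::half_t**> A;

  KOKKOS_INLINE_FUNCTION void init(value_type sums) const {
    for (int j = 0; j < value_count; ++j) sums[j] = 0.0f;
  }
  KOKKOS_INLINE_FUNCTION void join(value_type dst, const value_type src) const {
    for (int j = 0; j < value_count; ++j) dst[j] += src[j];
  }
  KOKKOS_INLINE_FUNCTION void operator()(const int i, value_type sums) const {
    for (int j = 0; j < value_count; ++j) sums[j] += static_cast<float>(A(i, j));
  }
};

It would be driven as Kokkos::parallel_reduce(numRows, ColSums{numCols, A}, sums), with sums pointing at value_count floats.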
@@ -192,7 +193,7 @@ struct SingleLevelTransposeGEMV {
   init (value_type y_cur) const
   {
     for (IndexType j = 0; j < value_count; ++j) {
-      y_cur[j] = Kokkos::Details::ArithTraits<y_value_type>::zero ();
+      y_cur[j] = Kokkos::ArithTraits<AccumScalar>::zero ();
     }
   }
 
@@ -208,14 +209,12 @@
   KOKKOS_INLINE_FUNCTION void
   final (value_type y_result) const
   {
-    using Kokkos::Details::ArithTraits;
-
     for (IndexType j = 0; j < value_count; ++j) {
       // Sum into initial y_ values; use beta as a pre-multiplier if nonzero.
       if(betaPreset == 0)
-        y_(j) = y_result[j];
+        y_(j) = y_value_type(alpha_ * y_result[j]);
       else
-        y_(j) = beta_ * y_(j) + y_result[j];
+        y_(j) = y_value_type(beta_ * AccumScalar(y_(j)) + alpha_ * y_result[j]);
     }
   }
 
@@ -226,17 +225,9 @@ struct SingleLevelTransposeGEMV {
     using KAT = ArithTraits<typename AViewType::non_const_value_type>;
 
     const auto x_i = x_(i);
-    if (alphaPreset == 1) {
-      for (IndexType j = 0; j < value_count; ++j) {
-        const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j);
-        y_cur[j] += A_ij * x_i;
-      }
-    }
-    else if (alphaPreset != 0) { // alpha_ != 0 and alpha_ != 1
-      for (IndexType j = 0; j < value_count; ++j) {
-        const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j);
-        y_cur[j] += alpha_ * A_ij * x_i;
-      }
+    for (IndexType j = 0; j < value_count; ++j) {
+      const auto A_ij = conj ? KAT::conj (A_(i,j)) : A_(i,j);
+      y_cur[j] += AccumScalar(A_ij) * x_i;
     }
   }
 
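Note what happened to the alphaPreset fast paths here: with the accumulator widened, contribute() keeps only the raw product loop, and the scaling moved into final() above, so alpha and beta are each applied exactly once per output entry:

// before: per-element scaling inside the reduction
//   y_cur[j] += alpha_ * A_ij * x_i;
// after: raw widened products in the reduction...
//   y_cur[j] += AccumScalar(A_ij) * x_i;
// ...and one alpha (and beta) application at the end, in final():
//   y_(j) = y_value_type(beta_ * AccumScalar(y_(j)) + alpha_ * y_result[j]);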
@@ -494,7 +485,7 @@ struct TwoLevelGEMV {
   using y_value_type = typename YViewType::non_const_value_type;
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType = typename YViewType::non_const_value_type;
-
+  using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
 
   using execution_space = typename AViewType::execution_space;
   using policy_type = Kokkos::TeamPolicy<execution_space>;
@@ -531,30 +522,29 @@ struct TwoLevelGEMV {
   KOKKOS_INLINE_FUNCTION void
   operator () (TwoLevelGEMV_LayoutLeftTag, const member_type& team) const
   {
-    using Kokkos::Details::ArithTraits;
-    using Scalar = typename YViewType::non_const_value_type;
-    using KAT = ArithTraits<Scalar>;
+    using KAT = Kokkos::ArithTraits<y_value_type>;
+    using AKAT = Kokkos::ArithTraits<AccumScalar>;
     //Allocate a Scalar in shared for each thread
-    Scalar* blockResult = (Scalar*) team.team_shmem().get_shmem(32 * sizeof(Scalar));
+    AccumScalar* blockResult = (AccumScalar*) team.team_shmem().get_shmem(32 * sizeof(AccumScalar));
     Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32),
       [&](int i)
       {
-        blockResult[i] = KAT::zero();
+        blockResult[i] = AKAT::zero();
       });
     team.team_barrier();
     //Which block this thread will work on
     int block = team.team_rank() / 32;
     //Which row in the block this thread will work on
     IndexType row = team.league_rank() * 32 + team.team_rank() % 32;
     IndexType blockColStart = columnsPerThread * block;
-    Scalar localSum = KAT::zero();
+    AccumScalar localSum = AKAT::zero();
     //compute local sum
     if(row < (IndexType) A_.extent(0))
     {
       for(IndexType col = blockColStart; col < blockColStart + columnsPerThread && col < A_.extent(1); col++)
       {
         //A access is coalesced, x access is a broadcast
-        localSum += A_(row, col) * x_(col);
+        localSum += AccumScalar(A_(row, col)) * AccumScalar(x_(col));
       }
     }
     //atomically combine local result into shared
@@ -567,9 +557,9 @@
         if(yrow < (IndexType) A_.extent(0))
         {
           if(beta_ == KAT::zero())
-            y_(yrow) = alpha_ * blockResult[i];
+            y_(yrow) = y_value_type(alpha_ * blockResult[i]);
           else
-            y_(yrow) = beta_ * y_(yrow) + alpha_ * blockResult[i];
+            y_(yrow) = y_value_type(beta_ * AccumScalar(y_(yrow)) + alpha_ * blockResult[i]);
         }
       });
   }
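This LayoutLeft kernel is the one that previously issued half_t atomics; with blockResult widened, the atomic combine into shared memory runs on native float atomics. A self-contained toy of that staging pattern (assumes a device backend that accepts a team size of 64; names and sizes are illustrative, and each team simply counts its threads):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using policy = Kokkos::TeamPolicy<>;
    using member = policy::member_type;
    Kokkos::View<float*> out("out", 4);
    Kokkos::parallel_for("stage_in_float",
      policy(4, 64).set_scratch_size(0, Kokkos::PerTeam(32 * sizeof(float))),
      KOKKOS_LAMBDA(const member& team) {
        // carve a 32-entry float staging buffer out of team shared memory
        float* block = (float*)team.team_shmem().get_shmem(32 * sizeof(float));
        Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32),
          [&](int i) { block[i] = 0.0f; });
        team.team_barrier();
        // native float atomics -- no half_t lock-table path
        Kokkos::atomic_add(&block[team.team_rank() % 32], 1.0f);
        team.team_barrier();
        Kokkos::single(Kokkos::PerTeam(team), [&]() {
          float sum = 0.0f;
          for (int i = 0; i < 32; ++i) sum += block[i];
          out(team.league_rank()) = sum;  // 64 threads -> 64.0f per team
        });
      });
  }
  Kokkos::finalize();
}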
Expand All @@ -578,26 +568,25 @@ struct TwoLevelGEMV {
KOKKOS_INLINE_FUNCTION void
operator () (TwoLevelGEMV_LayoutRightTag, const member_type& team) const
{
using Kokkos::Details::ArithTraits;
using KAT = ArithTraits<typename YViewType::non_const_value_type>;
using KAT = Kokkos::ArithTraits<y_value_type>;

const IndexType N = A_.extent(1);
const int i = team.league_rank(); // batch id

// parallel-reduce to compute val += A(:,j)' * x
y_value_type val = KAT::zero();
Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, y_value_type &update ) {
update += A_(i, j) * x_(j);
AccumScalar val;
Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const int j, AccumScalar& update ) {
update += AccumScalar(A_(i, j)) * x_(j);
}, val);

// compute yj = beta*yj + alpha*val
Kokkos::single(Kokkos::PerTeam(team),
[&]()
{
if(beta_ == KAT::zero())
y_(i) = alpha_ * val;
y_(i) = y_value_type(alpha_ * val);
else
y_(i) = beta_ * y_(i) + alpha_ * val;
y_(i) = y_value_type(beta_ * AccumScalar(y_(i)) + alpha_ * val);
});
}

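The LayoutRight path never used atomics, so widening it is purely the accuracy fix: Kokkos takes the reduction value type from the lambda's second parameter, so declaring it AccumScalar (here float) makes every partial sum and join run in float. A standalone sketch (team_dot is a hypothetical device helper):

// Sketch: reduce half_t data in float. Declaring 'update' as float&
// makes every partial sum and join run in float; only the caller
// decides when (or whether) to narrow back to half_t.
KOKKOS_INLINE_FUNCTION
float team_dot(const Kokkos::TeamPolicy<>::member_type& team,
               const Kokkos::View<const Kokkos::Experimental::half_t*>& a,
               const Kokkos::View<const Kokkos::Experimental::half_t*>& x) {
  const int n = int(a.extent(0));
  float val = 0.0f;
  Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team, n),
    [&](const int j, float& update) {
      update += static_cast<float>(a(j)) * static_cast<float>(x(j));  // widen before the multiply
    }, val);
  return val;  // same float on every thread of the team after the reduce
}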
@@ -625,7 +614,7 @@ struct TwoLevelTransposeGEMV {
   using y_value_type = typename YViewType::non_const_value_type;
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType = typename YViewType::non_const_value_type;
-
+  using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
 
   using execution_space = typename AViewType::execution_space;
   using policy_type = Kokkos::TeamPolicy<execution_space>;
@@ -666,21 +655,21 @@ struct TwoLevelTransposeGEMV {
     const int j = team.league_rank(); // batch id
 
     // parallel-reduce to compute val += A(:,j)' * x
-    y_value_type val = KAT_Y::zero();
-    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, M ), [&] ( const int i, y_value_type &update ) {
+    AccumScalar val;
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, M ), [&] ( const int i, AccumScalar& update ) {
       const auto x_i = x_(i);
       const auto A_ij = conj ? KAT_A::conj (A_(i,j)) : A_(i,j);
-      update += A_ij * x_i;
+      update += AccumScalar(A_ij) * x_i;
     }, val);
 
     // compute yj = beta*yj + alpha*val
     Kokkos::single(Kokkos::PerTeam(team),
       [&]()
      {
         if(beta_ == KAT_Y::zero())
-          y_(j) = alpha_ * val;
+          y_(j) = y_value_type(alpha_ * val);
         else
-          y_(j) = beta_ * y_(j) + alpha_ * val;
+          y_(j) = y_value_type(beta_ * AccumScalar(y_(j)) + alpha_ * val);
       });
   }
 
@@ -768,7 +757,8 @@ twoLevelGemv (const char trans[],
     tagged_policy team;
     if(isLayoutLeft)
     {
-      size_t sharedPerTeam = 32 * sizeof(y_value_type);
+      using AccumScalar = typename std::conditional<std::is_same<y_value_type, Kokkos::Experimental::half_t>::value, float, y_value_type>::type;
+      size_t sharedPerTeam = 32 * sizeof(AccumScalar);
       IndexType numTeams = (A.extent(0) + 31) / 32;
       tagged_policy temp(1, 1);
       int teamSize = temp.team_size_max(functor, Kokkos::ParallelForTag());
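One subtlety in this launch-side hunk: the scratch request must match the sizeof the kernel passes to get_shmem, which is why the AccumScalar alias is repeated here instead of sizing by y_value_type:

// sizeof(y_value_type) is 2 for half_t, so sizing by it would request
// 64 bytes per team while the kernel's get_shmem carves out 32 floats
// (128 bytes), leaving the kernel reading past its scratch allocation.
size_t sharedPerTeam = 32 * sizeof(AccumScalar);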