From cd8a389fa408e4e7e33c8ab7ca48cbdf231f1f25 Mon Sep 17 00:00:00 2001
From: Brian Kelley <bmkelle@sandia.gov>
Date: Fri, 16 Apr 2021 16:14:23 -0600
Subject: [PATCH] Add fast two-level mode N GEMV

Very close to cublas in performance, with LayoutLeft or LayoutRight.
(~100 gflops for 1e5 * 1e5 double matrix on V100)
---
 perf_test/blas/CMakeLists.txt                 |   1 +
 perf_test/blas/blas2/CMakeLists.txt           |   7 +
 .../blas/blas2/KokkosBlas2_gemv_perf_test.cpp | 204 ++++++++++++++++
 src/blas/impl/KokkosBlas2_gemv_impl.hpp       | 218 +++++++++++++++++-
 src/blas/impl/KokkosBlas2_gemv_spec.hpp       |  14 +-
 unit_test/blas/Test_Blas2_gemv.hpp            |  26 ++-
 6 files changed, 443 insertions(+), 27 deletions(-)
 create mode 100644 perf_test/blas/blas2/CMakeLists.txt
 create mode 100644 perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp

diff --git a/perf_test/blas/CMakeLists.txt b/perf_test/blas/CMakeLists.txt
index 2d93de0458..4c347f1d83 100644
--- a/perf_test/blas/CMakeLists.txt
+++ b/perf_test/blas/CMakeLists.txt
@@ -1,2 +1,3 @@
 ADD_SUBDIRECTORY(blas)
+ADD_SUBDIRECTORY(blas2)
 ADD_SUBDIRECTORY(blas3)
diff --git a/perf_test/blas/blas2/CMakeLists.txt b/perf_test/blas/blas2/CMakeLists.txt
new file mode 100644
index 0000000000..f69c576cd3
--- /dev/null
+++ b/perf_test/blas/blas2/CMakeLists.txt
@@ -0,0 +1,7 @@
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
+KOKKOSKERNELS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR})
+# Executable for the GEMV performance test, built from the single source file listed below.
+KOKKOSKERNELS_ADD_EXECUTABLE(
+    KokkosBlas2_gemv_perf_test
+    SOURCES KokkosBlas2_gemv_perf_test.cpp
+)
diff --git a/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp
new file mode 100644
index 0000000000..67949af165
--- /dev/null
+++ b/perf_test/blas/blas2/KokkosBlas2_gemv_perf_test.cpp
@@ -0,0 +1,204 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//                        Kokkos v. 3.0
+//       Copyright (2020) National Technology & Engineering
+//               Solutions of Sandia, LLC (NTESS).
+//
+// Under the terms of Contract DE-NA0003525 with NTESS,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#include "KokkosBlas2_gemv.hpp"
+#include <Kokkos_Random.hpp>
+
+struct Params
+{
+  int use_cuda{0};       //CUDA device index + 1; 0 means CUDA was not requested
+  int use_openmp{0};     //OpenMP thread count; 0 means OpenMP was not requested
+  int use_threads{0};    //value given to --threads
+  int m{5000};           //number of matrix rows
+  int n{5000};           //number of matrix columns
+  int repeat{1};         //number of timed GEMV calls
+  bool layoutLeft{true}; //true: LayoutLeft matrix, false: LayoutRight matrix
+};
+
+//Print the command-line usage summary to stderr.
+void print_options(){
+  std::cerr << "Options\n" << std::endl;
+  std::cerr << "\tBACKEND: '--threads [numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]'" << std::endl;
+  std::cerr << "\tIf none selected, serial is used." << std::endl;
+  std::cerr << "\t[Optional] --repeat :: how many times to repeat the timed GEMV (default 1)" << std::endl;
+  std::cerr << "\t[Optional] --layout :: matrix layout ('left' or 'right', default 'left')" << std::endl;
+  std::cerr << "\t[Optional] --m      :: number of rows to generate" << std::endl;
+  std::cerr << "\t[Optional] --n      :: number of cols to generate" << std::endl;
+}
+
+//Parse command-line options into params. Returns 0 on success and 1 on an unrecognized
+//option; exits directly for --help (status 0) and for a bad layout or missing value (status 1).
+int parse_inputs (Params& params, int argc, char **argv){
+  //Value following option argv[i] (advances i). Exits with status 1 when the option is the
+  //last argument, instead of reading past the end of argv.
+  auto nextValue = [&](int& i) -> const char* {
+    if(i + 1 >= argc) {
+      std::cerr << "Missing value for command line argument " << argv[i] << std::endl;
+      print_options();
+      exit(1);  //note: this is before Kokkos::initialize
+    }
+    return argv[++i];
+  };
+
+  //Every option other than --help/-h takes exactly one value.
+  for ( int i = 1 ; i < argc ; ++i ) {
+    if ( 0 == strcasecmp( argv[i] , "--help") || 0 == strcasecmp( argv[i] , "-h" )) {
+      print_options();
+      exit(0);  //note: this is before Kokkos::initialize
+    }
+    else if ( 0 == strcasecmp( argv[i] , "--threads" ) ) params.use_threads = atoi( nextValue(i) );
+    else if ( 0 == strcasecmp( argv[i] , "--openmp" ) ) params.use_openmp = atoi( nextValue(i) );
+    else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) {
+      //stored as device index + 1, so that 0 keeps meaning "CUDA not requested"
+      params.use_cuda = atoi( nextValue(i) ) + 1;
+    }
+    else if ( 0 == strcasecmp( argv[i] , "--layout" ) ) {
+      const char* layout = nextValue(i);
+      if(0 == strcasecmp( layout , "left"))
+        params.layoutLeft = true;
+      else if(0 == strcasecmp( layout , "right"))
+        params.layoutLeft = false;
+      else {
+        std::cerr << "Invalid layout: must be 'left' or 'right'.\n";
+        exit(1);
+      }
+    }
+    else if ( 0 == strcasecmp( argv[i] , "--m" ) ) params.m = atoi( nextValue(i) );
+    else if ( 0 == strcasecmp( argv[i] , "--n" ) ) params.n = atoi( nextValue(i) );
+    else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) params.repeat = atoi( nextValue(i) );
+    else {
+      std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl ;
+      print_options();
+      return 1;
+    }
+  }
+  return 0;
+}
+
+//Time `repeat` "N"-mode GEMV calls on a random m x n matrix in ExecSpace/Layout; print average time and FLOP rate.
+template<typename ExecSpace, typename Layout>
+void run(int m, int n, int repeat) {
+  using Scalar = double;
+  using MemSpace = typename ExecSpace::memory_space;
+  using Device = Kokkos::Device<ExecSpace, MemSpace>;
+  std::cout << "Running GEMV experiment (" << ExecSpace::name() << ")\n";
+  Kokkos::View<Scalar**, Layout, Device> A(Kokkos::ViewAllocateWithoutInitializing("A"), m, n);
+  Kokkos::View<Scalar*, Device> x(Kokkos::ViewAllocateWithoutInitializing("x"), n);
+  Kokkos::View<Scalar*, Device> y(Kokkos::ViewAllocateWithoutInitializing("y"), m);
+  Kokkos::Random_XorShift64_Pool<ExecSpace> pool(123);
+  Kokkos::fill_random(A, pool, 10.0);
+  Kokkos::fill_random(x, pool, 10.0);
+  //Do a warm-up run
+  KokkosBlas::gemv("N", 1.0, A, x, 0.0, y);
+  //Now, start timing
+  Kokkos::fence();
+  Kokkos::Timer timer;
+  for(int i = 0; i < repeat; i++) {
+    KokkosBlas::gemv("N", 1.0, A, x, 0.0, y);
+    ExecSpace().fence();
+  }
+  double total = timer.seconds();
+  double avg = total / repeat;
+  //Each of the m*n matrix entries contributes one multiply and one add.
+  size_t flopsPerRun = (size_t) 2 * m * n;
+  printf("Avg GEMV time: %f s.\n", avg);
+  printf("Avg GEMV FLOP/s: %.3e\n", flopsPerRun / avg);
+}
+
+//Entry point: parses options, initializes Kokkos, runs the GEMV benchmark on every requested
+//backend (Serial when neither OpenMP nor CUDA is requested), then finalizes Kokkos.
+int main (int argc, char ** argv){
+  Params params;
+  if (parse_inputs (params, argc, argv) ) return 1;
+  const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads
+  const int device_id = params.use_cuda - 1;
+
+  Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) );
+
+  bool useOMP = params.use_openmp != 0;
+  bool useCUDA = params.use_cuda != 0;
+  bool useSerial = !useOMP && !useCUDA;
+  //Failures below only record a status so that Kokkos::finalize() always runs before exit.
+  int status = 0;
+
+  if(useOMP)
+  {
+#if defined( KOKKOS_ENABLE_OPENMP )
+    if(params.layoutLeft)
+      run<Kokkos::OpenMP, Kokkos::LayoutLeft>(params.m, params.n, params.repeat);
+    else
+      run<Kokkos::OpenMP, Kokkos::LayoutRight>(params.m, params.n, params.repeat);
+#else
+    std::cout << "ERROR: OpenMP requested, but not available.\n";
+    status = 1;
+#endif
+  }
+  if(useCUDA && status == 0)
+  {
+#if defined( KOKKOS_ENABLE_CUDA )
+    if(params.layoutLeft)
+      run<Kokkos::Cuda, Kokkos::LayoutLeft>(params.m, params.n, params.repeat);
+    else
+      run<Kokkos::Cuda, Kokkos::LayoutRight>(params.m, params.n, params.repeat);
+#else
+    std::cout << "ERROR: CUDA requested, but not available.\n";
+    status = 1;
+#endif
+  }
+  if(useSerial)
+  {
+#if defined( KOKKOS_ENABLE_SERIAL )
+    if(params.layoutLeft)
+      run<Kokkos::Serial, Kokkos::LayoutLeft>(params.m, params.n, params.repeat);
+    else
+      run<Kokkos::Serial, Kokkos::LayoutRight>(params.m, params.n, params.repeat);
+#else
+    std::cout << "ERROR: Serial device requested, but not available.\n";
+    status = 1;
+#endif
+  }
+  Kokkos::finalize();
+  return status;
+}
+
diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp
index db5bc9fbca..3c399278ca 100644
--- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp
+++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp
@@ -46,6 +46,7 @@
 
 #include "KokkosKernels_config.h"
 #include "Kokkos_Core.hpp"
+#include "KokkosKernels_ExecSpaceUtils.hpp"
 #include "Kokkos_ArithTraits.hpp"
 
 namespace KokkosBlas {
@@ -480,6 +481,132 @@ singleLevelGemv (const char trans[],
   }
 }
 
+struct TwoLevelGEMV_LayoutLeftTag {};
+struct TwoLevelGEMV_LayoutRightTag {};
+
+// ---------------------------------------------------------------------------------------------
+// Functor for a two-level parallel_reduce version of GEMV (non-transpose),
+// designed for performance on GPU. Kernel depends on the layout of A.
+// Computes y := beta*y + alpha*A*x; when beta is zero, y is overwritten without being read.
+template<class AViewType,
+         class XViewType,
+         class YViewType,
+         class IndexType = typename AViewType::size_type>
+struct TwoLevelGEMV {
+  using y_value_type   = typename YViewType::non_const_value_type;
+  using AlphaCoeffType = typename AViewType::non_const_value_type;
+  using BetaCoeffType  = typename YViewType::non_const_value_type;
+
+  using execution_space = typename AViewType::execution_space;
+  using policy_type = Kokkos::TeamPolicy<execution_space>;
+  using member_type = typename policy_type::member_type;
+
+  TwoLevelGEMV (const AlphaCoeffType& alpha,
+                         const AViewType& A,
+                         const XViewType& x,
+                         const BetaCoeffType& beta,
+                         const YViewType& y) :
+    alpha_ (alpha), A_ (A), x_ (x), beta_ (beta), y_ (y)
+  {
+    static_assert (Kokkos::Impl::is_view<AViewType>::value,
+                   "AViewType must be a Kokkos::View.");
+    static_assert (Kokkos::Impl::is_view<XViewType>::value,
+                   "XViewType must be a Kokkos::View.");
+    static_assert (Kokkos::Impl::is_view<YViewType>::value,
+                   "YViewType must be a Kokkos::View.");
+    static_assert (static_cast<int> (AViewType::rank) == 2,
+                   "AViewType must have rank 2.");
+    static_assert (static_cast<int> (XViewType::rank) == 1,
+                   "XViewType must have rank 1.");
+    static_assert (static_cast<int> (YViewType::rank) == 1,
+                   "YViewType must have rank 1.");
+    static_assert (std::is_integral<IndexType>::value,
+                   "IndexType must be an integer.");
+  }
+
+public:
+  //LayoutLeft version: 32xK blocks.
+  //  -Each team handles a block of 32 consecutive rows; the team size must be a multiple of 32.
+  //  -Each group of 32 threads handles columnsPerThread consecutive columns sequentially.
+  //  -Per-thread partial sums are then combined per row with atomic adds into team scratch.
+  KOKKOS_INLINE_FUNCTION void
+  operator () (TwoLevelGEMV_LayoutLeftTag, const member_type& team) const
+  {
+    using Kokkos::Details::ArithTraits;
+    using Scalar = typename YViewType::non_const_value_type;
+    using KAT = ArithTraits<Scalar>;
+    //Allocate a Scalar in team scratch for each of the 32 rows of the block
+    Scalar* blockResult = (Scalar*) team.team_shmem().get_shmem(32 * sizeof(Scalar));
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32),
+    [&](int i)
+    {
+      blockResult[i] = KAT::zero();
+    });
+    team.team_barrier();
+    //Which block this thread will work on
+    int block = team.team_rank() / 32;
+    //Which row in the block this thread will work on
+    IndexType row = team.league_rank() * 32 + team.team_rank() % 32;
+    IndexType blockColStart = columnsPerThread * block;
+    Scalar localSum = KAT::zero();
+    //compute local sum. columnsPerThread is rounded up, so trailing column blocks may start
+    //at or past the last column; the bound check must be '<' (not '==') to stay in bounds.
+    if(row < (IndexType) A_.extent(0))
+    {
+      const IndexType numCols = A_.extent(1);
+      //A access is coalesced, x access is a broadcast
+      for(IndexType col = blockColStart; col < blockColStart + columnsPerThread && col < numCols; col++)
+        localSum += A_(row, col) * x_(col);
+    }
+    //atomically combine local result into shared
+    Kokkos::atomic_add(&blockResult[team.team_rank() % 32], localSum);
+    team.team_barrier();
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(team, 32),
+    [&](int i)
+    {
+      IndexType yrow = team.league_rank() * 32 + i;
+      if(yrow >= (IndexType) A_.extent(0)) return;
+      //beta == 0 means "overwrite": y is not read, so stale NaN/Inf cannot leak into the result
+      if(beta_ == KAT::zero())
+        y_[yrow] = alpha_ * blockResult[i];
+      else
+        y_[yrow] = beta_ * y_[yrow] + alpha_ * blockResult[i];
+    });
+  }
+
+  //LayoutRight version: one team per row
+  KOKKOS_INLINE_FUNCTION void
+  operator () (TwoLevelGEMV_LayoutRightTag, const member_type& team) const
+  {
+    using Kokkos::Details::ArithTraits;
+    using YKAT = ArithTraits<y_value_type>;
+    const IndexType N = A_.extent(1);
+    const int i = team.league_rank(); // row of A (and entry of y) owned by this team
+    // parallel-reduce to compute val = A(i,:) * x (column index kept as IndexType to avoid truncation)
+    y_value_type val = YKAT::zero();
+    Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, N ), [&] ( const IndexType j, y_value_type &update ) {
+      update += A_(i, j) * x_(j);
+    }, val);
+    // compute y(i) = beta*y(i) + alpha*val; y(i) is not read when beta is zero
+    Kokkos::single(Kokkos::PerTeam(team),
+    [=]()
+    {
+      if(beta_ == YKAT::zero())
+        y_[i] = alpha_ * val;
+      else
+        y_[i] = beta_ * y_[i] + alpha_ * val;
+    });
+  }
+
+  IndexType columnsPerThread = 0; //columns per 32-thread group (LayoutLeft kernel); assigned by the caller before launch
+private:
+  AlphaCoeffType alpha_;
+  typename AViewType::const_type A_;
+  typename XViewType::const_type x_;
+  BetaCoeffType beta_;
+  YViewType y_;
+};
+
 
 // ---------------------------------------------------------------------------------------------
 // Functor for a two-level parallel_reduce version of (conjugate)
@@ -593,34 +720,66 @@ twoLevelGemv (const char trans[],
 
   using Kokkos::Details::ArithTraits;
   using KAT = ArithTraits<typename AViewType::non_const_value_type>;
+  using YKAT = ArithTraits<typename YViewType::non_const_value_type>;
 
-  const char tr = trans[0];
+  const char tr = toupper(trans[0]);
 
   // The transpose and conjugate transpose cases where A has zero rows
   // need special handling.  These are equivalent to y := beta*y.  We
   // could implement this using KokkosBlas::scal, but we don't want to
   // depend on that or its implementation details.  Instead, we reuse
   // an instantiation of the non-transpose case for alpha=0.
-  if (A.extent(0) == 0 && (tr != 'N' && tr != 'n')) {
-    if (beta == KAT::zero ()) {
+  if (y.extent(0) == 0)
+  {
+    //no entries to update
+    return;
+  }
+  else if (x.extent(0) == 0)
+  {
+    if (beta == YKAT::zero ()) {
       Kokkos::deep_copy (y, KAT::zero ());
     }
-    else if (beta != Kokkos::Details::ArithTraits<BetaCoeffType>::one ()) {
+    else if (beta != YKAT::one ()) {
       // "Fake out" a scal() by using the non-transpose alpha=0,
       // general beta case.  This assumes that the functor doesn't
       // check dimensions.
       using functor_type = SingleLevelNontransposeGEMV<AViewType, XViewType, YViewType,
                                                        0, -1, IndexType>;
       functor_type functor (alpha, A, x, beta, y);
-      Kokkos::parallel_for ("KokkosBlas::gemv[SingleLevel]",range_policy_type (0, A.extent(1)), functor);
+      Kokkos::parallel_for ("KokkosBlas::gemv[SingleLevel]",range_policy_type (0, y.extent(0)), functor);
     }
     return;
   }
 
-  if (tr == 'N' || tr == 'n') {
-    // NOTE: not implemented, so just call single-level version
-    singleLevelGemv<AViewType, XViewType, YViewType, IndexType>
-         (trans, alpha, A, x, beta, y);
+  if (tr == 'N') {
+    constexpr bool isLayoutLeft = std::is_same<typename AViewType::array_layout, Kokkos::LayoutLeft>::value;
+    using layout_tag = typename std::conditional<isLayoutLeft,
+      TwoLevelGEMV_LayoutLeftTag, TwoLevelGEMV_LayoutRightTag>::type;
+    using tagged_policy = Kokkos::TeamPolicy<execution_space, layout_tag>;
+    using functor_type = TwoLevelGEMV<AViewType, XViewType, YViewType, IndexType>;
+    functor_type functor (alpha, A, x, beta, y);
+    tagged_policy team;
+    if(isLayoutLeft)
+    {
+      size_t sharedPerTeam = 32 * sizeof(y_value_type);
+      IndexType numTeams = (A.extent(0) + 31) / 32;
+      tagged_policy temp(1, 1);
+      int teamSize = temp.team_size_max(functor, Kokkos::ParallelForTag());
+      //make sure teamSize is a multiple of 32
+      teamSize -= teamSize % 32;
+      //don't make teamSize larger than what's useful
+      if((size_t) teamSize > 32 * A.extent(1))
+        teamSize = 32 * A.extent(1);
+      int numBlocks = teamSize / 32;
+      functor.columnsPerThread = (A.extent(1) + numBlocks - 1) / numBlocks;
+      team = tagged_policy(numTeams, teamSize).set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam));
+    }
+    else
+    {
+      //LayoutRight: one team per row
+      team = tagged_policy(A.extent(0), Kokkos::AUTO);
+    }
+    Kokkos::parallel_for ("KokkosBlas::gemv[twoLevel]", team, functor);
   }
   else {
     if (alpha == KAT::zero () && beta == KAT::zero ()) {
@@ -630,7 +789,7 @@ twoLevelGemv (const char trans[],
     else if (alpha == KAT::zero () && beta == KAT::one ()) {
       // Do nothing (y := 1 * y)
     }
-    else if (tr == 'T' || tr == 't') {
+    else if (tr == 'T') {
       // transpose, and not conj transpose
       team_policy_type  team (A.extent(1), Kokkos::AUTO);
       using functor_type = TwoLevelTransposeGEMV<AViewType, XViewType, YViewType,
@@ -638,7 +797,7 @@ twoLevelGemv (const char trans[],
       functor_type functor (alpha, A, x, beta, y);
       Kokkos::parallel_for ("KokkosBlas::gemv[twoLevelTranspose]", team, functor);
     }
-    else if (tr == 'C' || tr == 'c' || tr == 'H' || tr == 'h') {
+    else if (tr == 'C' || tr == 'H') {
       // conjugate transpose
       team_policy_type  team (A.extent(1), Kokkos::AUTO);
       using functor_type = TwoLevelTransposeGEMV<AViewType, XViewType, YViewType,
@@ -649,6 +808,43 @@ twoLevelGemv (const char trans[],
   }
 }
 
+//generalGemv: use 1 level (Range) or 2 level (Team) implementation,
+//depending on whether execution space is CPU or GPU. enable_if makes sure
+//unused kernels are not instantiated. This overload (non-GPU) uses the 1 level version.
+template<class AViewType,
+         class XViewType,
+         class YViewType,
+         class IndexType,
+         typename std::enable_if<!KokkosKernels::Impl::kk_is_gpu_exec_space
+           <typename AViewType::execution_space>()>::type* = nullptr>
+void generalGemvImpl (const char trans[],
+                      typename AViewType::const_value_type& alpha,
+                      const AViewType& A,
+                      const XViewType& x,
+                      typename YViewType::const_value_type& beta,
+                      const YViewType& y)
+{
+  //Pass IndexType on explicitly so the caller's index-type choice is honored (it is not deducible from the arguments).
+  singleLevelGemv<AViewType, XViewType, YViewType, IndexType> (trans, alpha, A, x, beta, y);
+}
+
+//GPU overload of generalGemvImpl: uses the 2 level (Team) implementation, forwarding the caller's IndexType.
+template<class AViewType,
+         class XViewType,
+         class YViewType,
+         class IndexType,
+         typename std::enable_if<KokkosKernels::Impl::kk_is_gpu_exec_space
+           <typename AViewType::execution_space>()>::type* = nullptr>
+void generalGemvImpl (const char trans[],
+                      typename AViewType::const_value_type& alpha,
+                      const AViewType& A,
+                      const XViewType& x,
+                      typename YViewType::const_value_type& beta,
+                      const YViewType& y)
+{
+  twoLevelGemv<AViewType, XViewType, YViewType, IndexType> (trans, alpha, A, x, beta, y);
+}
+
 } // namespace Impl
 } // namespace KokkosBlas
 
diff --git a/src/blas/impl/KokkosBlas2_gemv_spec.hpp b/src/blas/impl/KokkosBlas2_gemv_spec.hpp
index 76d98c65bc..da7983b07a 100644
--- a/src/blas/impl/KokkosBlas2_gemv_spec.hpp
+++ b/src/blas/impl/KokkosBlas2_gemv_spec.hpp
@@ -136,22 +136,12 @@ struct GEMV {
     // Prefer int as the index type, but use a larger type if needed.
     if (numRows < static_cast<size_type> (INT_MAX) &&
         numCols < static_cast<size_type> (INT_MAX)) {
-      #if 1
-      twoLevelGemv<AViewType, XViewType, YViewType, int>
+      generalGemvImpl<AViewType, XViewType, YViewType, int>
          (trans, alpha, A, x, beta, y);
-      #else
-      singleLevelGemv<AViewType, XViewType, YViewType, int>
-         (trans, alpha, A, x, beta, y);
-      #endif
     }
     else {
-      #if 1
-      twoLevelGemv<AViewType, XViewType, YViewType, int64_t>
-         (trans, alpha, A, x, beta, y);
-      #else
-      singleLevelGemv<AViewType, XViewType, YViewType, int64_t>
+      generalGemvImpl<AViewType, XViewType, YViewType, int64_t>
          (trans, alpha, A, x, beta, y);
-      #endif
     }
     Kokkos::Profiling::popRegion();
   }
diff --git a/unit_test/blas/Test_Blas2_gemv.hpp b/unit_test/blas/Test_Blas2_gemv.hpp
index 9ae63b5f8f..b7fd1870a9 100644
--- a/unit_test/blas/Test_Blas2_gemv.hpp
+++ b/unit_test/blas/Test_Blas2_gemv.hpp
@@ -26,7 +26,7 @@ namespace Test {
 
     ScalarA alpha = 3;
     ScalarX beta = 5;
-    double eps = (std::is_same<typename Kokkos::ArithTraits<ScalarY>::mag_type, float>::value ? 1e-3 : 1e-10);
+    double eps = (std::is_same<typename Kokkos::ArithTraits<ScalarY>::mag_type, float>::value ? 1e-3 : 3e-10);
 
     int ldx;
     int ldy;
@@ -115,26 +115,35 @@ namespace Test {
 
     KokkosBlas::gemv(mode, alpha, A, x, beta, y);
     Kokkos::deep_copy(h_b_y, b_y);
+    int numErrors = 0;
     for(int i = 0; i < ldy; i++)
     {
-      EXPECT_NEAR_KK(expected(i), h_y(i), eps * expected(i));
+      if(KAT::abs(expected(i) - h_y(i)) > KAT::abs(eps * expected(i)))
+        numErrors++;
     }
+    EXPECT_EQ(numErrors, 0) << "Nonconst input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect";
  
     Kokkos::deep_copy(b_y, b_org_y);
     KokkosBlas::gemv(mode, alpha,A ,c_x, beta, y);
     Kokkos::deep_copy(h_b_y, b_y);
+    numErrors = 0;
     for(int i = 0; i < ldy; i++)
     {
-      EXPECT_NEAR_KK(expected(i), h_y(i), eps);
+      if(KAT::abs(expected(i) - h_y(i)) > KAT::abs(eps * expected(i)))
+        numErrors++;
     }
+    EXPECT_EQ(numErrors, 0) << "Const vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect";
 
     Kokkos::deep_copy(b_y, b_org_y);
     KokkosBlas::gemv(mode, alpha, c_A, c_x, beta, y);
     Kokkos::deep_copy(h_b_y, b_y);
+    numErrors = 0;
     for(int i = 0; i < ldy; i++)
     {
-      EXPECT_NEAR_KK(expected(i), h_y(i), eps);
+      if(KAT::abs(expected(i) - h_y(i)) > KAT::abs(eps * expected(i)))
+        numErrors++;
     }
+    EXPECT_EQ(numErrors, 0) << "Const matrix/vector input, " << M << 'x' << N << ", alpha = " << alpha << ", beta = " << beta << ", mode " << mode << ": gemv incorrect";
   }
 }
 
@@ -156,8 +165,11 @@ int test_gemv(const char* mode) {
   Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,200,10);
   #endif
   Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,0,1024);
+  Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,1024,0);
+  Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,13,13);
   Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,13,1024);
   Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,1024,1024);
+  Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,4321,4321);
   //Test::impl_test_gemv<view_type_a_ll, view_type_b_ll, view_type_c_ll, Device>(mode,132231,1024);
 #endif
 
@@ -166,8 +178,11 @@ int test_gemv(const char* mode) {
   typedef Kokkos::View<ScalarX*, Kokkos::LayoutRight, Device> view_type_b_lr;
   typedef Kokkos::View<ScalarY*, Kokkos::LayoutRight, Device> view_type_c_lr;
   Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,0,1024);
+  Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,1024,0);
+  Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,13,13);
   Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,13,1024);
   Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,1024,1024);
+  Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,4321,4321);
   //Test::impl_test_gemv<view_type_a_lr, view_type_b_lr, view_type_c_lr, Device>(mode,132231,1024);
 #endif
 
@@ -176,8 +191,11 @@ int test_gemv(const char* mode) {
   typedef Kokkos::View<ScalarX*, Kokkos::LayoutStride, Device> view_type_b_ls;
   typedef Kokkos::View<ScalarY*, Kokkos::LayoutStride, Device> view_type_c_ls;
   Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,0,1024);
+  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,1024,0);
+  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,13,13);
   Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,13,1024);
   Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,1024,1024);
+  Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,4321,4321);
   //Test::impl_test_gemv<view_type_a_ls, view_type_b_ls, view_type_c_ls, Device>(mode,132231,1024);
 #endif