diff --git a/src/blas/KokkosBlas2_gemv.hpp b/src/blas/KokkosBlas2_gemv.hpp
index 9353428da2..9b8189f994 100644
--- a/src/blas/KokkosBlas2_gemv.hpp
+++ b/src/blas/KokkosBlas2_gemv.hpp
@@ -63,6 +63,9 @@ namespace KokkosBlas {
 /// \tparam AlphaCoeffType Type of input coefficient alpha
 /// \tparam BetaCoeffType Type of input coefficient beta
 ///
+/// \param space [in] execution space instance on which to run the
+///   kernel. This may contain information about which stream to
+///   run on.
 /// \param trans [in] "N" for non-transpose, "T" for transpose, "C"
 ///   for conjugate transpose.  All characters after the first are
 ///   ignored.  This works just like the BLAS routines.
@@ -72,9 +75,10 @@ namespace KokkosBlas {
 /// \param beta [in] Input coefficient of y
 /// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View
 template <class AViewType, class XViewType, class YViewType>
-void gemv(const char trans[], typename AViewType::const_value_type& alpha,
-          const AViewType& A, const XViewType& x,
-          typename YViewType::const_value_type& beta, const YViewType& y) {
+void gemv(const typename AViewType::execution_space& space, const char trans[],
+          typename AViewType::const_value_type& alpha, const AViewType& A,
+          const XViewType& x, typename YViewType::const_value_type& beta,
+          const YViewType& y) {
   static_assert(Kokkos::Impl::is_view<AViewType>::value,
                 "AViewType must be a Kokkos::View.");
   static_assert(Kokkos::Impl::is_view<XViewType>::value,
@@ -144,13 +148,38 @@ void gemv(const char trans[], typename AViewType::const_value_type& alpha,
     const bool eti_spec_avail =
         KokkosBlas::Impl::gemv_eti_spec_avail<AVT, XVT, YVT>::value;
     typedef Impl::GEMV<AVT, XVT, YVT, false, eti_spec_avail> fallback_impl_type;
-    fallback_impl_type::gemv(trans, alpha, A, x, beta, y);
+    fallback_impl_type::gemv(space, trans, alpha, A, x, beta, y);
   } else {
     typedef Impl::GEMV<AVT, XVT, YVT> impl_type;
-    impl_type::gemv(trans, alpha, A, x, beta, y);
+    impl_type::gemv(space, trans, alpha, A, x, beta, y);
   }
 }
 
+/// \brief Dense matrix-vector multiply: y = beta*y + alpha*A*x.
+///
+/// \tparam AViewType Input matrix, as a 2-D Kokkos::View
+/// \tparam XViewType Input vector, as a 1-D Kokkos::View
+/// \tparam YViewType Output vector, as a nonconst 1-D Kokkos::View
+/// \tparam AlphaCoeffType Type of input coefficient alpha
+/// \tparam BetaCoeffType Type of input coefficient beta
+///
+/// \param trans [in] "N" for non-transpose, "T" for transpose, "C"
+///   for conjugate transpose.  All characters after the first are
+///   ignored.  This works just like the BLAS routines.
+/// \param alpha [in] Input coefficient of A*x
+/// \param A [in] Input matrix, as a 2-D Kokkos::View
+/// \param x [in] Input vector, as a 1-D Kokkos::View
+/// \param beta [in] Input coefficient of y
+/// \param y [in/out] Output vector, as a nonconst 1-D Kokkos::View
+template <class AViewType, class XViewType, class YViewType>
+void gemv(const char trans[], typename AViewType::const_value_type& alpha,
+          const AViewType& A, const XViewType& x,
+          typename YViewType::const_value_type& beta, const YViewType& y) {
+  const typename AViewType::execution_space space =
+      typename AViewType::execution_space();
+  gemv(space, trans, alpha, A, x, beta, y);
+}
+
 }  // namespace KokkosBlas
 
 #endif  // KOKKOS_BLAS2_MV_HPP_
diff --git a/src/blas/KokkosBlas3_gemm.hpp b/src/blas/KokkosBlas3_gemm.hpp
index 52e02e8e60..1e26516f01 100644
--- a/src/blas/KokkosBlas3_gemm.hpp
+++ b/src/blas/KokkosBlas3_gemm.hpp
@@ -67,10 +67,10 @@ namespace Impl {
 // cuBLAS.
 template <class AViewType, class BViewType, class CViewType>
 bool gemv_based_gemm(
-    const char transA[], const char transB[],
-    typename AViewType::const_value_type& alpha, const AViewType& A,
-    const BViewType& B, typename CViewType::const_value_type& beta,
-    const CViewType& C,
+    const typename CViewType::execution_space& space, const char transA[],
+    const char transB[], typename AViewType::const_value_type& alpha,
+    const AViewType& A, const BViewType& B,
+    typename CViewType::const_value_type& beta, const CViewType& C,
     typename std::enable_if<!std::is_same<typename BViewType::array_layout,
                                           Kokkos::LayoutStride>::value &&
                             !std::is_same<typename CViewType::array_layout,
@@ -91,7 +91,7 @@ bool gemv_based_gemm(
                  typename CViewType::device_type,
                  Kokkos::MemoryTraits<Kokkos::Unmanaged>>
         Cvec(C.data(), C.extent(0));
-    KokkosBlas::gemv("N", alpha, A, Bvec, beta, Cvec);
+    KokkosBlas::gemv(space, "N", alpha, A, Bvec, beta, Cvec);
     return true;
   }
   return false;
@@ -102,6 +102,7 @@ bool gemv_based_gemm(
 // tests.
 template <class AViewType, class BViewType, class CViewType>
 bool gemv_based_gemm(
+    const typename CViewType::execution_space& /*space*/,
     const char /*transA*/[], const char /*transB*/[],
     typename AViewType::const_value_type& /*alpha*/, const AViewType& /*A*/,
     const BViewType& /*B*/, typename CViewType::const_value_type& /*beta*/,
@@ -121,6 +122,7 @@ bool gemv_based_gemm(
 /// \tparam BViewType Input matrix, as a 2-D Kokkos::View
 /// \tparam CViewType Output matrix, as a nonconst 2-D Kokkos::View
 ///
+/// \param space [in] an execution space instance
 /// \param transA [in] "N" for non-transpose, "T" for transpose, "C"
 ///   for conjugate transpose.  All characters after the first are
 ///   ignored.  This works just like the BLAS routines.
@@ -133,10 +135,10 @@ bool gemv_based_gemm(
 /// \param beta [in] Input coefficient of C
 /// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View
 template <class AViewType, class BViewType, class CViewType>
-void gemm(const char transA[], const char transB[],
-          typename AViewType::const_value_type& alpha, const AViewType& A,
-          const BViewType& B, typename CViewType::const_value_type& beta,
-          const CViewType& C) {
+void gemm(const typename CViewType::execution_space& space, const char transA[],
+          const char transB[], typename AViewType::const_value_type& alpha,
+          const AViewType& A, const BViewType& B,
+          typename CViewType::const_value_type& beta, const CViewType& C) {
 #if (KOKKOSKERNELS_DEBUG_LEVEL > 0)
   static_assert(Kokkos::Impl::is_view<AViewType>::value,
                 "AViewType must be a Kokkos::View.");
@@ -203,7 +205,8 @@ void gemm(const char transA[], const char transB[],
   }
 
   // Check if gemv code path is allowed and profitable, and if so run it.
-  if (Impl::gemv_based_gemm(transA, transB, alpha, A, B, beta, C)) return;
+  if (Impl::gemv_based_gemm(space, transA, transB, alpha, A, B, beta, C))
+    return;
 
   // Minimize the number of Impl::GEMM instantiations, by
   // standardizing on particular View specializations for its template
@@ -222,7 +225,34 @@ void gemm(const char transA[], const char transB[],
                        Kokkos::MemoryTraits<Kokkos::Unmanaged>>
       CVT;
   typedef Impl::GEMM<AVT, BVT, CVT> impl_type;
-  impl_type::gemm(transA, transB, alpha, A, B, beta, C);
+  impl_type::gemm(space, transA, transB, alpha, A, B, beta, C);
+}
+
+/// \brief Dense matrix-matrix multiply: C = beta*C + alpha*op(A)*op(B).
+///
+/// \tparam AViewType Input matrix, as a 2-D Kokkos::View
+/// \tparam BViewType Input matrix, as a 2-D Kokkos::View
+/// \tparam CViewType Output matrix, as a nonconst 2-D Kokkos::View
+///
+/// \param transA [in] "N" for non-transpose, "T" for transpose, "C"
+///   for conjugate transpose.  All characters after the first are
+///   ignored.  This works just like the BLAS routines.
+/// \param transB [in] "N" for non-transpose, "T" for transpose, "C"
+///   for conjugate transpose.  All characters after the first are
+///   ignored.  This works just like the BLAS routines.
+/// \param alpha [in] Input coefficient of A*x
+/// \param A [in] Input matrix, as a 2-D Kokkos::View
+/// \param B [in] Input matrix, as a 2-D Kokkos::View
+/// \param beta [in] Input coefficient of C
+/// \param C [in/out] Output vector, as a nonconst 2-D Kokkos::View
+template <class AViewType, class BViewType, class CViewType>
+void gemm(const char transA[], const char transB[],
+          typename AViewType::const_value_type& alpha, const AViewType& A,
+          const BViewType& B, typename CViewType::const_value_type& beta,
+          const CViewType& C) {
+  const typename CViewType::execution_space space =
+      typename CViewType::execution_space();
+  gemm(space, transA, transB, alpha, A, B, beta, C);
 }
 
 }  // namespace KokkosBlas
diff --git a/src/blas/impl/KokkosBlas2_gemv_impl.hpp b/src/blas/impl/KokkosBlas2_gemv_impl.hpp
index b3b72bd2fe..ba8389e559 100644
--- a/src/blas/impl/KokkosBlas2_gemv_impl.hpp
+++ b/src/blas/impl/KokkosBlas2_gemv_impl.hpp
@@ -228,7 +228,8 @@ struct SingleLevelTransposeGEMV {
 // Single-level parallel version of GEMV.
 template <class AViewType, class XViewType, class YViewType,
           class IndexType = typename AViewType::size_type>
-void singleLevelGemv(const char trans[],
+void singleLevelGemv(const typename AViewType::execution_space& space,
+                     const char trans[],
                      typename AViewType::const_value_type& alpha,
                      const AViewType& A, const XViewType& x,
                      typename YViewType::const_value_type& beta,
@@ -255,7 +256,7 @@ void singleLevelGemv(const char trans[],
   using AlphaCoeffType = typename AViewType::non_const_value_type;
   using BetaCoeffType  = typename YViewType::non_const_value_type;
 
-  policy_type range(0, A.extent(0));
+  policy_type range(space, 0, A.extent(0));
   const char tr = trans[0];
 
   // The transpose and conjugate transpose cases where A has zero rows
@@ -666,7 +667,8 @@ struct TwoLevelTransposeGEMV {
 // Two-level parallel version of GEMV.
 template <class AViewType, class XViewType, class YViewType,
           class IndexType = typename AViewType::size_type>
-void twoLevelGemv(const char trans[],
+void twoLevelGemv(const typename AViewType::execution_space& space,
+                  const char trans[],
                   typename AViewType::const_value_type& alpha,
                   const AViewType& A, const XViewType& x,
                   typename YViewType::const_value_type& beta,
@@ -717,7 +719,7 @@ void twoLevelGemv(const char trans[],
                                       IndexType>;
       functor_type functor(alpha, A, x, beta, y);
       Kokkos::parallel_for("KokkosBlas::gemv[SingleLevel]",
-                           range_policy_type(0, y.extent(0)), functor);
+                           range_policy_type(space, 0, y.extent(0)), functor);
     }
     return;
   }
@@ -747,11 +749,11 @@ void twoLevelGemv(const char trans[],
       if ((size_t)teamSize > 32 * A.extent(1)) teamSize = 32 * A.extent(1);
       int numBlocks            = teamSize / 32;
       functor.columnsPerThread = (A.extent(1) + numBlocks - 1) / numBlocks;
-      team                     = tagged_policy(numTeams, teamSize)
+      team                     = tagged_policy(space, numTeams, teamSize)
                  .set_scratch_size(0, Kokkos::PerTeam(sharedPerTeam));
     } else {
       // LayoutRight: one team per row
-      team = tagged_policy(A.extent(0), Kokkos::AUTO);
+      team = tagged_policy(space, A.extent(0), Kokkos::AUTO);
     }
     Kokkos::parallel_for("KokkosBlas::gemv[twoLevel]", team, functor);
   } else {
@@ -762,7 +764,7 @@ void twoLevelGemv(const char trans[],
       // Do nothing (y := 1 * y)
     } else if (tr == 'T') {
       // transpose, and not conj transpose
-      team_policy_type team(A.extent(1), Kokkos::AUTO);
+      team_policy_type team(space, A.extent(1), Kokkos::AUTO);
       using functor_type = TwoLevelTransposeGEMV<AViewType, XViewType,
                                                  YViewType, false, IndexType>;
       functor_type functor(alpha, A, x, beta, y);
@@ -770,7 +772,7 @@ void twoLevelGemv(const char trans[],
                            functor);
     } else if (tr == 'C' || tr == 'H') {
       // conjugate transpose
-      team_policy_type team(A.extent(1), Kokkos::AUTO);
+      team_policy_type team(space, A.extent(1), Kokkos::AUTO);
       using functor_type = TwoLevelTransposeGEMV<AViewType, XViewType,
                                                  YViewType, true, IndexType>;
       functor_type functor(alpha, A, x, beta, y);
@@ -786,23 +788,25 @@ void twoLevelGemv(const char trans[],
 template <class AViewType, class XViewType, class YViewType, class IndexType,
           typename std::enable_if<!KokkosKernels::Impl::kk_is_gpu_exec_space<
               typename AViewType::execution_space>()>::type* = nullptr>
-void generalGemvImpl(const char trans[],
+void generalGemvImpl(const typename AViewType::execution_space& space,
+                     const char trans[],
                      typename AViewType::const_value_type& alpha,
                      const AViewType& A, const XViewType& x,
                      typename YViewType::const_value_type& beta,
                      const YViewType& y) {
-  singleLevelGemv(trans, alpha, A, x, beta, y);
+  singleLevelGemv(space, trans, alpha, A, x, beta, y);
 }
 
 template <class AViewType, class XViewType, class YViewType, class IndexType,
           typename std::enable_if<KokkosKernels::Impl::kk_is_gpu_exec_space<
               typename AViewType::execution_space>()>::type* = nullptr>
-void generalGemvImpl(const char trans[],
+void generalGemvImpl(const typename AViewType::execution_space& space,
+                     const char trans[],
                      typename AViewType::const_value_type& alpha,
                      const AViewType& A, const XViewType& x,
                      typename YViewType::const_value_type& beta,
                      const YViewType& y) {
-  twoLevelGemv(trans, alpha, A, x, beta, y);
+  twoLevelGemv(space, trans, alpha, A, x, beta, y);
 }
 
 }  // namespace Impl
diff --git a/src/blas/impl/KokkosBlas2_gemv_spec.hpp b/src/blas/impl/KokkosBlas2_gemv_spec.hpp
index 68c90a3e72..e00b3ae283 100644
--- a/src/blas/impl/KokkosBlas2_gemv_spec.hpp
+++ b/src/blas/impl/KokkosBlas2_gemv_spec.hpp
@@ -101,7 +101,8 @@ template <class AViewType, class XViewType, class YViewType,
           bool eti_spec_avail =
               gemv_eti_spec_avail<AViewType, XViewType, YViewType>::value>
 struct GEMV {
-  static void gemv(const char trans[],
+  static void gemv(const typename AViewType::execution_space& space,
+                   const char trans[],
                    typename AViewType::const_value_type& alpha,
                    const AViewType& A, const XViewType& x,
                    typename YViewType::const_value_type& beta,
@@ -130,11 +131,11 @@ struct GEMV {
     // Prefer int as the index type, but use a larger type if needed.
     if (numRows < static_cast<size_type>(INT_MAX) &&
         numCols < static_cast<size_type>(INT_MAX)) {
-      generalGemvImpl<AViewType, XViewType, YViewType, int>(trans, alpha, A, x,
-                                                            beta, y);
+      generalGemvImpl<AViewType, XViewType, YViewType, int>(space, trans, alpha,
+                                                            A, x, beta, y);
     } else {
-      generalGemvImpl<AViewType, XViewType, YViewType, int64_t>(trans, alpha, A,
-                                                                x, beta, y);
+      generalGemvImpl<AViewType, XViewType, YViewType, int64_t>(
+          space, trans, alpha, A, x, beta, y);
     }
     Kokkos::Profiling::popRegion();
   }
diff --git a/src/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp
index 67e81ea737..29b2129710 100644
--- a/src/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp
+++ b/src/blas/impl/KokkosBlas3_gemm_dotbased_impl.hpp
@@ -95,7 +95,7 @@ struct DotBasedGEMM {
         numCcols(C.extent(1)),
         dotSize(A.extent(0)) {}
 
-  void run(bool conjugateTranspose) {
+  void run(const typename CV::execution_space& space, bool conjugateTranspose) {
     // NOTE: these workPerTeam and approxNumTeams were used for TPL CUBLAS,
     //       and may need to be retuned for other architectures
     constexpr size_C workPerTeam = 4096;       // Amount of work per team
@@ -143,11 +143,12 @@ struct DotBasedGEMM {
 
     // Multiply alpha*A^TB and add it to beta*C
     if (conjugateTranspose) {
-      Kokkos::TeamPolicy<TagMultCT, ExecSpace> policyMult(numTeams,
+      Kokkos::TeamPolicy<TagMultCT, ExecSpace> policyMult(space, numTeams,
                                                           Kokkos::AUTO);
       Kokkos::parallel_for("Perform Dot Product Based GEMM", policyMult, *this);
     } else {
-      Kokkos::TeamPolicy<TagMult, ExecSpace> policyMult(numTeams, Kokkos::AUTO);
+      Kokkos::TeamPolicy<TagMult, ExecSpace> policyMult(space, numTeams,
+                                                        Kokkos::AUTO);
       Kokkos::parallel_for("Perform Dot Product Based GEMM", policyMult, *this);
     }
   }
diff --git a/src/blas/impl/KokkosBlas3_gemm_impl.hpp b/src/blas/impl/KokkosBlas3_gemm_impl.hpp
index 56e6ede229..2057c8d8ad 100644
--- a/src/blas/impl/KokkosBlas3_gemm_impl.hpp
+++ b/src/blas/impl/KokkosBlas3_gemm_impl.hpp
@@ -631,7 +631,8 @@ struct GEMMImpl {
     beta          = beta_;
   }
 
-  void run(int team_size, int vector_length, int scr_level) {
+  void run(const ExecSpace& space, int team_size, int vector_length,
+           int scr_level) {
     scratch_level           = scr_level;
     int scratch_memory_size = ViewTypeAScratch::shmem_size() +
                               ViewTypeBScratch::shmem_size() +
@@ -645,10 +646,10 @@ struct GEMMImpl {
     // that problem but I'm not sure if that it a good perf
     // parameter or why it is set to 2 for Cuda?
     Kokkos::TeamPolicy<ExecSpace, Kokkos::LaunchBounds<384, 0>> policy(
-        num_blocks_0 * num_blocks_1, team_size, vector_length);
+        space, num_blocks_0 * num_blocks_1, team_size, vector_length);
 #else
     Kokkos::TeamPolicy<ExecSpace, Kokkos::LaunchBounds<384, 2>> policy(
-        num_blocks_0 * num_blocks_1, team_size, vector_length);
+        space, num_blocks_0 * num_blocks_1, team_size, vector_length);
 #endif
 
     Kokkos::parallel_for(
diff --git a/src/blas/impl/KokkosBlas3_gemm_spec.hpp b/src/blas/impl/KokkosBlas3_gemm_spec.hpp
index f311a5ce0f..9494207d29 100644
--- a/src/blas/impl/KokkosBlas3_gemm_spec.hpp
+++ b/src/blas/impl/KokkosBlas3_gemm_spec.hpp
@@ -118,7 +118,8 @@ template <class AViewType, class BViewType, class CViewType,
           bool eti_spec_avail =
               gemm_eti_spec_avail<AViewType, BViewType, CViewType>::value>
 struct GEMM {
-  static void gemm(const char transA[], const char transB[],
+  static void gemm(const typename CViewType::execution_space& space,
+                   const char transA[], const char transB[],
                    typename AViewType::const_value_type& alpha,
                    const AViewType& A, const BViewType& B,
                    typename CViewType::const_value_type& beta,
@@ -174,7 +175,7 @@ struct GEMM {
       bool A_is_conj = ((transA[0] == 'C') || (transA[0] == 'c'));
       DotBasedGEMM<ExecSpace, AViewType, BViewType, CViewType> dotBasedGemm(
           alpha, A, B, beta, C);
-      dotBasedGemm.run(A_is_conj);
+      dotBasedGemm.run(space, A_is_conj);
 
     } else {
       // Define Blocking sizes (this will be used for scratch spaces)
@@ -233,7 +234,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 0, 0>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
       if ((transA[0] == 'T' || transA[0] == 't') &&
           (transB[0] == 'N' || transB[0] == 'n')) {
@@ -241,7 +242,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 1, 0>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
       if ((transA[0] == 'C' || transA[0] == 'c') &&
           (transB[0] == 'N' || transB[0] == 'n')) {
@@ -249,7 +250,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 2, 0>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
       if ((transA[0] == 'N' || transA[0] == 'n') &&
           (transB[0] == 'T' || transB[0] == 't')) {
@@ -257,7 +258,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 0, 1>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
       if ((transA[0] == 'T' || transA[0] == 't') &&
           (transB[0] == 'T' || transB[0] == 't')) {
@@ -265,7 +266,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 1, 1>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
       if ((transA[0] == 'C' || transA[0] == 'c') &&
           (transB[0] == 'T' || transB[0] == 't')) {
@@ -273,7 +274,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 2, 1>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
       if ((transA[0] == 'N' || transA[0] == 'n') &&
           (transB[0] == 'C' || transB[0] == 'c')) {
@@ -281,7 +282,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 0, 2>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
       if ((transA[0] == 'T' || transA[0] == 't') &&
           (transB[0] == 'C' || transB[0] == 'c')) {
@@ -289,7 +290,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 1, 2>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
       if ((transA[0] == 'C' || transA[0] == 'c') &&
           (transB[0] == 'C' || transB[0] == 'c')) {
@@ -297,7 +298,7 @@ struct GEMM {
                                    AViewType, BViewType, CViewType, blockA0,
                                    blockA1, blockB1, 2, 2>
             gemm(alpha, A, B, beta, C);
-        gemm.run(team_size, vector_length, scratch_level);
+        gemm.run(space, team_size, vector_length, scratch_level);
       }
     }
     Kokkos::Profiling::popRegion();
diff --git a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp
index 3db1f3a763..1eb85e7f85 100644
--- a/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp
+++ b/src/impl/tpls/KokkosBlas2_gemv_tpl_spec_decl.hpp
@@ -71,44 +71,45 @@ namespace Impl {
     transa = 'C';                                                            \
   }
 
-#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,       \
-                               ETI_SPEC_AVAIL)                             \
-  template <class ExecSpace>                                               \
-  struct GEMV<                                                             \
-      Kokkos::View<const double**, LAYOUTA,                                \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                   \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      Kokkos::View<const double*, LAYOUTX,                                 \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                   \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      Kokkos::View<double*, LAYOUTY, Kokkos::Device<ExecSpace, MEM_SPACE>, \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      true, ETI_SPEC_AVAIL> {                                              \
-    typedef double SCALAR;                                                 \
-    typedef Kokkos::View<const SCALAR**, LAYOUTA,                          \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        AViewType;                                                         \
-    typedef Kokkos::View<const SCALAR*, LAYOUTX,                           \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        XViewType;                                                         \
-    typedef Kokkos::View<SCALAR*, LAYOUTY,                                 \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        YViewType;                                                         \
-                                                                           \
-    static void gemv(const char trans[],                                   \
-                     typename AViewType::const_value_type& alpha,          \
-                     const AViewType& A, const XViewType& X,               \
-                     typename YViewType::const_value_type& beta,           \
-                     const YViewType& Y) {                                 \
-      Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]");  \
-      KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA);                            \
-      HostBlas<double>::gemv(transa, M, N, alpha, A.data(), LDA, X.data(), \
-                             one, beta, Y.data(), one);                    \
-      Kokkos::Profiling::popRegion();                                      \
-    }                                                                      \
+#define KOKKOSBLAS2_DGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,         \
+                               ETI_SPEC_AVAIL)                               \
+  template <class ExecSpace>                                                 \
+  struct GEMV<                                                               \
+      Kokkos::View<const double**, LAYOUTA,                                  \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<const double*, LAYOUTX,                                   \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      Kokkos::View<double*, LAYOUTY, Kokkos::Device<ExecSpace, MEM_SPACE>,   \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                \
+      true, ETI_SPEC_AVAIL> {                                                \
+    typedef double SCALAR;                                                   \
+    typedef Kokkos::View<const SCALAR**, LAYOUTA,                            \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        AViewType;                                                           \
+    typedef Kokkos::View<const SCALAR*, LAYOUTX,                             \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        XViewType;                                                           \
+    typedef Kokkos::View<SCALAR*, LAYOUTY,                                   \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        YViewType;                                                           \
+                                                                             \
+    static void gemv(const typename AViewType::execution_space& /* space */, \
+                     const char trans[],                                     \
+                     typename AViewType::const_value_type& alpha,            \
+                     const AViewType& A, const XViewType& X,                 \
+                     typename YViewType::const_value_type& beta,             \
+                     const YViewType& Y) {                                   \
+      Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_BLAS,double]");    \
+      KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA);                              \
+      HostBlas<double>::gemv(transa, M, N, alpha, A.data(), LDA, X.data(),   \
+                             one, beta, Y.data(), one);                      \
+      Kokkos::Profiling::popRegion();                                        \
+    }                                                                        \
   };
 
 #define KOKKOSBLAS2_SGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,           \
@@ -138,7 +139,8 @@ namespace Impl {
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
         YViewType;                                                             \
                                                                                \
-    static void gemv(const char trans[],                                       \
+    static void gemv(const typename AViewType::execution_space& /* space */,   \
+                     const char trans[],                                       \
                      typename AViewType::const_value_type& alpha,              \
                      const AViewType& A, const XViewType& X,                   \
                      typename YViewType::const_value_type& beta,               \
@@ -178,7 +180,8 @@ namespace Impl {
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
         YViewType;                                                           \
                                                                              \
-    static void gemv(const char trans[],                                     \
+    static void gemv(const typename AViewType::execution_space& /* space */, \
+                     const char trans[],                                     \
                      typename AViewType::const_value_type& alpha,            \
                      const AViewType& A, const XViewType& X,                 \
                      typename YViewType::const_value_type& beta,             \
@@ -196,49 +199,50 @@ namespace Impl {
     }                                                                        \
   };
 
-#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,        \
-                               ETI_SPEC_AVAIL)                              \
-  template <class ExecSpace>                                                \
-  struct GEMV<Kokkos::View<const Kokkos::complex<float>**, LAYOUTA,         \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,            \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,       \
-              Kokkos::View<const Kokkos::complex<float>*, LAYOUTX,          \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,            \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,       \
-              Kokkos::View<Kokkos::complex<float>*, LAYOUTY,                \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,            \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,       \
-              true, ETI_SPEC_AVAIL> {                                       \
-    typedef Kokkos::complex<float> SCALAR;                                  \
-    typedef Kokkos::View<const SCALAR**, LAYOUTA,                           \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        AViewType;                                                          \
-    typedef Kokkos::View<const SCALAR*, LAYOUTX,                            \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        XViewType;                                                          \
-    typedef Kokkos::View<SCALAR*, LAYOUTY,                                  \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        YViewType;                                                          \
-                                                                            \
-    static void gemv(const char trans[],                                    \
-                     typename AViewType::const_value_type& alpha,           \
-                     const AViewType& A, const XViewType& X,                \
-                     typename YViewType::const_value_type& beta,            \
-                     const YViewType& Y) {                                  \
-      Kokkos::Profiling::pushRegion(                                        \
-          "KokkosBlas::gemv[TPL_BLAS,complex<float>]");                     \
-      KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA);                             \
-      const std::complex<float> alpha_val = alpha, beta_val = beta;         \
-      HostBlas<std::complex<float> >::gemv(                                 \
-          transa, M, N, alpha_val,                                          \
-          reinterpret_cast<const std::complex<float>*>(A.data()), LDA,      \
-          reinterpret_cast<const std::complex<float>*>(X.data()), one,      \
-          beta_val, reinterpret_cast<std::complex<float>*>(Y.data()), one); \
-      Kokkos::Profiling::popRegion();                                       \
-    }                                                                       \
+#define KOKKOSBLAS2_CGEMV_BLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,         \
+                               ETI_SPEC_AVAIL)                               \
+  template <class ExecSpace>                                                 \
+  struct GEMV<Kokkos::View<const Kokkos::complex<float>**, LAYOUTA,          \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,             \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,        \
+              Kokkos::View<const Kokkos::complex<float>*, LAYOUTX,           \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,             \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,        \
+              Kokkos::View<Kokkos::complex<float>*, LAYOUTY,                 \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,             \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,        \
+              true, ETI_SPEC_AVAIL> {                                        \
+    typedef Kokkos::complex<float> SCALAR;                                   \
+    typedef Kokkos::View<const SCALAR**, LAYOUTA,                            \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        AViewType;                                                           \
+    typedef Kokkos::View<const SCALAR*, LAYOUTX,                             \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        XViewType;                                                           \
+    typedef Kokkos::View<SCALAR*, LAYOUTY,                                   \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        YViewType;                                                           \
+                                                                             \
+    static void gemv(const typename AViewType::execution_space& /* space */, \
+                     const char trans[],                                     \
+                     typename AViewType::const_value_type& alpha,            \
+                     const AViewType& A, const XViewType& X,                 \
+                     typename YViewType::const_value_type& beta,             \
+                     const YViewType& Y) {                                   \
+      Kokkos::Profiling::pushRegion(                                         \
+          "KokkosBlas::gemv[TPL_BLAS,complex<float>]");                      \
+      KOKKOSBLAS2_GEMV_DETERMINE_ARGS(LAYOUTA);                              \
+      const std::complex<float> alpha_val = alpha, beta_val = beta;          \
+      HostBlas<std::complex<float> >::gemv(                                  \
+          transa, M, N, alpha_val,                                           \
+          reinterpret_cast<const std::complex<float>*>(A.data()), LDA,       \
+          reinterpret_cast<const std::complex<float>*>(X.data()), one,       \
+          beta_val, reinterpret_cast<std::complex<float>*>(Y.data()), one);  \
+      Kokkos::Profiling::popRegion();                                        \
+    }                                                                        \
   };
 
 KOKKOSBLAS2_DGEMV_BLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft,
@@ -308,182 +312,201 @@ namespace Impl {
     transa = CUBLAS_OP_C;                                                    \
   }
 
-#define KOKKOSBLAS2_DGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,      \
-                                 ETI_SPEC_AVAIL)                            \
-  template <class ExecSpace>                                                \
-  struct GEMV<                                                              \
-      Kokkos::View<const double**, LAYOUTA,                                 \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                    \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,               \
-      Kokkos::View<const double*, LAYOUTX,                                  \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                    \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,               \
-      Kokkos::View<double*, LAYOUTY, Kokkos::Device<ExecSpace, MEM_SPACE>,  \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,               \
-      true, ETI_SPEC_AVAIL> {                                               \
-    typedef double SCALAR;                                                  \
-    typedef Kokkos::View<const SCALAR**, LAYOUTA,                           \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        AViewType;                                                          \
-    typedef Kokkos::View<const SCALAR*, LAYOUTX,                            \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        XViewType;                                                          \
-    typedef Kokkos::View<SCALAR*, LAYOUTY,                                  \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        YViewType;                                                          \
-                                                                            \
-    static void gemv(const char trans[],                                    \
-                     typename AViewType::const_value_type& alpha,           \
-                     const AViewType& A, const XViewType& X,                \
-                     typename YViewType::const_value_type& beta,            \
-                     const YViewType& Y) {                                  \
-      Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]"); \
-      KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA);                      \
-      KokkosBlas::Impl::CudaBlasSingleton& s =                              \
-          KokkosBlas::Impl::CudaBlasSingleton::singleton();                 \
-      cublasDgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(),  \
-                  one, &beta, Y.data(), one);                               \
-      Kokkos::Profiling::popRegion();                                       \
-    }                                                                       \
+#define KOKKOSBLAS2_DGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,         \
+                                 ETI_SPEC_AVAIL)                               \
+  template <class ExecSpace>                                                   \
+  struct GEMV<                                                                 \
+      Kokkos::View<const double**, LAYOUTA,                                    \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      Kokkos::View<const double*, LAYOUTX,                                     \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      Kokkos::View<double*, LAYOUTY, Kokkos::Device<ExecSpace, MEM_SPACE>,     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      true, ETI_SPEC_AVAIL> {                                                  \
+    typedef double SCALAR;                                                     \
+    typedef Kokkos::View<const SCALAR**, LAYOUTA,                              \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        AViewType;                                                             \
+    typedef Kokkos::View<const SCALAR*, LAYOUTX,                               \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        XViewType;                                                             \
+    typedef Kokkos::View<SCALAR*, LAYOUTY,                                     \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        YViewType;                                                             \
+                                                                               \
+    static void gemv(const typename AViewType::execution_space& space,         \
+                     const char trans[],                                       \
+                     typename AViewType::const_value_type& alpha,              \
+                     const AViewType& A, const XViewType& X,                   \
+                     typename YViewType::const_value_type& beta,               \
+                     const YViewType& Y) {                                     \
+      Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,double]");    \
+      KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA);                         \
+      KokkosBlas::Impl::CudaBlasSingleton& s =                                 \
+          KokkosBlas::Impl::CudaBlasSingleton::singleton();                    \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                            \
+          cublasSetStream(s.handle, space.cuda_stream()));                     \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasDgemv(s.handle, transa, M, N, &alpha, \
+                                               A.data(), LDA, X.data(), one,   \
+                                               &beta, Y.data(), one));         \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL));           \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
   };
 
-#define KOKKOSBLAS2_SGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,     \
-                                 ETI_SPEC_AVAIL)                           \
-  template <class ExecSpace>                                               \
-  struct GEMV<                                                             \
-      Kokkos::View<const float**, LAYOUTA,                                 \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                   \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      Kokkos::View<const float*, LAYOUTX,                                  \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                   \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      Kokkos::View<float*, LAYOUTY, Kokkos::Device<ExecSpace, MEM_SPACE>,  \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      true, ETI_SPEC_AVAIL> {                                              \
-    typedef float SCALAR;                                                  \
-    typedef Kokkos::View<const SCALAR**, LAYOUTA,                          \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        AViewType;                                                         \
-    typedef Kokkos::View<const SCALAR*, LAYOUTX,                           \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        XViewType;                                                         \
-    typedef Kokkos::View<SCALAR*, LAYOUTY,                                 \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        YViewType;                                                         \
-                                                                           \
-    static void gemv(const char trans[],                                   \
-                     typename AViewType::const_value_type& alpha,          \
-                     const AViewType& A, const XViewType& X,               \
-                     typename YViewType::const_value_type& beta,           \
-                     const YViewType& Y) {                                 \
-      Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]"); \
-      KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA);                     \
-      KokkosBlas::Impl::CudaBlasSingleton& s =                             \
-          KokkosBlas::Impl::CudaBlasSingleton::singleton();                \
-      cublasSgemv(s.handle, transa, M, N, &alpha, A.data(), LDA, X.data(), \
-                  one, &beta, Y.data(), one);                              \
-      Kokkos::Profiling::popRegion();                                      \
-    }                                                                      \
+#define KOKKOSBLAS2_SGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,         \
+                                 ETI_SPEC_AVAIL)                               \
+  template <class ExecSpace>                                                   \
+  struct GEMV<                                                                 \
+      Kokkos::View<const float**, LAYOUTA,                                     \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      Kokkos::View<const float*, LAYOUTX,                                      \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      Kokkos::View<float*, LAYOUTY, Kokkos::Device<ExecSpace, MEM_SPACE>,      \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      true, ETI_SPEC_AVAIL> {                                                  \
+    typedef float SCALAR;                                                      \
+    typedef Kokkos::View<const SCALAR**, LAYOUTA,                              \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        AViewType;                                                             \
+    typedef Kokkos::View<const SCALAR*, LAYOUTX,                               \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        XViewType;                                                             \
+    typedef Kokkos::View<SCALAR*, LAYOUTY,                                     \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        YViewType;                                                             \
+                                                                               \
+    static void gemv(const typename AViewType::execution_space& space,         \
+                     const char trans[],                                       \
+                     typename AViewType::const_value_type& alpha,              \
+                     const AViewType& A, const XViewType& X,                   \
+                     typename YViewType::const_value_type& beta,               \
+                     const YViewType& Y) {                                     \
+      Kokkos::Profiling::pushRegion("KokkosBlas::gemv[TPL_CUBLAS,float]");     \
+      KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA);                         \
+      KokkosBlas::Impl::CudaBlasSingleton& s =                                 \
+          KokkosBlas::Impl::CudaBlasSingleton::singleton();                    \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                            \
+          cublasSetStream(s.handle, space.cuda_stream()));                     \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSgemv(s.handle, transa, M, N, &alpha, \
+                                               A.data(), LDA, X.data(), one,   \
+                                               &beta, Y.data(), one));         \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL));           \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
   };
 
-#define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,     \
-                                 ETI_SPEC_AVAIL)                           \
-  template <class ExecSpace>                                               \
-  struct GEMV<Kokkos::View<const Kokkos::complex<double>**, LAYOUTA,       \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,           \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,      \
-              Kokkos::View<const Kokkos::complex<double>*, LAYOUTX,        \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,           \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,      \
-              Kokkos::View<Kokkos::complex<double>*, LAYOUTY,              \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,           \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,      \
-              true, ETI_SPEC_AVAIL> {                                      \
-    typedef Kokkos::complex<double> SCALAR;                                \
-    typedef Kokkos::View<const SCALAR**, LAYOUTA,                          \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        AViewType;                                                         \
-    typedef Kokkos::View<const SCALAR*, LAYOUTX,                           \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        XViewType;                                                         \
-    typedef Kokkos::View<SCALAR*, LAYOUTY,                                 \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        YViewType;                                                         \
-                                                                           \
-    static void gemv(const char trans[],                                   \
-                     typename AViewType::const_value_type& alpha,          \
-                     const AViewType& A, const XViewType& X,               \
-                     typename YViewType::const_value_type& beta,           \
-                     const YViewType& Y) {                                 \
-      Kokkos::Profiling::pushRegion(                                       \
-          "KokkosBlas::gemv[TPL_CUBLAS,complex<double>]");                 \
-      KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA);                     \
-      KokkosBlas::Impl::CudaBlasSingleton& s =                             \
-          KokkosBlas::Impl::CudaBlasSingleton::singleton();                \
-      cublasZgemv(s.handle, transa, M, N,                                  \
-                  reinterpret_cast<const cuDoubleComplex*>(&alpha),        \
-                  reinterpret_cast<const cuDoubleComplex*>(A.data()), LDA, \
-                  reinterpret_cast<const cuDoubleComplex*>(X.data()), one, \
-                  reinterpret_cast<const cuDoubleComplex*>(&beta),         \
-                  reinterpret_cast<cuDoubleComplex*>(Y.data()), one);      \
-      Kokkos::Profiling::popRegion();                                      \
-    }                                                                      \
+#define KOKKOSBLAS2_ZGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,         \
+                                 ETI_SPEC_AVAIL)                               \
+  template <class ExecSpace>                                                   \
+  struct GEMV<Kokkos::View<const Kokkos::complex<double>**, LAYOUTA,           \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+              Kokkos::View<const Kokkos::complex<double>*, LAYOUTX,            \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+              Kokkos::View<Kokkos::complex<double>*, LAYOUTY,                  \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
+              true, ETI_SPEC_AVAIL> {                                          \
+    typedef Kokkos::complex<double> SCALAR;                                    \
+    typedef Kokkos::View<const SCALAR**, LAYOUTA,                              \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        AViewType;                                                             \
+    typedef Kokkos::View<const SCALAR*, LAYOUTX,                               \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        XViewType;                                                             \
+    typedef Kokkos::View<SCALAR*, LAYOUTY,                                     \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        YViewType;                                                             \
+                                                                               \
+    static void gemv(const typename AViewType::execution_space& space,         \
+                     const char trans[],                                       \
+                     typename AViewType::const_value_type& alpha,              \
+                     const AViewType& A, const XViewType& X,                   \
+                     typename YViewType::const_value_type& beta,               \
+                     const YViewType& Y) {                                     \
+      Kokkos::Profiling::pushRegion(                                           \
+          "KokkosBlas::gemv[TPL_CUBLAS,complex<double>]");                     \
+      KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA);                         \
+      KokkosBlas::Impl::CudaBlasSingleton& s =                                 \
+          KokkosBlas::Impl::CudaBlasSingleton::singleton();                    \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                            \
+          cublasSetStream(s.handle, space.cuda_stream()));                     \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                            \
+          cublasZgemv(s.handle, transa, M, N,                                  \
+                      reinterpret_cast<const cuDoubleComplex*>(&alpha),        \
+                      reinterpret_cast<const cuDoubleComplex*>(A.data()), LDA, \
+                      reinterpret_cast<const cuDoubleComplex*>(X.data()), one, \
+                      reinterpret_cast<const cuDoubleComplex*>(&beta),         \
+                      reinterpret_cast<cuDoubleComplex*>(Y.data()), one));     \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL));           \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
   };
 
-#define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE, \
-                                 ETI_SPEC_AVAIL)                       \
-  template <class ExecSpace>                                           \
-  struct GEMV<Kokkos::View<const Kokkos::complex<float>**, LAYOUTA,    \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,       \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,  \
-              Kokkos::View<const Kokkos::complex<float>*, LAYOUTX,     \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,       \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,  \
-              Kokkos::View<Kokkos::complex<float>*, LAYOUTY,           \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,       \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,  \
-              true, ETI_SPEC_AVAIL> {                                  \
-    typedef Kokkos::complex<float> SCALAR;                             \
-    typedef Kokkos::View<const SCALAR**, LAYOUTA,                      \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,         \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >     \
-        AViewType;                                                     \
-    typedef Kokkos::View<const SCALAR*, LAYOUTX,                       \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,         \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >     \
-        XViewType;                                                     \
-    typedef Kokkos::View<SCALAR*, LAYOUTY,                             \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,         \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >     \
-        YViewType;                                                     \
-                                                                       \
-    static void gemv(const char trans[],                               \
-                     typename AViewType::const_value_type& alpha,      \
-                     const AViewType& A, const XViewType& X,           \
-                     typename YViewType::const_value_type& beta,       \
-                     const YViewType& Y) {                             \
-      Kokkos::Profiling::pushRegion(                                   \
-          "KokkosBlas::gemv[TPL_CUBLAS,complex<float>]");              \
-      KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA);                 \
-      KokkosBlas::Impl::CudaBlasSingleton& s =                         \
-          KokkosBlas::Impl::CudaBlasSingleton::singleton();            \
-      cublasCgemv(s.handle, transa, M, N,                              \
-                  reinterpret_cast<const cuComplex*>(&alpha),          \
-                  reinterpret_cast<const cuComplex*>(A.data()), LDA,   \
-                  reinterpret_cast<const cuComplex*>(X.data()), one,   \
-                  reinterpret_cast<const cuComplex*>(&beta),           \
-                  reinterpret_cast<cuComplex*>(Y.data()), one);        \
-      Kokkos::Profiling::popRegion();                                  \
-    }                                                                  \
+#define KOKKOSBLAS2_CGEMV_CUBLAS(LAYOUTA, LAYOUTX, LAYOUTY, MEM_SPACE,        \
+                                 ETI_SPEC_AVAIL)                              \
+  template <class ExecSpace>                                                  \
+  struct GEMV<Kokkos::View<const Kokkos::complex<float>**, LAYOUTA,           \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,              \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+              Kokkos::View<const Kokkos::complex<float>*, LAYOUTX,            \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,              \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+              Kokkos::View<Kokkos::complex<float>*, LAYOUTY,                  \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,              \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,         \
+              true, ETI_SPEC_AVAIL> {                                         \
+    typedef Kokkos::complex<float> SCALAR;                                    \
+    typedef Kokkos::View<const SCALAR**, LAYOUTA,                             \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >            \
+        AViewType;                                                            \
+    typedef Kokkos::View<const SCALAR*, LAYOUTX,                              \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >            \
+        XViewType;                                                            \
+    typedef Kokkos::View<SCALAR*, LAYOUTY,                                    \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >            \
+        YViewType;                                                            \
+                                                                              \
+    static void gemv(const typename AViewType::execution_space& space,        \
+                     const char trans[],                                      \
+                     typename AViewType::const_value_type& alpha,             \
+                     const AViewType& A, const XViewType& X,                  \
+                     typename YViewType::const_value_type& beta,              \
+                     const YViewType& Y) {                                    \
+      Kokkos::Profiling::pushRegion(                                          \
+          "KokkosBlas::gemv[TPL_CUBLAS,complex<float>]");                     \
+      KOKKOSBLAS2_GEMV_CUBLAS_DETERMINE_ARGS(LAYOUTA);                        \
+      KokkosBlas::Impl::CudaBlasSingleton& s =                                \
+          KokkosBlas::Impl::CudaBlasSingleton::singleton();                   \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                           \
+          cublasSetStream(s.handle, space.cuda_stream()));                    \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasCgemv(                               \
+          s.handle, transa, M, N, reinterpret_cast<const cuComplex*>(&alpha), \
+          reinterpret_cast<const cuComplex*>(A.data()), LDA,                  \
+          reinterpret_cast<const cuComplex*>(X.data()), one,                  \
+          reinterpret_cast<const cuComplex*>(&beta),                          \
+          reinterpret_cast<cuComplex*>(Y.data()), one));                      \
+      KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL));          \
+      Kokkos::Profiling::popRegion();                                         \
+    }                                                                         \
   };
 
 KOKKOSBLAS2_DGEMV_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft,
diff --git a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp
index 4b2ae4a45d..be550d3a18 100644
--- a/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp
+++ b/src/impl/tpls/KokkosBlas3_gemm_tpl_spec_decl.hpp
@@ -78,7 +78,8 @@ namespace Impl {
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
         CViewType;                                                             \
                                                                                \
-    static void gemm(const char transA[], const char transB[],                 \
+    static void gemm(const typename CViewType::execution_space& /* space*/,    \
+                     const char transA[], const char transB[],                 \
                      typename AViewType::const_value_type& alpha,              \
                      const AViewType& A, const BViewType& B,                   \
                      typename CViewType::const_value_type& beta,               \
@@ -137,7 +138,8 @@ namespace Impl {
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >            \
         CViewType;                                                            \
                                                                               \
-    static void gemm(const char transA[], const char transB[],                \
+    static void gemm(const typename CViewType::execution_space& /* space*/,   \
+                     const char transA[], const char transB[],                \
                      typename AViewType::const_value_type& alpha,             \
                      const AViewType& A, const BViewType& B,                  \
                      typename CViewType::const_value_type& beta,              \
@@ -196,7 +198,8 @@ namespace Impl {
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
         CViewType;                                                             \
                                                                                \
-    static void gemm(const char transA[], const char transB[],                 \
+    static void gemm(const typename CViewType::execution_space& /* space*/,    \
+                     const char transA[], const char transB[],                 \
                      typename AViewType::const_value_type& alpha,              \
                      const AViewType& A, const BViewType& B,                   \
                      typename CViewType::const_value_type& beta,               \
@@ -263,7 +266,8 @@ namespace Impl {
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >            \
         CViewType;                                                            \
                                                                               \
-    static void gemm(const char transA[], const char transB[],                \
+    static void gemm(const typename CViewType::execution_space& /* space*/,   \
+                     const char transA[], const char transB[],                \
                      typename AViewType::const_value_type& alpha,             \
                      const AViewType& A, const BViewType& B,                  \
                      typename CViewType::const_value_type& beta,              \
@@ -351,192 +355,112 @@ KOKKOSBLAS3_CGEMM_BLAS(Kokkos::LayoutRight, Kokkos::LayoutRight,
 namespace KokkosBlas {
 namespace Impl {
 
-#define KOKKOSBLAS3_DGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE,      \
-                                 ETI_SPEC_AVAIL)                            \
-  template <class ExecSpace>                                                \
-  struct GEMM<                                                              \
-      Kokkos::View<const double**, LAYOUTA,                                 \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                    \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,               \
-      Kokkos::View<const double**, LAYOUTB,                                 \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                    \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,               \
-      Kokkos::View<double**, LAYOUTC, Kokkos::Device<ExecSpace, MEM_SPACE>, \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,               \
-      true, ETI_SPEC_AVAIL> {                                               \
-    typedef double SCALAR;                                                  \
-    typedef Kokkos::View<const SCALAR**, LAYOUTA,                           \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        AViewType;                                                          \
-    typedef Kokkos::View<const SCALAR**, LAYOUTB,                           \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        BViewType;                                                          \
-    typedef Kokkos::View<SCALAR**, LAYOUTC,                                 \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,              \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >          \
-        CViewType;                                                          \
-                                                                            \
-    static void gemm(const char transA[], const char transB[],              \
-                     typename AViewType::const_value_type& alpha,           \
-                     const AViewType& A, const BViewType& B,                \
-                     typename CViewType::const_value_type& beta,            \
-                     const CViewType& C) {                                  \
-      Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,double]");   \
-      const bool A_t = (transA[0] != 'N') && (transA[0] != 'n');            \
-      const int M    = static_cast<int>(C.extent(0));                       \
-      const int N    = static_cast<int>(C.extent(1));                       \
-      const int K    = static_cast<int>(A.extent(A_t ? 0 : 1));             \
-                                                                            \
-      bool A_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTA>::value;     \
-      bool B_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTB>::value;     \
-      bool C_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTC>::value;     \
-                                                                            \
-      const int AST = A_is_lr ? A.stride(0) : A.stride(1),                  \
-                LDA = AST == 0 ? 1 : AST;                                   \
-      const int BST = B_is_lr ? B.stride(0) : B.stride(1),                  \
-                LDB = BST == 0 ? 1 : BST;                                   \
-      const int CST = C_is_lr ? C.stride(0) : C.stride(1),                  \
-                LDC = CST == 0 ? 1 : CST;                                   \
-                                                                            \
-      cublasOperation_t transa, transb;                                     \
-      if ((transA[0] == 'N') || (transA[0] == 'n'))                         \
-        transa = CUBLAS_OP_N;                                               \
-      else if ((transA[0] == 'T') || (transA[0] == 't'))                    \
-        transa = CUBLAS_OP_T;                                               \
-      else                                                                  \
-        transa = CUBLAS_OP_C;                                               \
-      if ((transB[0] == 'N') || (transB[0] == 'n'))                         \
-        transb = CUBLAS_OP_N;                                               \
-      else if ((transB[0] == 'T') || (transB[0] == 't'))                    \
-        transb = CUBLAS_OP_T;                                               \
-      else                                                                  \
-        transb = CUBLAS_OP_C;                                               \
-                                                                            \
-      constexpr int numDotsLayoutLeftThreshold  = 1600;                     \
-      constexpr int numDotsLayoutRightThreshold = 100;                      \
-      if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N &&    \
-           M * N < numDotsLayoutLeftThreshold) ||                           \
-          (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N &&     \
-           M * N < numDotsLayoutRightThreshold)) {                          \
-        DotBasedGEMM<ExecSpace, AViewType, BViewType, CViewType> gemm(      \
-            alpha, A, B, beta, C);                                          \
-        gemm.run(false);                                                    \
-      } else {                                                              \
-        KokkosBlas::Impl::CudaBlasSingleton& s =                            \
-            KokkosBlas::Impl::CudaBlasSingleton::singleton();               \
-        if (!A_is_lr && !B_is_lr && !C_is_lr)                               \
-          cublasDgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(),  \
-                      LDA, B.data(), LDB, &beta, C.data(), LDC);            \
-        if (A_is_lr && B_is_lr && C_is_lr)                                  \
-          cublasDgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(),  \
-                      LDB, A.data(), LDA, &beta, C.data(), LDC);            \
-      }                                                                     \
-      Kokkos::Profiling::popRegion();                                       \
-    }                                                                       \
-  };
-
-#define KOKKOSBLAS3_SGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE,     \
-                                 ETI_SPEC_AVAIL)                           \
-  template <class ExecSpace>                                               \
-  struct GEMM<                                                             \
-      Kokkos::View<const float**, LAYOUTA,                                 \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                   \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      Kokkos::View<const float**, LAYOUTB,                                 \
-                   Kokkos::Device<ExecSpace, MEM_SPACE>,                   \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      Kokkos::View<float**, LAYOUTC, Kokkos::Device<ExecSpace, MEM_SPACE>, \
-                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,              \
-      true, ETI_SPEC_AVAIL> {                                              \
-    typedef float SCALAR;                                                  \
-    typedef Kokkos::View<const SCALAR**, LAYOUTA,                          \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        AViewType;                                                         \
-    typedef Kokkos::View<const SCALAR**, LAYOUTB,                          \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        BViewType;                                                         \
-    typedef Kokkos::View<SCALAR**, LAYOUTC,                                \
-                         Kokkos::Device<ExecSpace, MEM_SPACE>,             \
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >         \
-        CViewType;                                                         \
-                                                                           \
-    static void gemm(const char transA[], const char transB[],             \
-                     typename AViewType::const_value_type& alpha,          \
-                     const AViewType& A, const BViewType& B,               \
-                     typename CViewType::const_value_type& beta,           \
-                     const CViewType& C) {                                 \
-      Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,float]");   \
-      const bool A_t = (transA[0] != 'N') && (transA[0] != 'n');           \
-      const int M    = static_cast<int>(C.extent(0));                      \
-      const int N    = static_cast<int>(C.extent(1));                      \
-      const int K    = static_cast<int>(A.extent(A_t ? 0 : 1));            \
-                                                                           \
-      bool A_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTA>::value;    \
-      bool B_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTB>::value;    \
-      bool C_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTC>::value;    \
-                                                                           \
-      const int AST = A_is_lr ? A.stride(0) : A.stride(1),                 \
-                LDA = AST == 0 ? 1 : AST;                                  \
-      const int BST = B_is_lr ? B.stride(0) : B.stride(1),                 \
-                LDB = BST == 0 ? 1 : BST;                                  \
-      const int CST = C_is_lr ? C.stride(0) : C.stride(1),                 \
-                LDC = CST == 0 ? 1 : CST;                                  \
-                                                                           \
-      cublasOperation_t transa, transb;                                    \
-      if ((transA[0] == 'N') || (transA[0] == 'n'))                        \
-        transa = CUBLAS_OP_N;                                              \
-      else if ((transA[0] == 'T') || (transA[0] == 't'))                   \
-        transa = CUBLAS_OP_T;                                              \
-      else                                                                 \
-        transa = CUBLAS_OP_C;                                              \
-      if ((transB[0] == 'N') || (transB[0] == 'n'))                        \
-        transb = CUBLAS_OP_N;                                              \
-      else if ((transB[0] == 'T') || (transB[0] == 't'))                   \
-        transb = CUBLAS_OP_T;                                              \
-      else                                                                 \
-        transb = CUBLAS_OP_C;                                              \
-                                                                           \
-      constexpr int numDotsLayoutLeftThreshold  = 1600;                    \
-      constexpr int numDotsLayoutRightThreshold = 100;                     \
-      if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N &&   \
-           M * N < numDotsLayoutLeftThreshold) ||                          \
-          (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N &&    \
-           M * N < numDotsLayoutRightThreshold)) {                         \
-        DotBasedGEMM<ExecSpace, AViewType, BViewType, CViewType> gemm(     \
-            alpha, A, B, beta, C);                                         \
-        gemm.run(false);                                                   \
-      } else {                                                             \
-        KokkosBlas::Impl::CudaBlasSingleton& s =                           \
-            KokkosBlas::Impl::CudaBlasSingleton::singleton();              \
-        if (!A_is_lr && !B_is_lr && !C_is_lr)                              \
-          cublasSgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), \
-                      LDA, B.data(), LDB, &beta, C.data(), LDC);           \
-        if (A_is_lr && B_is_lr && C_is_lr)                                 \
-          cublasSgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), \
-                      LDB, A.data(), LDA, &beta, C.data(), LDC);           \
-      }                                                                    \
-      Kokkos::Profiling::popRegion();                                      \
-    }                                                                      \
+#define KOKKOSBLAS3_DGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE,         \
+                                 ETI_SPEC_AVAIL)                               \
+  template <class ExecSpace>                                                   \
+  struct GEMM<                                                                 \
+      Kokkos::View<const double**, LAYOUTA,                                    \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      Kokkos::View<const double**, LAYOUTB,                                    \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      Kokkos::View<double**, LAYOUTC, Kokkos::Device<ExecSpace, MEM_SPACE>,    \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      true, ETI_SPEC_AVAIL> {                                                  \
+    typedef double SCALAR;                                                     \
+    typedef Kokkos::View<const SCALAR**, LAYOUTA,                              \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        AViewType;                                                             \
+    typedef Kokkos::View<const SCALAR**, LAYOUTB,                              \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        BViewType;                                                             \
+    typedef Kokkos::View<SCALAR**, LAYOUTC,                                    \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
+        CViewType;                                                             \
+                                                                               \
+    static void gemm(const typename CViewType::execution_space& space,         \
+                     const char transA[], const char transB[],                 \
+                     typename AViewType::const_value_type& alpha,              \
+                     const AViewType& A, const BViewType& B,                   \
+                     typename CViewType::const_value_type& beta,               \
+                     const CViewType& C) {                                     \
+      Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,double]");      \
+      const bool A_t = (transA[0] != 'N') && (transA[0] != 'n');               \
+      const int M    = static_cast<int>(C.extent(0));                          \
+      const int N    = static_cast<int>(C.extent(1));                          \
+      const int K    = static_cast<int>(A.extent(A_t ? 0 : 1));                \
+                                                                               \
+      bool A_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTA>::value;        \
+      bool B_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTB>::value;        \
+      bool C_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTC>::value;        \
+                                                                               \
+      const int AST = A_is_lr ? A.stride(0) : A.stride(1),                     \
+                LDA = AST == 0 ? 1 : AST;                                      \
+      const int BST = B_is_lr ? B.stride(0) : B.stride(1),                     \
+                LDB = BST == 0 ? 1 : BST;                                      \
+      const int CST = C_is_lr ? C.stride(0) : C.stride(1),                     \
+                LDC = CST == 0 ? 1 : CST;                                      \
+                                                                               \
+      cublasOperation_t transa, transb;                                        \
+      if ((transA[0] == 'N') || (transA[0] == 'n'))                            \
+        transa = CUBLAS_OP_N;                                                  \
+      else if ((transA[0] == 'T') || (transA[0] == 't'))                       \
+        transa = CUBLAS_OP_T;                                                  \
+      else                                                                     \
+        transa = CUBLAS_OP_C;                                                  \
+      if ((transB[0] == 'N') || (transB[0] == 'n'))                            \
+        transb = CUBLAS_OP_N;                                                  \
+      else if ((transB[0] == 'T') || (transB[0] == 't'))                       \
+        transb = CUBLAS_OP_T;                                                  \
+      else                                                                     \
+        transb = CUBLAS_OP_C;                                                  \
+                                                                               \
+      constexpr int numDotsLayoutLeftThreshold  = 1600;                        \
+      constexpr int numDotsLayoutRightThreshold = 100;                         \
+      if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N &&       \
+           M * N < numDotsLayoutLeftThreshold) ||                              \
+          (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N &&        \
+           M * N < numDotsLayoutRightThreshold)) {                             \
+        DotBasedGEMM<ExecSpace, AViewType, BViewType, CViewType> gemm(         \
+            alpha, A, B, beta, C);                                             \
+        gemm.run(space, false);                                                \
+      } else {                                                                 \
+        KokkosBlas::Impl::CudaBlasSingleton& s =                               \
+            KokkosBlas::Impl::CudaBlasSingleton::singleton();                  \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                          \
+            cublasSetStream(s.handle, space.cuda_stream()));                   \
+        if (!A_is_lr && !B_is_lr && !C_is_lr)                                  \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                        \
+              cublasDgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), \
+                          LDA, B.data(), LDB, &beta, C.data(), LDC));          \
+        if (A_is_lr && B_is_lr && C_is_lr)                                     \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                        \
+              cublasDgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), \
+                          LDB, A.data(), LDA, &beta, C.data(), LDC));          \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL));         \
+      }                                                                        \
+      Kokkos::Profiling::popRegion();                                          \
+    }                                                                          \
   };
 
-#define KOKKOSBLAS3_ZGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE,         \
+#define KOKKOSBLAS3_SGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE,         \
                                  ETI_SPEC_AVAIL)                               \
   template <class ExecSpace>                                                   \
-  struct GEMM<Kokkos::View<const Kokkos::complex<double>**, LAYOUTA,           \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,               \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
-              Kokkos::View<const Kokkos::complex<double>**, LAYOUTB,           \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,               \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
-              Kokkos::View<Kokkos::complex<double>**, LAYOUTC,                 \
-                           Kokkos::Device<ExecSpace, MEM_SPACE>,               \
-                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,          \
-              true, ETI_SPEC_AVAIL> {                                          \
-    typedef Kokkos::complex<double> SCALAR;                                    \
+  struct GEMM<                                                                 \
+      Kokkos::View<const float**, LAYOUTA,                                     \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      Kokkos::View<const float**, LAYOUTB,                                     \
+                   Kokkos::Device<ExecSpace, MEM_SPACE>,                       \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      Kokkos::View<float**, LAYOUTC, Kokkos::Device<ExecSpace, MEM_SPACE>,     \
+                   Kokkos::MemoryTraits<Kokkos::Unmanaged> >,                  \
+      true, ETI_SPEC_AVAIL> {                                                  \
+    typedef float SCALAR;                                                      \
     typedef Kokkos::View<const SCALAR**, LAYOUTA,                              \
                          Kokkos::Device<ExecSpace, MEM_SPACE>,                 \
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
@@ -550,13 +474,13 @@ namespace Impl {
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >             \
         CViewType;                                                             \
                                                                                \
-    static void gemm(const char transA[], const char transB[],                 \
+    static void gemm(const typename CViewType::execution_space& space,         \
+                     const char transA[], const char transB[],                 \
                      typename AViewType::const_value_type& alpha,              \
                      const AViewType& A, const BViewType& B,                   \
                      typename CViewType::const_value_type& beta,               \
                      const CViewType& C) {                                     \
-      Kokkos::Profiling::pushRegion(                                           \
-          "KokkosBlas::gemm[TPL_BLAS,complex<double>]");                       \
+      Kokkos::Profiling::pushRegion("KokkosBlas::gemm[TPL_BLAS,float]");       \
       const bool A_t = (transA[0] != 'N') && (transA[0] != 'n');               \
       const int M    = static_cast<int>(C.extent(0));                          \
       const int N    = static_cast<int>(C.extent(1));                          \
@@ -595,43 +519,40 @@ namespace Impl {
            M * N < numDotsLayoutRightThreshold)) {                             \
         DotBasedGEMM<ExecSpace, AViewType, BViewType, CViewType> gemm(         \
             alpha, A, B, beta, C);                                             \
-        gemm.run(transa == CUBLAS_OP_C ? true : false);                        \
+        gemm.run(space, false);                                                \
       } else {                                                                 \
         KokkosBlas::Impl::CudaBlasSingleton& s =                               \
             KokkosBlas::Impl::CudaBlasSingleton::singleton();                  \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                          \
+            cublasSetStream(s.handle, space.cuda_stream()));                   \
         if (!A_is_lr && !B_is_lr && !C_is_lr)                                  \
-          cublasZgemm(s.handle, transa, transb, M, N, K,                       \
-                      reinterpret_cast<const cuDoubleComplex*>(&alpha),        \
-                      reinterpret_cast<const cuDoubleComplex*>(A.data()), LDA, \
-                      reinterpret_cast<const cuDoubleComplex*>(B.data()), LDB, \
-                      reinterpret_cast<const cuDoubleComplex*>(&beta),         \
-                      reinterpret_cast<cuDoubleComplex*>(C.data()), LDC);      \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                        \
+              cublasSgemm(s.handle, transa, transb, M, N, K, &alpha, A.data(), \
+                          LDA, B.data(), LDB, &beta, C.data(), LDC));          \
         if (A_is_lr && B_is_lr && C_is_lr)                                     \
-          cublasZgemm(s.handle, transb, transa, N, M, K,                       \
-                      reinterpret_cast<const cuDoubleComplex*>(&alpha),        \
-                      reinterpret_cast<const cuDoubleComplex*>(B.data()), LDB, \
-                      reinterpret_cast<const cuDoubleComplex*>(A.data()), LDA, \
-                      reinterpret_cast<const cuDoubleComplex*>(&beta),         \
-                      reinterpret_cast<cuDoubleComplex*>(C.data()), LDC);      \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                        \
+              cublasSgemm(s.handle, transb, transa, N, M, K, &alpha, B.data(), \
+                          LDB, A.data(), LDA, &beta, C.data(), LDC));          \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL));         \
       }                                                                        \
       Kokkos::Profiling::popRegion();                                          \
     }                                                                          \
   };
 
-#define KOKKOSBLAS3_CGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE,   \
+#define KOKKOSBLAS3_ZGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE,   \
                                  ETI_SPEC_AVAIL)                         \
   template <class ExecSpace>                                             \
-  struct GEMM<Kokkos::View<const Kokkos::complex<float>**, LAYOUTA,      \
+  struct GEMM<Kokkos::View<const Kokkos::complex<double>**, LAYOUTA,     \
                            Kokkos::Device<ExecSpace, MEM_SPACE>,         \
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >,    \
-              Kokkos::View<const Kokkos::complex<float>**, LAYOUTB,      \
+              Kokkos::View<const Kokkos::complex<double>**, LAYOUTB,     \
                            Kokkos::Device<ExecSpace, MEM_SPACE>,         \
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >,    \
-              Kokkos::View<Kokkos::complex<float>**, LAYOUTC,            \
+              Kokkos::View<Kokkos::complex<double>**, LAYOUTC,           \
                            Kokkos::Device<ExecSpace, MEM_SPACE>,         \
                            Kokkos::MemoryTraits<Kokkos::Unmanaged> >,    \
               true, ETI_SPEC_AVAIL> {                                    \
-    typedef Kokkos::complex<float> SCALAR;                               \
+    typedef Kokkos::complex<double> SCALAR;                              \
     typedef Kokkos::View<const SCALAR**, LAYOUTA,                        \
                          Kokkos::Device<ExecSpace, MEM_SPACE>,           \
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >       \
@@ -645,13 +566,14 @@ namespace Impl {
                          Kokkos::MemoryTraits<Kokkos::Unmanaged> >       \
         CViewType;                                                       \
                                                                          \
-    static void gemm(const char transA[], const char transB[],           \
+    static void gemm(const typename CViewType::execution_space& space,   \
+                     const char transA[], const char transB[],           \
                      typename AViewType::const_value_type& alpha,        \
                      const AViewType& A, const BViewType& B,             \
                      typename CViewType::const_value_type& beta,         \
                      const CViewType& C) {                               \
       Kokkos::Profiling::pushRegion(                                     \
-          "KokkosBlas::gemm[TPL_BLAS,complex<float>]");                  \
+          "KokkosBlas::gemm[TPL_BLAS,complex<double>]");                 \
       const bool A_t = (transA[0] != 'N') && (transA[0] != 'n');         \
       const int M    = static_cast<int>(C.extent(0));                    \
       const int N    = static_cast<int>(C.extent(1));                    \
@@ -690,29 +612,135 @@ namespace Impl {
            M * N < numDotsLayoutRightThreshold)) {                       \
         DotBasedGEMM<ExecSpace, AViewType, BViewType, CViewType> gemm(   \
             alpha, A, B, beta, C);                                       \
-        gemm.run(transa == CUBLAS_OP_C ? true : false);                  \
+        gemm.run(space, transa == CUBLAS_OP_C ? true : false);           \
       } else {                                                           \
         KokkosBlas::Impl::CudaBlasSingleton& s =                         \
             KokkosBlas::Impl::CudaBlasSingleton::singleton();            \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                    \
+            cublasSetStream(s.handle, space.cuda_stream()));             \
         if (!A_is_lr && !B_is_lr && !C_is_lr)                            \
-          cublasCgemm(s.handle, transa, transb, M, N, K,                 \
-                      reinterpret_cast<const cuComplex*>(&alpha),        \
-                      reinterpret_cast<const cuComplex*>(A.data()), LDA, \
-                      reinterpret_cast<const cuComplex*>(B.data()), LDB, \
-                      reinterpret_cast<const cuComplex*>(&beta),         \
-                      reinterpret_cast<cuComplex*>(C.data()), LDC);      \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgemm(                      \
+              s.handle, transa, transb, M, N, K,                         \
+              reinterpret_cast<const cuDoubleComplex*>(&alpha),          \
+              reinterpret_cast<const cuDoubleComplex*>(A.data()), LDA,   \
+              reinterpret_cast<const cuDoubleComplex*>(B.data()), LDB,   \
+              reinterpret_cast<const cuDoubleComplex*>(&beta),           \
+              reinterpret_cast<cuDoubleComplex*>(C.data()), LDC));       \
         if (A_is_lr && B_is_lr && C_is_lr)                               \
-          cublasCgemm(s.handle, transb, transa, N, M, K,                 \
-                      reinterpret_cast<const cuComplex*>(&alpha),        \
-                      reinterpret_cast<const cuComplex*>(B.data()), LDB, \
-                      reinterpret_cast<const cuComplex*>(A.data()), LDA, \
-                      reinterpret_cast<const cuComplex*>(&beta),         \
-                      reinterpret_cast<cuComplex*>(C.data()), LDC);      \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasZgemm(                      \
+              s.handle, transb, transa, N, M, K,                         \
+              reinterpret_cast<const cuDoubleComplex*>(&alpha),          \
+              reinterpret_cast<const cuDoubleComplex*>(B.data()), LDB,   \
+              reinterpret_cast<const cuDoubleComplex*>(A.data()), LDA,   \
+              reinterpret_cast<const cuDoubleComplex*>(&beta),           \
+              reinterpret_cast<cuDoubleComplex*>(C.data()), LDC));       \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL));   \
       }                                                                  \
       Kokkos::Profiling::popRegion();                                    \
     }                                                                    \
   };
 
+#define KOKKOSBLAS3_CGEMM_CUBLAS(LAYOUTA, LAYOUTB, LAYOUTC, MEM_SPACE,       \
+                                 ETI_SPEC_AVAIL)                             \
+  template <class ExecSpace>                                                 \
+  struct GEMM<Kokkos::View<const Kokkos::complex<float>**, LAYOUTA,          \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,             \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,        \
+              Kokkos::View<const Kokkos::complex<float>**, LAYOUTB,          \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,             \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,        \
+              Kokkos::View<Kokkos::complex<float>**, LAYOUTC,                \
+                           Kokkos::Device<ExecSpace, MEM_SPACE>,             \
+                           Kokkos::MemoryTraits<Kokkos::Unmanaged> >,        \
+              true, ETI_SPEC_AVAIL> {                                        \
+    typedef Kokkos::complex<float> SCALAR;                                   \
+    typedef Kokkos::View<const SCALAR**, LAYOUTA,                            \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        AViewType;                                                           \
+    typedef Kokkos::View<const SCALAR**, LAYOUTB,                            \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        BViewType;                                                           \
+    typedef Kokkos::View<SCALAR**, LAYOUTC,                                  \
+                         Kokkos::Device<ExecSpace, MEM_SPACE>,               \
+                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >           \
+        CViewType;                                                           \
+                                                                             \
+    static void gemm(const typename CViewType::execution_space& space,       \
+                     const char transA[], const char transB[],               \
+                     typename AViewType::const_value_type& alpha,            \
+                     const AViewType& A, const BViewType& B,                 \
+                     typename CViewType::const_value_type& beta,             \
+                     const CViewType& C) {                                   \
+      Kokkos::Profiling::pushRegion(                                         \
+          "KokkosBlas::gemm[TPL_BLAS,complex<float>]");                      \
+      const bool A_t = (transA[0] != 'N') && (transA[0] != 'n');             \
+      const int M    = static_cast<int>(C.extent(0));                        \
+      const int N    = static_cast<int>(C.extent(1));                        \
+      const int K    = static_cast<int>(A.extent(A_t ? 0 : 1));              \
+                                                                             \
+      bool A_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTA>::value;      \
+      bool B_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTB>::value;      \
+      bool C_is_lr = std::is_same<Kokkos::LayoutRight, LAYOUTC>::value;      \
+                                                                             \
+      const int AST = A_is_lr ? A.stride(0) : A.stride(1),                   \
+                LDA = AST == 0 ? 1 : AST;                                    \
+      const int BST = B_is_lr ? B.stride(0) : B.stride(1),                   \
+                LDB = BST == 0 ? 1 : BST;                                    \
+      const int CST = C_is_lr ? C.stride(0) : C.stride(1),                   \
+                LDC = CST == 0 ? 1 : CST;                                    \
+                                                                             \
+      cublasOperation_t transa, transb;                                      \
+      if ((transA[0] == 'N') || (transA[0] == 'n'))                          \
+        transa = CUBLAS_OP_N;                                                \
+      else if ((transA[0] == 'T') || (transA[0] == 't'))                     \
+        transa = CUBLAS_OP_T;                                                \
+      else                                                                   \
+        transa = CUBLAS_OP_C;                                                \
+      if ((transB[0] == 'N') || (transB[0] == 'n'))                          \
+        transb = CUBLAS_OP_N;                                                \
+      else if ((transB[0] == 'T') || (transB[0] == 't'))                     \
+        transb = CUBLAS_OP_T;                                                \
+      else                                                                   \
+        transb = CUBLAS_OP_C;                                                \
+                                                                             \
+      constexpr int numDotsLayoutLeftThreshold  = 1600;                      \
+      constexpr int numDotsLayoutRightThreshold = 100;                       \
+      if ((!A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N &&     \
+           M * N < numDotsLayoutLeftThreshold) ||                            \
+          (A_is_lr && transa != CUBLAS_OP_N && transb == CUBLAS_OP_N &&      \
+           M * N < numDotsLayoutRightThreshold)) {                           \
+        DotBasedGEMM<ExecSpace, AViewType, BViewType, CViewType> gemm(       \
+            alpha, A, B, beta, C);                                           \
+        gemm.run(space, transa == CUBLAS_OP_C ? true : false);               \
+      } else {                                                               \
+        KokkosBlas::Impl::CudaBlasSingleton& s =                             \
+            KokkosBlas::Impl::CudaBlasSingleton::singleton();                \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                        \
+            cublasSetStream(s.handle, space.cuda_stream()));                 \
+        if (!A_is_lr && !B_is_lr && !C_is_lr)                                \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                      \
+              cublasCgemm(s.handle, transa, transb, M, N, K,                 \
+                          reinterpret_cast<const cuComplex*>(&alpha),        \
+                          reinterpret_cast<const cuComplex*>(A.data()), LDA, \
+                          reinterpret_cast<const cuComplex*>(B.data()), LDB, \
+                          reinterpret_cast<const cuComplex*>(&beta),         \
+                          reinterpret_cast<cuComplex*>(C.data()), LDC));     \
+        if (A_is_lr && B_is_lr && C_is_lr)                                   \
+          KOKKOS_CUBLAS_SAFE_CALL_IMPL(                                      \
+              cublasCgemm(s.handle, transb, transa, N, M, K,                 \
+                          reinterpret_cast<const cuComplex*>(&alpha),        \
+                          reinterpret_cast<const cuComplex*>(B.data()), LDB, \
+                          reinterpret_cast<const cuComplex*>(A.data()), LDA, \
+                          reinterpret_cast<const cuComplex*>(&beta),         \
+                          reinterpret_cast<cuComplex*>(C.data()), LDC));     \
+        KOKKOS_CUBLAS_SAFE_CALL_IMPL(cublasSetStream(s.handle, NULL));       \
+      }                                                                      \
+      Kokkos::Profiling::popRegion();                                        \
+    }                                                                        \
+  };
+
 KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft,
                          Kokkos::LayoutLeft, Kokkos::CudaSpace, true)
 KOKKOSBLAS3_DGEMM_CUBLAS(Kokkos::LayoutLeft, Kokkos::LayoutLeft,
diff --git a/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp b/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp
index 256b5a17a3..9e0bff4c55 100644
--- a/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp
+++ b/src/impl/tpls/KokkosBlas_Cuda_tpl.hpp
@@ -22,7 +22,7 @@ CudaBlasSingleton& CudaBlasSingleton::singleton() {
 
 }  // namespace Impl
 }  // namespace KokkosBlas
-#endif
+#endif  // defined (KOKKOSKERNELS_ENABLE_TPL_CUBLAS)
 
 #if defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)
 #include <KokkosBlas_tpl_spec.hpp>
@@ -44,6 +44,6 @@ MagmaSingleton& MagmaSingleton::singleton() {
 
 }  // namespace Impl
 }  // namespace KokkosBlas
-#endif
+#endif  // defined(KOKKOSKERNELS_ENABLE_TPL_MAGMA)
 
-#endif
+#endif  // KOKKOSBLAS_CUDA_TPL_HPP_
diff --git a/src/impl/tpls/KokkosBlas_tpl_spec.hpp b/src/impl/tpls/KokkosBlas_tpl_spec.hpp
index bcb1344279..f4ab92c3a6 100644
--- a/src/impl/tpls/KokkosBlas_tpl_spec.hpp
+++ b/src/impl/tpls/KokkosBlas_tpl_spec.hpp
@@ -60,6 +60,66 @@ struct CudaBlasSingleton {
   static CudaBlasSingleton& singleton();
 };
 
+inline void cublas_internal_error_throw(cublasStatus_t cublasState,
+                                        const char* name, const char* file,
+                                        const int line) {
+  std::ostringstream out;
+  // out << name << " error( " << cublasGetStatusName(cublasState)
+  //     << "): " << cublasGetStatusString(cublasState);
+  out << name << " error( ";
+  switch (cublasState) {
+    case CUBLAS_STATUS_NOT_INITIALIZED:
+      out << "CUBLAS_STATUS_NOT_INITIALIZED): the library was not initialized.";
+      break;
+    case CUBLAS_STATUS_ALLOC_FAILED:
+      out << "CUBLAS_STATUS_ALLOC_FAILED): the resource allocation failed.";
+      break;
+    case CUBLAS_STATUS_INVALID_VALUE:
+      out << "CUBLAS_STATUS_INVALID_VALUE): an invalid numerical value was "
+             "used as an argument.";
+      break;
+    case CUBLAS_STATUS_ARCH_MISMATCH:
+      out << "CUBLAS_STATUS_ARCH_MISMATCH): an absent device architectural "
+             "feature is required.";
+      break;
+    case CUBLAS_STATUS_MAPPING_ERROR:
+      out << "CUBLAS_STATUS_MAPPING_ERROR): an access to GPU memory space "
+             "failed.";
+      break;
+    case CUBLAS_STATUS_EXECUTION_FAILED:
+      out << "CUBLAS_STATUS_EXECUTION_FAILED): the GPU program failed to "
+             "execute.";
+      break;
+    case CUBLAS_STATUS_INTERNAL_ERROR:
+      out << "CUBLAS_STATUS_INTERNAL_ERROR): an internal operation failed.";
+      break;
+    case CUBLAS_STATUS_NOT_SUPPORTED:
+      out << "CUBLAS_STATUS_NOT_SUPPORTED): the feature required is not "
+             "supported.";
+      break;
+    default: out << "unrecognized error code): this is bad!"; break;
+  }
+  if (file) {
+    out << " " << file << ":" << line;
+  }
+  throw std::runtime_error(out.str());
+}
+
+inline void cublas_internal_safe_call(cublasStatus_t cublasState,
+                                      const char* name,
+                                      const char* file = nullptr,
+                                      const int line   = 0) {
+  if (CUBLAS_STATUS_SUCCESS != cublasState) {
+    cublas_internal_error_throw(cublasState, name, file, line);
+  }
+}
+
+// The macro below defines the interface for the safe cublas calls.
+// The functions themselves are protected by impl namespace and this
+// is not meant to be used by external application or libraries.
+#define KOKKOS_CUBLAS_SAFE_CALL_IMPL(call) \
+  KokkosBlas::Impl::cublas_internal_safe_call(call, #call, __FILE__, __LINE__)
+
 }  // namespace Impl
 }  // namespace KokkosBlas
 #endif  // KOKKOSKERNELS_ENABLE_TPL_CUBLAS
diff --git a/unit_test/blas/Test_Blas3_gemm.hpp b/unit_test/blas/Test_Blas3_gemm.hpp
index 70c7dd35e1..8ee7bd1915 100644
--- a/unit_test/blas/Test_Blas3_gemm.hpp
+++ b/unit_test/blas/Test_Blas3_gemm.hpp
@@ -59,6 +59,62 @@ struct gemm_VanillaGEMM {
   }
 };
 
+template <class ViewTypeA, class ViewTypeB, class ViewTypeC>
+void build_matrices(const int M, const int N, const int K,
+                    const typename ViewTypeA::value_type alpha, ViewTypeA& A,
+                    ViewTypeB& B, const typename ViewTypeA::value_type beta,
+                    ViewTypeC& C, ViewTypeC& Cref) {
+  using execution_space = TestExecSpace;
+  using ScalarA         = typename ViewTypeA::non_const_value_type;
+  using ScalarB         = typename ViewTypeB::non_const_value_type;
+  using ScalarC         = typename ViewTypeC::non_const_value_type;
+
+  A    = ViewTypeA("A", M, K);
+  B    = ViewTypeB("B", K, N);
+  C    = ViewTypeC("C", M, N);
+  Cref = ViewTypeC("Cref", M, N);
+
+  // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in
+  // Trilinos issue #6418
+  const uint64_t seed = Kokkos::Impl::clock_tic();
+  Kokkos::Random_XorShift64_Pool<execution_space> rand_pool(seed);
+  Kokkos::fill_random(A, rand_pool,
+                      Kokkos::rand<typename Kokkos::Random_XorShift64_Pool<
+                                       execution_space>::generator_type,
+                                   ScalarA>::max());
+  Kokkos::fill_random(B, rand_pool,
+                      Kokkos::rand<typename Kokkos::Random_XorShift64_Pool<
+                                       execution_space>::generator_type,
+                                   ScalarB>::max());
+  Kokkos::fill_random(C, rand_pool,
+                      Kokkos::rand<typename Kokkos::Random_XorShift64_Pool<
+                                       execution_space>::generator_type,
+                                   ScalarC>::max());
+
+  Kokkos::deep_copy(Cref, C);
+  Kokkos::fence();
+
+  struct Test::gemm_VanillaGEMM<ViewTypeA, ViewTypeB, ViewTypeC,
+                                execution_space>
+      vgemm;
+  vgemm.A_t   = false;
+  vgemm.B_t   = false;
+  vgemm.A_c   = false;
+  vgemm.B_c   = false;
+  vgemm.N     = N;
+  vgemm.K     = K;
+  vgemm.A     = A;
+  vgemm.B     = B;
+  vgemm.C     = Cref;
+  vgemm.alpha = alpha;
+  vgemm.beta  = beta;
+
+  Kokkos::parallel_for("KokkosBlas::Test::gemm_VanillaGEMM",
+                       Kokkos::TeamPolicy<execution_space>(M, Kokkos::AUTO, 16),
+                       vgemm);
+  Kokkos::fence();
+}
+
 template <class ViewTypeC, class ExecutionSpace>
 struct DiffGEMM {
   int N;
@@ -110,7 +166,7 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K,
   ViewTypeC C("C", M, N);
   ViewTypeC C2("C", M, N);
 
-  uint64_t seed = Kokkos::Impl::clock_tic();
+  const uint64_t seed = Kokkos::Impl::clock_tic();
   Kokkos::Random_XorShift64_Pool<execution_space> rand_pool(seed);
 
   // (SA 11 Dec 2019) Max (previously: 10) increased to detect the bug in
@@ -176,6 +232,79 @@ void impl_test_gemm(const char* TA, const char* TB, int M, int N, int K,
     EXPECT_TRUE((diff_C_average < 1.05 * diff_C_expected));
   }
 }
+
+template <typename Scalar, typename Layout>
+void impl_test_stream_gemm(const int M, const int N, const int K,
+                           const Scalar alpha, const Scalar beta) {
+  using execution_space = TestExecSpace;
+  using ViewTypeA       = Kokkos::View<Scalar**, Layout, TestExecSpace>;
+  using ViewTypeB       = Kokkos::View<Scalar**, Layout, TestExecSpace>;
+  using ViewTypeC       = Kokkos::View<Scalar**, Layout, TestExecSpace>;
+  using ScalarC         = typename ViewTypeC::value_type;
+  using APT             = Kokkos::Details::ArithTraits<ScalarC>;
+  using mag_type        = typename APT::mag_type;
+
+  const char tA[]          = {"N"};
+  const char tB[]          = {"N"};
+  const double machine_eps = APT::epsilon();
+
+  ViewTypeA A1, A2;
+  ViewTypeB B1, B2;
+  ViewTypeC C1, C1ref, C2, C2ref;
+
+  Test::build_matrices(M, N, K, alpha, A1, B1, beta, C1, C1ref);
+  Test::build_matrices(N, M, K, alpha, A2, B2, beta, C2, C2ref);
+
+  auto instances =
+      Kokkos::Experimental::partition_space(execution_space(), 1, 1);
+  KokkosBlas::gemm(instances[0], tA, tB, alpha, A1, B1, beta, C1);
+  KokkosBlas::gemm(instances[1], tA, tB, alpha, A2, B2, beta, C2);
+  Kokkos::fence();
+
+  mag_type diff_C1 = 0;
+  struct Test::DiffGEMM<ViewTypeC, execution_space> diffgemm1;
+  diffgemm1.N  = N;
+  diffgemm1.C  = C1;
+  diffgemm1.C2 = C1ref;
+
+  Kokkos::parallel_reduce(
+      "KokkosBlas::Test::DiffGEMM1",
+      Kokkos::TeamPolicy<execution_space>(M, Kokkos::AUTO, 16), diffgemm1,
+      diff_C1);
+
+  mag_type diff_C2 = 0;
+  struct Test::DiffGEMM<ViewTypeC, execution_space> diffgemm2;
+  diffgemm2.N  = M;
+  diffgemm2.C  = C2;
+  diffgemm2.C2 = C2ref;
+
+  Kokkos::parallel_reduce(
+      "KokkosBlas::Test::DiffGEMM2",
+      Kokkos::TeamPolicy<execution_space>(N, Kokkos::AUTO, 16), diffgemm2,
+      diff_C2);
+  Kokkos::fence();
+
+  if (N != 0 && M != 0) {
+    int K_eff = (K == 0) ? 1 : K;
+    // Expected Result: Random Walk in the least significant bit (i.e. ~
+    // sqrt(K)*eps eps scales with the total sum and has a factor in it for the
+    // accuracy of the operations -> eps = K * 75 * machine_eps * 7
+    const double diff_C_expected =
+        1.0 * sqrt(K_eff) * K_eff * 75 * machine_eps * 7;
+
+    const double diff_C1_average = diff_C1 / (N * M);
+    if ((diff_C1_average >= 1.05 * diff_C_expected)) {
+      printf("Result: %e %e\n", diff_C1_average, diff_C_expected);
+    }
+    EXPECT_TRUE((diff_C1_average < 1.05 * diff_C_expected));
+
+    const double diff_C2_average = diff_C2 / (N * M);
+    if ((diff_C2_average >= 1.05 * diff_C_expected)) {
+      printf("Result: %e %e\n", diff_C2_average, diff_C_expected);
+    }
+    EXPECT_TRUE((diff_C2_average < 1.05 * diff_C_expected));
+  }
+}
 }  // namespace Test
 
 template <typename Scalar, typename Layout>
@@ -215,6 +344,12 @@ void test_gemm() {
       }
     }
   }
+  Test::impl_test_stream_gemm<Scalar, Layout>(53, 42, 17, 4.5,
+                                              3.0);  // General code path
+  Test::impl_test_stream_gemm<Scalar, Layout>(
+      13, 1, 17, 4.5, 3.0);  // gemv based gemm code path
+  Test::impl_test_stream_gemm<Scalar, Layout>(7, 13, 17, 4.5,
+                                              3.0);  // dot based gemm code path
 }
 
 template <typename Scalar>