From b01db1e7324b052766420b3692bfe39b2e28db11 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 15 Oct 2021 16:10:35 -0600 Subject: [PATCH 01/18] batched/dense: Rework BatchedDblBufGemm - Rework handling of partial rows/cols. Rather than increasing the league_size, use 1 more thread per team for each REG_M/REG_N overstep of tile_m/tile_n. --- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 7 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 127 ++++++++++++------ 2 files changed, 86 insertions(+), 48 deletions(-) diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index 0f5afcc6aa..f3c5ba5b5a 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -463,10 +463,9 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { // // TODO: invoke TeamShmem // } else - if (on_gpu && - ((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || (c_m >= 45 && c_m <= 64))) { + if (on_gpu && ((std::is_same::value) + ? (c_m >= 16) + : (c_m >= 24))) { handle->teamSz = handle->vecLen = 8; constexpr int tile_m = 32, tile_n = 32, tile_k = 8; if (c_m % 32 == 0) // No bounds checking diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index f8fc55f5b2..ae3a1b61a2 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -130,10 +130,15 @@ class BatchedDblBufGemm { } // Each team solves a single tile. Within each tile, the team solves - // all __n_tile_k_tiles one at a time. + // all n_sub_tiles, one at a time. size_t league_size = __c_batch_size * functor.get_n_sub_tiles(); - int team_size = stride_m; - int vector_len = stride_n; + // TODO: determine max_team_size and max_vector_len here instead of using 32 + // and 16 + int team_size = + std::min(stride_m + functor.n_extra_threads(), (unsigned)32); + // TODO: why are 2x vector lanes needed rather than just 1 more? + int vector_len = + std::min(stride_n * (functor.n_extra_vlanes() + 1), (unsigned)16); const int max_team_size = policy_type(league_size, Kokkos::AUTO, vector_len) @@ -165,14 +170,18 @@ class BatchedDblBufGemm { << " team_size:" << team_size << std::endl << "max_vector_len:" << max_vector_len << " vector_len:" << vector_len << std::endl + << " league_size:" << league_size << std::endl << "TILE_M:" << TILE_M << std::endl << "TILE_N:" << TILE_N << std::endl << "TILE_K:" << TILE_K << std::endl; } - // TODO: Use statically allocated shmem - int shmem_size = view_type_2d_scratch::shmem_size(TILE_M, TILE_K) + - view_type_2d_scratch::shmem_size(TILE_K, TILE_N); + // NOTE: All but shmem_size args but partial tile sizes can be determined at + // compile time. 
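// A worked example of the sizing above and below (illustrative values only,
// not fixed by this patch): with the 32x32x8 tiles chosen in
// KokkosBatched_Gemm_Decl.hpp, REG_M = REG_N = 4, stride_m = stride_n = 8,
// and a 42 x 42 C block:
//   partial_tile_m  = REG_M * ((42 - 32) / REG_M + 1)   = 12
//   n_extra_threads = partial_tile_m / REG_M            = 3
//   team_size       = min(stride_m + 3, 32)             = 11
//   vector_len      = min(stride_n * (3 + 1), 16)       = 16
//   svA scratch     = (TILE_M + 12) x TILE_K = 44 x 8 elements
//   svB scratch     = TILE_K x (TILE_N + 12) =  8 x 44 elements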
+ int shmem_size = view_type_2d_scratch::shmem_size( + TILE_M + functor.get_partial_tile_m(), TILE_K) + + view_type_2d_scratch::shmem_size( + TILE_K, TILE_N + functor.get_partial_tile_n()); // Each member solves a portion of TILE_K in parallel with other members policy_type team_policy(league_size, team_size, vector_len); @@ -194,10 +203,15 @@ class BatchedDblBufGemm { ScalarType __alpha, __beta; int __k; size_t __n_tile_k_tiles, __n_sub_tiles; - unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row; + unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row, + __partial_tile_m, __partial_tile_n, __stride_m, __stride_n, __ts, __vl; public: size_t get_n_sub_tiles() { return __n_sub_tiles; } + unsigned get_partial_tile_m() { return __partial_tile_m; } + unsigned get_partial_tile_n() { return __partial_tile_n; } + unsigned n_extra_threads() { return __partial_tile_m / REG_M; } + unsigned n_extra_vlanes() { return __partial_tile_n / REG_N; } // NOTE: We cannot use __ei.{__A,__B,__C,__beta,__alpha,__k} in the operator // below. If those are used, we get an invalid memory error from cuda. I @@ -231,17 +245,44 @@ class BatchedDblBufGemm { } __beta = ei.__beta; // Copy to device __alpha = ei.__alpha; // Copy to device - // To handle truncation of tiles per row/col, round up to one extra tile - // with '!!'. This extra tile will hang off the edge of the 2-rank matrix. - // For cases where tiles hang off the edge, we over-compute 0s within - // registers via a conditional bounds check selected at compile-time. - __tiles_per_row = ei.__c_m / __tile_m + !!((unsigned)ei.__c_m % __tile_m); - __tiles_per_col = ei.__c_n / __tile_n + !!((unsigned)ei.__c_n % __tile_n); + // To handle cases where ei.__c_{m,n} % __tile_{m,n} != 0 without simply + // multiplying by zeros, we can adjust the scratch space size in + // increments of REG_M / REG_M as well as the team / vector sizes up to 2 + // * cur_team_size - 1. For each additional thread, we must increase + // STRIDE_M by 1. For each additional vlane, we must increase STRIDE_N by + // 1. Note that each thead can handle REG_M extra rows and each vlane can + // handle REG_N extra cols. + __tiles_per_row = ei.__c_m / __tile_m; + __tiles_per_col = ei.__c_n / __tile_n; + + // Each partial_tile is a multiple of REG_{M,N} since each thread holds + // register buffers of size REG_{M,N}. + __partial_tile_m = ei.__c_m % __tile_m && ei.__c_m > __tile_m + ? REG_M * ((ei.__c_m - __tile_m) / REG_M + 1) + : 0; + __partial_tile_n = ei.__c_n % __tile_n && ei.__c_n > __tile_n + ? REG_N * ((ei.__c_n - __tile_n) / REG_N + 1) + : 0; + + __stride_m = STRIDE_M + n_extra_threads(); + __stride_n = STRIDE_N + n_extra_vlanes(); + + __ts = __tile_n / REG_N + n_extra_threads(); + __vl = __tile_n / REG_N + n_extra_vlanes(); // To handle truncation of __n_tile_k_tile, we have logic within the // operator for handling a partial __tile_k tile. 
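// For instance (illustrative numbers only): with __k = 20 and __tile_k = 8,
//   __n_tile_k_tiles = 20 / 8 = 2
// and the remaining 20 - 2 * 8 = 4 k-iterations are picked up by that
// partial-__tile_k logic rather than by a third full k tile.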
__n_tile_k_tiles = __k / __tile_k; __n_sub_tiles = __tiles_per_row * __tiles_per_col; + + if (ei.__handle->enableDebug) { + std::cout << "__partial_tile_m:" << __partial_tile_m << std::endl + << "__partial_tile_n:" << __partial_tile_n << std::endl + << "__stride_m:" << __stride_m << std::endl + << "__stride_n:" << __stride_n << std::endl + << "__ts:" << __ts << std::endl + << "__vl:" << __vl << std::endl; + } } KOKKOS_INLINE_FUNCTION @@ -253,10 +294,9 @@ class BatchedDblBufGemm { view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, tile_m / REG_M), - [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, tile_n / REG_N), + Kokkos::ThreadVectorRange(member, 0, __vl), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -266,13 +306,13 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(k, thread_id + m * STRIDE_M); + reg_a[m] = svA_scr(k, thread_id + m * __stride_m); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + reg_b[n] = svB_scr(k, vlane_id + n * __stride_n); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -314,14 +354,15 @@ class BatchedDblBufGemm { __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching - view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, __tile_m); - view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, __tile_n); + view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, + __tile_m + __partial_tile_m); + view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, + __tile_n + __partial_tile_n); // Here we populate scratch memory with one or more "k" tiles for every // thread of the team! Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), - [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), @@ -329,7 +370,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) + for (int i = 0; i < REG_N * __stride_n; i += __stride_n) svB_scr(vlane_id, thread_id + i) = access_view_bounds_check( svB, vlane_id, thread_offset + i, @@ -337,8 +378,7 @@ class BatchedDblBufGemm { }); }); Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), - [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), @@ -346,7 +386,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + for (int i = 0; i < REG_M * __stride_m; i += __stride_m) svA_scr(vlane_id, thread_id + i) = access_view_bounds_check( svA, thread_offset + i, vlane_id, @@ -373,7 +413,7 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. 
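// In outline, each pass of this k-tile loop does the following (a paraphrase
// of the surrounding code, not new logic):
//   1. prefetch the next k tile of B and A from global memory into
//      per-thread registers (bounds-checked loads);
//   2. __rshmem_and_mult: multiply the k tile currently held in scratch
//      into reg_c;
//   3. member.team_barrier() so no one is still reading scratch;
//   4. flush prefetch_reg_b / prefetch_reg_a into svB_scr / svA_scr;
//   5. member.team_barrier() so scratch now holds the next tile;
// after the loop, one more __rshmem_and_mult covers the last k tile
// (including the partial-__tile_k case) before reg_c is written to C.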
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( @@ -386,7 +426,7 @@ class BatchedDblBufGemm { prefetch_reg_b[i] = access_view_bounds_check( svB, vlane_id + k_tile_offset, - thread_offset + i * STRIDE_N, + thread_offset + i * __stride_n, __ei.__bounds_check_tag); }); }); @@ -395,7 +435,7 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id + start_m; Kokkos::parallel_for( @@ -407,7 +447,7 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + i * STRIDE_M, + svA, thread_offset + i * __stride_m, vlane_id + k_tile_offset, __ei.__bounds_check_tag); }); @@ -424,7 +464,7 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_a. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( @@ -434,7 +474,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(vlane_id, thread_offset + i * STRIDE_N) = + svB_scr(vlane_id, thread_offset + i * __stride_n) = prefetch_reg_b[i]; }); }); @@ -442,7 +482,7 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_b. 
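// To make the STRIDE-based ownership concrete (illustrative values, using the
// 32x32x8 tiles from above, REG_M = 4, STRIDE_M = 8, and no partial tile):
//   tile row of thread t, register entry m:  t + m * STRIDE_M
// so thread 0 owns tile rows {0, 8, 16, 24}, thread 1 owns {1, 9, 17, 25},
// ..., thread 7 owns {7, 15, 23, 31}; vector lanes are interleaved over the
// tile columns the same way via STRIDE_N. The same pattern is used by the
// register prefetches above and by the reads in __rshmem_and_mult.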
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( @@ -452,7 +492,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(vlane_id, thread_offset + i * STRIDE_M) = + svA_scr(vlane_id, thread_offset + i * __stride_m) = prefetch_reg_a[i]; }); }); @@ -468,26 +508,26 @@ class BatchedDblBufGemm { __rshmem_and_mult(member, partial_tile_k, __tile_m, __tile_n, reg_a, reg_b, reg_c, svA_scr, svB_scr); + // store results back to global memory if (__beta == 0.0F) { - // store results back to global memory Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), + Kokkos::ThreadVectorRange(member, 0, __vl), [&](const int &vlane_id) { auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) { - int cm = thread_m_offset + m * STRIDE_M; + int cm = thread_m_offset + m * __stride_m; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) { - int cn = thread_n_offset + n * STRIDE_N; + int cn = thread_n_offset + n * __stride_n; fma_bounds_check(svC, cm, cn, reg_c[m][n], __ei.__bounds_check_tag); } @@ -495,25 +535,24 @@ class BatchedDblBufGemm { }); }); } else { - // store results back to global memory Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), + Kokkos::ThreadVectorRange(member, 0, __vl), [&](const int &vlane_id) { auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) { - int cm = thread_m_offset + m * STRIDE_M; + int cm = thread_m_offset + m * __stride_m; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) { - int cn = thread_n_offset + n * STRIDE_N; + int cn = thread_n_offset + n * __stride_n; fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, __ei.__bounds_check_tag); } From 7dc1d60d3613924eac85fca331d87bab92cbd303 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 21 Oct 2021 13:11:19 -0600 Subject: [PATCH 02/18] Revert "batched/dense: Rework BatchedDblBufGemm" This reverts commit b01db1e7324b052766420b3692bfe39b2e28db11. --- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 7 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 127 ++++++------------ 2 files changed, 48 insertions(+), 86 deletions(-) diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index f3c5ba5b5a..0f5afcc6aa 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -463,9 +463,10 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { // // TODO: invoke TeamShmem // } else - if (on_gpu && ((std::is_same::value) - ? 
(c_m >= 16) - : (c_m >= 24))) { + if (on_gpu && + ((std::is_same::value) + ? (c_m >= 16) + : (c_m >= 24 && c_m <= 32) || (c_m >= 45 && c_m <= 64))) { handle->teamSz = handle->vecLen = 8; constexpr int tile_m = 32, tile_n = 32, tile_k = 8; if (c_m % 32 == 0) // No bounds checking diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index ae3a1b61a2..f8fc55f5b2 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -130,15 +130,10 @@ class BatchedDblBufGemm { } // Each team solves a single tile. Within each tile, the team solves - // all n_sub_tiles, one at a time. + // all __n_tile_k_tiles one at a time. size_t league_size = __c_batch_size * functor.get_n_sub_tiles(); - // TODO: determine max_team_size and max_vector_len here instead of using 32 - // and 16 - int team_size = - std::min(stride_m + functor.n_extra_threads(), (unsigned)32); - // TODO: why are 2x vector lanes needed rather than just 1 more? - int vector_len = - std::min(stride_n * (functor.n_extra_vlanes() + 1), (unsigned)16); + int team_size = stride_m; + int vector_len = stride_n; const int max_team_size = policy_type(league_size, Kokkos::AUTO, vector_len) @@ -170,18 +165,14 @@ class BatchedDblBufGemm { << " team_size:" << team_size << std::endl << "max_vector_len:" << max_vector_len << " vector_len:" << vector_len << std::endl - << " league_size:" << league_size << std::endl << "TILE_M:" << TILE_M << std::endl << "TILE_N:" << TILE_N << std::endl << "TILE_K:" << TILE_K << std::endl; } - // NOTE: All but shmem_size args but partial tile sizes can be determined at - // compile time. - int shmem_size = view_type_2d_scratch::shmem_size( - TILE_M + functor.get_partial_tile_m(), TILE_K) + - view_type_2d_scratch::shmem_size( - TILE_K, TILE_N + functor.get_partial_tile_n()); + // TODO: Use statically allocated shmem + int shmem_size = view_type_2d_scratch::shmem_size(TILE_M, TILE_K) + + view_type_2d_scratch::shmem_size(TILE_K, TILE_N); // Each member solves a portion of TILE_K in parallel with other members policy_type team_policy(league_size, team_size, vector_len); @@ -203,15 +194,10 @@ class BatchedDblBufGemm { ScalarType __alpha, __beta; int __k; size_t __n_tile_k_tiles, __n_sub_tiles; - unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row, - __partial_tile_m, __partial_tile_n, __stride_m, __stride_n, __ts, __vl; + unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row; public: size_t get_n_sub_tiles() { return __n_sub_tiles; } - unsigned get_partial_tile_m() { return __partial_tile_m; } - unsigned get_partial_tile_n() { return __partial_tile_n; } - unsigned n_extra_threads() { return __partial_tile_m / REG_M; } - unsigned n_extra_vlanes() { return __partial_tile_n / REG_N; } // NOTE: We cannot use __ei.{__A,__B,__C,__beta,__alpha,__k} in the operator // below. If those are used, we get an invalid memory error from cuda. I @@ -245,44 +231,17 @@ class BatchedDblBufGemm { } __beta = ei.__beta; // Copy to device __alpha = ei.__alpha; // Copy to device - // To handle cases where ei.__c_{m,n} % __tile_{m,n} != 0 without simply - // multiplying by zeros, we can adjust the scratch space size in - // increments of REG_M / REG_M as well as the team / vector sizes up to 2 - // * cur_team_size - 1. For each additional thread, we must increase - // STRIDE_M by 1. For each additional vlane, we must increase STRIDE_N by - // 1. 
Note that each thead can handle REG_M extra rows and each vlane can - // handle REG_N extra cols. - __tiles_per_row = ei.__c_m / __tile_m; - __tiles_per_col = ei.__c_n / __tile_n; - - // Each partial_tile is a multiple of REG_{M,N} since each thread holds - // register buffers of size REG_{M,N}. - __partial_tile_m = ei.__c_m % __tile_m && ei.__c_m > __tile_m - ? REG_M * ((ei.__c_m - __tile_m) / REG_M + 1) - : 0; - __partial_tile_n = ei.__c_n % __tile_n && ei.__c_n > __tile_n - ? REG_N * ((ei.__c_n - __tile_n) / REG_N + 1) - : 0; - - __stride_m = STRIDE_M + n_extra_threads(); - __stride_n = STRIDE_N + n_extra_vlanes(); - - __ts = __tile_n / REG_N + n_extra_threads(); - __vl = __tile_n / REG_N + n_extra_vlanes(); + // To handle truncation of tiles per row/col, round up to one extra tile + // with '!!'. This extra tile will hang off the edge of the 2-rank matrix. + // For cases where tiles hang off the edge, we over-compute 0s within + // registers via a conditional bounds check selected at compile-time. + __tiles_per_row = ei.__c_m / __tile_m + !!((unsigned)ei.__c_m % __tile_m); + __tiles_per_col = ei.__c_n / __tile_n + !!((unsigned)ei.__c_n % __tile_n); // To handle truncation of __n_tile_k_tile, we have logic within the // operator for handling a partial __tile_k tile. __n_tile_k_tiles = __k / __tile_k; __n_sub_tiles = __tiles_per_row * __tiles_per_col; - - if (ei.__handle->enableDebug) { - std::cout << "__partial_tile_m:" << __partial_tile_m << std::endl - << "__partial_tile_n:" << __partial_tile_n << std::endl - << "__stride_m:" << __stride_m << std::endl - << "__stride_n:" << __stride_n << std::endl - << "__ts:" << __ts << std::endl - << "__vl:" << __vl << std::endl; - } } KOKKOS_INLINE_FUNCTION @@ -294,9 +253,10 @@ class BatchedDblBufGemm { view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, tile_m / REG_M), + [&](const int &thread_id) { Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __vl), + Kokkos::ThreadVectorRange(member, 0, tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -306,13 +266,13 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(k, thread_id + m * __stride_m); + reg_a[m] = svA_scr(k, thread_id + m * STRIDE_M); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * __stride_n); + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -354,15 +314,14 @@ class BatchedDblBufGemm { __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching - view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, - __tile_m + __partial_tile_m); - view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, - __tile_n + __partial_tile_n); + view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, __tile_m); + view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, __tile_n); // Here we populate scratch memory with one or more "k" tiles for every // thread of the team! 
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), @@ -370,7 +329,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * __stride_n; i += __stride_n) + for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) svB_scr(vlane_id, thread_id + i) = access_view_bounds_check( svB, vlane_id, thread_offset + i, @@ -378,7 +337,8 @@ class BatchedDblBufGemm { }); }); Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + [&](const int &thread_id) { auto thread_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), @@ -386,7 +346,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * __stride_m; i += __stride_m) + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) svA_scr(vlane_id, thread_id + i) = access_view_bounds_check( svA, thread_offset + i, vlane_id, @@ -413,7 +373,7 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( @@ -426,7 +386,7 @@ class BatchedDblBufGemm { prefetch_reg_b[i] = access_view_bounds_check( svB, vlane_id + k_tile_offset, - thread_offset + i * __stride_n, + thread_offset + i * STRIDE_N, __ei.__bounds_check_tag); }); }); @@ -435,7 +395,7 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { auto thread_offset = thread_id + start_m; Kokkos::parallel_for( @@ -447,7 +407,7 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + i * __stride_m, + svA, thread_offset + i * STRIDE_M, vlane_id + k_tile_offset, __ei.__bounds_check_tag); }); @@ -464,7 +424,7 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_a. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( @@ -474,7 +434,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(vlane_id, thread_offset + i * __stride_n) = + svB_scr(vlane_id, thread_offset + i * STRIDE_N) = prefetch_reg_b[i]; }); }); @@ -482,7 +442,7 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_b. 
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( @@ -492,7 +452,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(vlane_id, thread_offset + i * __stride_m) = + svA_scr(vlane_id, thread_offset + i * STRIDE_M) = prefetch_reg_a[i]; }); }); @@ -508,26 +468,26 @@ class BatchedDblBufGemm { __rshmem_and_mult(member, partial_tile_k, __tile_m, __tile_n, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // store results back to global memory if (__beta == 0.0F) { + // store results back to global memory Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __vl), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) { - int cm = thread_m_offset + m * __stride_m; + int cm = thread_m_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) { - int cn = thread_n_offset + n * __stride_n; + int cn = thread_n_offset + n * STRIDE_N; fma_bounds_check(svC, cm, cn, reg_c[m][n], __ei.__bounds_check_tag); } @@ -535,24 +495,25 @@ class BatchedDblBufGemm { }); }); } else { + // store results back to global memory Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __vl), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) { - int cm = thread_m_offset + m * __stride_m; + int cm = thread_m_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) { - int cn = thread_n_offset + n * __stride_n; + int cn = thread_n_offset + n * STRIDE_N; fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, __ei.__bounds_check_tag); } From f6ac57d615801e3d8fa8d7b1cab9cee13ace75da Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 21 Oct 2021 13:08:08 -0600 Subject: [PATCH 03/18] some experiments for improving dram utilization --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index f8fc55f5b2..384386113f 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -83,6 +83,7 @@ class BatchedDblBufGemm { typename execution_space_type::scratch_memory_space; using view_type_2d_scratch = Kokkos::View; + // TODO: add compile-time extents public: BatchedDblBufGemm(HandleType *const handle, ScalarType alpha, AViewType A, @@ -261,12 +262,16 @@ class BatchedDblBufGemm { #if 
defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (unsigned k = 0; k < nk; ++k) { + for (unsigned k = 0; k < nk; + ++k) { // TODO: would have to invert this for + // threadVectorRange copy TODOs below #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) + for (int m = 0; m < REG_M; + ++m) // TODO: this could be a threadVectorRange copy reg_a[m] = svA_scr(k, thread_id + m * STRIDE_M); + // TODO: reg_a could be a thread shared buffer #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -297,6 +302,9 @@ class BatchedDblBufGemm { // Allocate registers used for FMAs view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, reg_c[REG_M][REG_N] = {{0}}; + // TODO: look at local loads and stores via nvprof + // TODO: look at GPU trace in nvprof to find out how many registers are + // used. unsigned batch_idx = member.league_rank() / __n_sub_tiles; @@ -334,6 +342,13 @@ class BatchedDblBufGemm { access_view_bounds_check( svB, vlane_id, thread_offset + i, __ei.__bounds_check_tag); + // TODO: Use LayoutLeft here for contiguous access across + // vlanes?? + // or change indexing like this: + // access_view_bounds_check( + // svB, i, thread_offset + + // vlane_id, + // __ei.__bounds_check_tag); }); }); Kokkos::parallel_for( @@ -351,6 +366,7 @@ class BatchedDblBufGemm { access_view_bounds_check( svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); + // TODO: might be able to use local deep copy here. }); }); From d15eb7de80385aa925b7dcdc80ba4651cc3ed763 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 25 Oct 2021 14:54:22 -0600 Subject: [PATCH 04/18] Add some notes from talk with Vinh --- src/batched/KokkosBatched_Util.hpp | 4 ++++ src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 3 +++ 2 files changed, 7 insertions(+) diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index f13db02e62..e59f2146f4 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -826,6 +826,10 @@ KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { if (m < v.extent_int(0) && n < v.extent_int(1)) return v(m, n); return (ViewValueType)0.0F; + // TODO: use compile-time extents + // if (m > scr.extent(0) || n > scr.extent(1)) + // return 0; + // return v(m, n); } template diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 384386113f..52d936f453 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -296,6 +296,8 @@ class BatchedDblBufGemm { KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + // TODO: use Kokkos view with compile-time size to allocating register?? + // Then we can use local deep copy for prefetch_reg population. // Allocate registers used for prefetching view_value_type prefetch_reg_a[REG_M] = {0}, prefetch_reg_b[REG_N] = {0}; @@ -365,6 +367,7 @@ class BatchedDblBufGemm { svA_scr(vlane_id, thread_id + i) = access_view_bounds_check( svA, thread_offset + i, vlane_id, + // svB, vlane_id, thread_offset + i, __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. 
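// One possible shape for that TODO, as a sketch only (the names below are
// illustrative, and svA_scr is stored k-major here, so the copy would need a
// transposed scratch layout or an explicit transpose):
//   auto svA_blk = Kokkos::subview(
//       svA, Kokkos::make_pair(start_m, start_m + (int)__tile_m),
//       Kokkos::make_pair(0, (int)__tile_k));
//   Kokkos::Experimental::local_deep_copy(member, svA_scr, svA_blk);
// i.e. one team-collective copy in place of the hand-written loop nest.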
}); From e35709b217c8b3810bb6b4c63e9e0126b3c23fb7 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 25 Oct 2021 14:58:16 -0600 Subject: [PATCH 05/18] Add some notes from talk with Vinh last week --- .../dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 52d936f453..f95f4517d5 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -58,6 +58,15 @@ namespace Impl { /// CT/NT, NT/CT, CT/CT /// +// TODO - scaling between (32x32, 64x64) +// Option 0: Increase number of tiles and figure out how to map kokkos teams +// into cuda grid. Keep team size and vector lanes constant. +// TODO: write up small example and ask Christian. [DONE, +// MdRangePolicy not applicable here] +// Option 1: Increase register sizes to handle rows/cols past tile size +// Option 2: Fix league_size and have single team solve full tile followed +// by same team solving extra rows/cols (without multiplying by the +// zero rows/cols) template Date: Thu, 28 Oct 2021 13:36:44 -0600 Subject: [PATCH 06/18] update strides to be 1 --- src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index f95f4517d5..54941355d1 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -123,8 +123,8 @@ class BatchedDblBufGemm { // TODO: check these expressions for all tile_m, tile_n, tile_k in Z+. constexpr int reg_m = TILE_M / TILE_K; constexpr int reg_n = TILE_N / TILE_K + 2 * !!(TILE_N % TILE_K); - constexpr int stride_m = TILE_K; - constexpr int stride_n = TILE_N / reg_n; + constexpr int stride_m = 1; + constexpr int stride_n = 1; using functor_type = Functor; functor_type functor(*this, __A, __B, __C, TILE_M, TILE_N, TILE_K); @@ -142,8 +142,8 @@ class BatchedDblBufGemm { // Each team solves a single tile. Within each tile, the team solves // all __n_tile_k_tiles one at a time. size_t league_size = __c_batch_size * functor.get_n_sub_tiles(); - int team_size = stride_m; - int vector_len = stride_n; + int team_size = TILE_K; + int vector_len = TILE_N / reg_n; const int max_team_size = policy_type(league_size, Kokkos::AUTO, vector_len) From fdb5a6ba8544f4bcf80223c0944024a5619f92f5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 28 Oct 2021 13:58:44 -0600 Subject: [PATCH 07/18] Update inital B scratch pre-fetch --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 54941355d1..84c516e6b5 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -339,27 +339,22 @@ class BatchedDblBufGemm { // Here we populate scratch memory with one or more "k" tiles for every // thread of the team! 
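// Context for the rework below (part of this series' dram-utilization
// experiments): moving the ThreadVectorRange from the k index to the n index
// keeps a thread's vector lanes within a row of B rather than spread down a
// column, which is the friendlier pattern for a row-major (LayoutRight) B.
// Illustrative picture only, not code from the patch:
//   before: lane v loads svB(v, n)              -- lanes span __tile_k rows
//   after : lane v loads svB(k, n0 + v * REG_N) -- lanes stay within row k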
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_k), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(vlane_id, thread_id + i) = + svB_scr(thread_id, vlane_id * REG_N + i) = access_view_bounds_check( - svB, vlane_id, thread_offset + i, + svB, thread_offset, vlane_id * REG_N + i, __ei.__bounds_check_tag); - // TODO: Use LayoutLeft here for contiguous access across - // vlanes?? - // or change indexing like this: - // access_view_bounds_check( - // svB, i, thread_offset + - // vlane_id, - // __ei.__bounds_check_tag); + // TODO: use svB_scr(thread_id + i, vlane_id) to stride + // accesses to shared memory }); }); Kokkos::parallel_for( From 0432f347e5e52f596b57414aa549b0269822bccd Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 28 Oct 2021 14:42:08 -0600 Subject: [PATCH 08/18] WIP: Update initial A scratch pre-fetch --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 84c516e6b5..a2ad548605 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -333,7 +333,7 @@ class BatchedDblBufGemm { __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching - view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, __tile_m); + view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_m, __tile_k); view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, __tile_n); // Here we populate scratch memory with one or more "k" tiles for every @@ -368,14 +368,25 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(vlane_id, thread_id + i) = + svA_scr(thread_id EXPR, (vlane_id % 2) * REG_M + i) = access_view_bounds_check( - svA, thread_offset + i, vlane_id, - // svB, vlane_id, thread_offset + i, + svA, thread_offset EXPR, (vlane_id % 2) * REG_M + i, __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. 
}); }); + // Kokkos::parallel_for( + // Kokkos::TeamVectorRange(member, 0, (__tile_m / REG_M) * + // __tile_k), + // [&](const int vlane_id) { + // for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) { + // svA_scr(vlane_id / __tile_k, vlane_id / REG_M + i) = + // access_view_bounds_check( + // svA, vlane_id / __tile_m + start_m, vlane_id / + // REG_M + i, + // __ei.__bounds_check_tag); + // } + // }); // Check whether we have a partial tile unsigned partial_tile_k = __k - (__n_tile_k_tiles * __tile_k); From d86006cdc02e35eb8285eb67cee27ee619d898ea Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 29 Oct 2021 16:52:04 -0600 Subject: [PATCH 09/18] DblBuf: Checkpoint, contig indexing --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index a2ad548605..0467b8e44f 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -262,31 +262,35 @@ class BatchedDblBufGemm { view_value_type reg_c[REG_M][REG_N], view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { + // view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_m, + // __tile_k); view_type_2d_scratch svB_scr(member.team_scratch(0), + // __tile_k, __tile_n); Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, tile_m / REG_M), [&](const int &thread_id) { + auto thread_offset = thread_id * REG_M; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (unsigned k = 0; k < nk; - ++k) { // TODO: would have to invert this for - // threadVectorRange copy TODOs below + // TODO: would have to invert this for + // threadVectorRange copy TODOs below + for (unsigned k = 0; k < nk; ++k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; - ++m) // TODO: this could be a threadVectorRange copy - reg_a[m] = svA_scr(k, thread_id + m * STRIDE_M); + for (int m = 0; m < REG_M; ++m) + // TODO: this could be a threadVectorRange copy + reg_a[m] = svA_scr(thread_offset + m * STRIDE_M, k); // TODO: reg_a could be a thread shared buffer #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + reg_b[n] = svB_scr(k, vlane_id * REG_N + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -349,7 +353,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id, vlane_id * REG_N + i) = + svB_scr(thread_id + i, vlane_id * REG_N + i) = access_view_bounds_check( svB, thread_offset, vlane_id * REG_N + i, __ei.__bounds_check_tag); @@ -360,7 +364,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id + start_m; + auto thread_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { @@ -368,25 +372,21 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - 
svA_scr(thread_id EXPR, (vlane_id % 2) * REG_M + i) = + svA_scr(thread_id * REG_M + (vlane_id / 2), + (vlane_id % 2) * REG_M + i) = access_view_bounds_check( - svA, thread_offset EXPR, (vlane_id % 2) * REG_M + i, + svA, thread_offset + (vlane_id / 2), + (vlane_id % 2) * REG_M + i, __ei.__bounds_check_tag); + // svA_scr(thread_id, (vlane_id % 2) * + // REG_M + i) = + // access_view_bounds_check( + // svA, thread_offset, (vlane_id % + // 2) * REG_M + i, + // __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. }); }); - // Kokkos::parallel_for( - // Kokkos::TeamVectorRange(member, 0, (__tile_m / REG_M) * - // __tile_k), - // [&](const int vlane_id) { - // for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) { - // svA_scr(vlane_id / __tile_k, vlane_id / REG_M + i) = - // access_view_bounds_check( - // svA, vlane_id / __tile_m + start_m, vlane_id / - // REG_M + i, - // __ei.__bounds_check_tag); - // } - // }); // Check whether we have a partial tile unsigned partial_tile_k = __k - (__n_tile_k_tiles * __tile_k); @@ -407,11 +407,11 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_k), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -419,9 +419,9 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_N; ++i) prefetch_reg_b[i] = access_view_bounds_check( - svB, vlane_id + k_tile_offset, - thread_offset + i * STRIDE_N, - __ei.__bounds_check_tag); + svB, thread_offset + k_tile_offset, + vlane_id * REG_N + i, __ei.__bounds_check_tag); + // TODO: use svB_scr(thread_id + i, vlane_id) to stride }); }); @@ -431,7 +431,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id + start_m; + auto thread_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { @@ -441,8 +441,8 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + i * STRIDE_M, - vlane_id + k_tile_offset, + svA, thread_offset + (vlane_id / 2), + (vlane_id % 2) * REG_M + i + k_tile_offset, __ei.__bounds_check_tag); }); }); @@ -458,17 +458,17 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_a. 
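// The two indexing schemes this series moves between, in terms of which tile
// rows a thread's REG_M register entries map to (illustrative values,
// REG_M = 4, STRIDE_M = 8, __tile_m = 32):
//   blocked    : row = thread_id * REG_M + m     -> thread 1 owns rows 4..7
//   interleaved: row = thread_id + m * STRIDE_M  -> thread 1 owns rows 1, 9, 17, 25
// (the later "Restore strides" commits return to the interleaved form).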
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_k), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(vlane_id, thread_offset + i * STRIDE_N) = + svB_scr(thread_id, vlane_id * REG_N + i) = prefetch_reg_b[i]; }); }); @@ -478,7 +478,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id; + auto thread_offset = thread_id * REG_M; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { @@ -486,8 +486,8 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(vlane_id, thread_offset + i * STRIDE_M) = - prefetch_reg_a[i]; + svA_scr(thread_offset + (vlane_id / 2), + (vlane_id % 2) * REG_M + i) = prefetch_reg_a[i]; }); }); @@ -507,11 +507,11 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_m_offset = thread_id + start_m; + auto thread_m_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto thread_n_offset = vlane_id + start_n; + auto thread_n_offset = vlane_id * REG_N + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -533,11 +533,11 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_m_offset = thread_id + start_m; + auto thread_m_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto thread_n_offset = vlane_id + start_n; + auto thread_n_offset = vlane_id * REG_N + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL From bd7e63a6b24d54153e864d1fbc02d4a48702e4e4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Nov 2021 12:17:46 -0600 Subject: [PATCH 10/18] DblBuf: Checkpoint, fix typo & cleanup --- .../dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 0467b8e44f..80f97d9946 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -353,12 +353,10 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id + i, vlane_id * REG_N + i) = + svB_scr(thread_id, vlane_id * REG_N + i) = access_view_bounds_check( svB, thread_offset, vlane_id * REG_N + i, __ei.__bounds_check_tag); - // TODO: use svB_scr(thread_id + i, vlane_id) to stride - // accesses to shared memory }); }); Kokkos::parallel_for( @@ -378,12 +376,6 @@ class BatchedDblBufGemm { svA, thread_offset + (vlane_id / 2), (vlane_id % 2) * REG_M + i, __ei.__bounds_check_tag); - // svA_scr(thread_id, (vlane_id 
% 2) * - // REG_M + i) = - // access_view_bounds_check( - // svA, thread_offset, (vlane_id % - // 2) * REG_M + i, - // __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. }); }); @@ -421,7 +413,6 @@ class BatchedDblBufGemm { access_view_bounds_check( svB, thread_offset + k_tile_offset, vlane_id * REG_N + i, __ei.__bounds_check_tag); - // TODO: use svB_scr(thread_id + i, vlane_id) to stride }); }); @@ -460,7 +451,6 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { - auto thread_offset = thread_id; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { From 27ecd4db6538ba79943ef1699e70cbdbe473b846 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Nov 2021 09:25:47 -0600 Subject: [PATCH 11/18] DblBuf: Fix start_n usage - Also refactor pre-fetching loops to avoid duplicate index calculations. --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 80f97d9946..0d8f2a2ff3 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -345,17 +345,17 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { - auto thread_offset = thread_id + start_n; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { + auto vlane_offset = vlane_id * REG_N + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) svB_scr(thread_id, vlane_id * REG_N + i) = access_view_bounds_check( - svB, thread_offset, vlane_id * REG_N + i, + svB, thread_id, vlane_offset + i, __ei.__bounds_check_tag); }); }); @@ -366,6 +366,8 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { + auto vld = (vlane_id / 2); + auto vlane_offset = (vlane_id % 2) * REG_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -373,8 +375,7 @@ class BatchedDblBufGemm { svA_scr(thread_id * REG_M + (vlane_id / 2), (vlane_id % 2) * REG_M + i) = access_view_bounds_check( - svA, thread_offset + (vlane_id / 2), - (vlane_id % 2) * REG_M + i, + svA, thread_offset + vld, vlane_offset + i, __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. }); @@ -399,20 +400,21 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. 
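// Note on the refactor below: Kokkos::TeamThreadRange also takes a
// (begin, end) pair, so the k offset can be folded into the range itself.
// Illustrative values only (__tile_k = 8, k_tile_offset = 24):
//   Kokkos::TeamThreadRange(member, 24, 32)  // thread_offset runs over 24..31,
//                                            // so no "+ k_tile_offset" is
//                                            // needed inside the lambda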
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_k), - [&](const int &thread_id) { - auto thread_offset = thread_id + start_n; + Kokkos::TeamThreadRange(member, k_tile_offset, + k_tile_offset + __tile_k), + [&](const int &thread_offset) { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { + auto vlane_offset = vlane_id * REG_N + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) prefetch_reg_b[i] = access_view_bounds_check( - svB, thread_offset + k_tile_offset, - vlane_id * REG_N + i, __ei.__bounds_check_tag); + svB, thread_offset, vlane_offset + i, + __ei.__bounds_check_tag); }); }); @@ -426,14 +428,15 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { + auto vld = (vlane_id / 2); + auto vlane_offset = (vlane_id % 2) * REG_M + k_tile_offset; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + (vlane_id / 2), - (vlane_id % 2) * REG_M + i + k_tile_offset, + svA, thread_offset + vld, vlane_offset + i, __ei.__bounds_check_tag); }); }); From 8df38fcc7e092d4f61de21b6680f73cfb74d899f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Nov 2021 14:00:43 -0600 Subject: [PATCH 12/18] DblBuf: Restore strides. - Initial pass at pre-fetch loops for 0th k-tile. --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 0d8f2a2ff3..374e2e66ab 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -123,8 +123,8 @@ class BatchedDblBufGemm { // TODO: check these expressions for all tile_m, tile_n, tile_k in Z+. constexpr int reg_m = TILE_M / TILE_K; constexpr int reg_n = TILE_N / TILE_K + 2 * !!(TILE_N % TILE_K); - constexpr int stride_m = 1; - constexpr int stride_n = 1; + constexpr int stride_m = TILE_K; + constexpr int stride_n = TILE_N / reg_n; using functor_type = Functor; functor_type functor(*this, __A, __B, __C, TILE_M, TILE_N, TILE_K); @@ -142,8 +142,8 @@ class BatchedDblBufGemm { // Each team solves a single tile. Within each tile, the team solves // all __n_tile_k_tiles one at a time. 
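// A worked example of the launch geometry below (illustrative sizes only):
// a batch of 1000 matrices with c_m = c_n = 64 and the 32x32x8 tiles gives
//   tiles_per_row = tiles_per_col = 64 / 32 = 2,   n_sub_tiles = 4,
//   league_size   = 1000 * 4 = 4000 teams,
// each with team_size = stride_m (= TILE_K = 8) and vector_len = stride_n
// (= TILE_N / reg_n = 8) as restored by this commit.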
size_t league_size = __c_batch_size * functor.get_n_sub_tiles(); - int team_size = TILE_K; - int vector_len = TILE_N / reg_n; + int team_size = stride_m; + int vector_len = stride_n; const int max_team_size = policy_type(league_size, Kokkos::AUTO, vector_len) @@ -348,12 +348,12 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto vlane_offset = vlane_id * REG_N + start_n; + auto vlane_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id, vlane_id * REG_N + i) = + svB_scr(thread_id, vlane_id + i) = access_view_bounds_check( svB, thread_id, vlane_offset + i, __ei.__bounds_check_tag); @@ -366,16 +366,13 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { - auto vld = (vlane_id / 2); - auto vlane_offset = (vlane_id % 2) * REG_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(thread_id * REG_M + (vlane_id / 2), - (vlane_id % 2) * REG_M + i) = + for (int i = 0; i < REG_M; i++) + svA_scr(thread_id * REG_M + i, vlane_id) = access_view_bounds_check( - svA, thread_offset + vld, vlane_offset + i, + svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. }); @@ -413,7 +410,7 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_N; ++i) prefetch_reg_b[i] = access_view_bounds_check( - svB, thread_offset, vlane_offset + i, + svB, thread_offset, vlane_offset + i * STRIDE_N, __ei.__bounds_check_tag); }); }); @@ -436,7 +433,8 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + vld, vlane_offset + i, + svA, thread_offset + vld, + vlane_offset + i * STRIDE_N, __ei.__bounds_check_tag); }); }); @@ -461,7 +459,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id, vlane_id * REG_N + i) = + svB_scr(thread_id, vlane_id * REG_N + i * STRIDE_N) = prefetch_reg_b[i]; }); }); @@ -480,7 +478,8 @@ class BatchedDblBufGemm { #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) svA_scr(thread_offset + (vlane_id / 2), - (vlane_id % 2) * REG_M + i) = prefetch_reg_a[i]; + (vlane_id % 2) * REG_M + i * STRIDE_M) = + prefetch_reg_a[i]; }); }); From aded516c278dce2d837c505e3b1c2df486f25d1f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Nov 2021 15:33:21 -0600 Subject: [PATCH 13/18] DblBuf: Restore strides. - Second pass at pre-fetch loops for all k tiles. 
--- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 374e2e66ab..0d33f9581a 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -283,14 +283,14 @@ class BatchedDblBufGemm { #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) // TODO: this could be a threadVectorRange copy - reg_a[m] = svA_scr(thread_offset + m * STRIDE_M, k); + reg_a[m] = svA_scr(thread_offset + m, k); // TODO: reg_a could be a thread shared buffer #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id * REG_N + n * STRIDE_N); + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -423,18 +423,16 @@ class BatchedDblBufGemm { [&](const int &thread_id) { auto thread_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_k), + Kokkos::ThreadVectorRange(member, k_tile_offset, + k_tile_offset + __tile_k), [&](const int &vlane_id) { - auto vld = (vlane_id / 2); - auto vlane_offset = (vlane_id % 2) * REG_M + k_tile_offset; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + vld, - vlane_offset + i * STRIDE_N, + svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); }); }); @@ -459,7 +457,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id, vlane_id * REG_N + i * STRIDE_N) = + svB_scr(thread_id, vlane_id + i * STRIDE_N) = prefetch_reg_b[i]; }); }); @@ -477,9 +475,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(thread_offset + (vlane_id / 2), - (vlane_id % 2) * REG_M + i * STRIDE_M) = - prefetch_reg_a[i]; + svA_scr(thread_offset + i, vlane_id) = prefetch_reg_a[i]; }); }); From d9252dbe49fa0885803fac7e1dcda151c4baf2ed Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 4 Nov 2021 15:12:42 -0600 Subject: [PATCH 14/18] DblBuf: Restore strides. - Third pass at pre-fetch loops for all k tiles. --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 0d33f9581a..14301ebc59 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -268,7 +268,6 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id * REG_M; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, tile_n / REG_N), [&](const int &vlane_id) { @@ -277,15 +276,20 @@ class BatchedDblBufGemm { #endif // KOKKOS_ENABLE_PRAGMA_UNROLL // TODO: would have to invert this for // threadVectorRange copy TODOs below - for (unsigned k = 0; k < nk; ++k) { + for (unsigned k = 0; k < nk; + ++k) { // TODO: svA_scr coalesced access. All vlanes are + // readying the same data from svA scr. 
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) // TODO: this could be a threadVectorRange copy - reg_a[m] = svA_scr(thread_offset + m, k); + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); // TODO: reg_a could be a thread shared buffer +// view_type_2d_scratch svA_scr(member.team_scratch(0), +// __tile_m, __tile_k); view_type_2d_scratch +// svB_scr(member.team_scratch(0), __tile_k, __tile_n); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -362,15 +366,15 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id * REG_M + start_m; + auto thread_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; i++) - svA_scr(thread_id * REG_M + i, vlane_id) = + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id + i, vlane_id) = access_view_bounds_check( svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); @@ -403,7 +407,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto vlane_offset = vlane_id * REG_N + start_n; + auto vlane_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -421,7 +425,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id * REG_M + start_m; + auto thread_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, k_tile_offset, k_tile_offset + __tile_k), @@ -432,7 +436,7 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + i, vlane_id, + svA, thread_offset + i * STRIDE_M, vlane_id, __ei.__bounds_check_tag); }); }); @@ -467,7 +471,6 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id * REG_M; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { @@ -475,7 +478,8 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(thread_offset + i, vlane_id) = prefetch_reg_a[i]; + svA_scr(thread_id + i * STRIDE_M, vlane_id) = + prefetch_reg_a[i]; }); }); @@ -495,11 +499,11 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_m_offset = thread_id * REG_M + start_m; + auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto thread_n_offset = vlane_id * REG_N + start_n; + auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -521,11 +525,11 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_m_offset = thread_id * REG_M + start_m; + auto thread_m_offset = thread_id + start_m; 
Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto thread_n_offset = vlane_id * REG_N + start_n; + auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL From 81d7f94f7dd04bfae43a5103cc46a79ee36bc5ef Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 6 Dec 2021 14:00:37 -0700 Subject: [PATCH 15/18] Use one pair of parallel-for for TeamThreadRange and ThreadVectorRange in double buffering batched gemm --- src/batched/KokkosBatched_Util.hpp | 12 +- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 9 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 194 ++++++++++++++++++ 3 files changed, 205 insertions(+), 10 deletions(-) diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index e59f2146f4..89d70cb2df 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -23,6 +23,7 @@ #include "Kokkos_Timer.hpp" #include "KokkosKernels_config.h" +#include "KokkosKernels_Utils.hpp" // TPL macros #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) @@ -824,12 +825,11 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( template KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { - if (m < v.extent_int(0) && n < v.extent_int(1)) return v(m, n); - return (ViewValueType)0.0F; - // TODO: use compile-time extents - // if (m > scr.extent(0) || n > scr.extent(1)) - // return 0; - // return v(m, n); + return v(KOKKOSKERNELS_MACRO_MIN(m,v.extent_int(0)), KOKKOSKERNELS_MACRO_MIN(n,v.extent_int(1))); + //// TODO: use compile-time extents + //// if (m > scr.extent(0) || n > scr.extent(1)) + //// return 0; + //// return v(m, n); } template diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index 0f5afcc6aa..ba154326bf 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -465,11 +465,10 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // } else if (on_gpu && ((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || (c_m >= 45 && c_m <= 64))) { + ? (c_m >= 16) : (c_m >= 24))) {//Vinh's note: use this condition for now, might need to revisit handle->teamSz = handle->vecLen = 8; constexpr int tile_m = 32, tile_n = 32, tile_k = 8; - if (c_m % 32 == 0) // No bounds checking + if (c_m % 32 == 0) { // No bounds checking ret = Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - else + } + else { ret = Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); + } } else { ret = Impl::BatchedSerialGemm(svB, thread_id, vlane_offset + i, __ei.__bounds_check_tag); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id + i, vlane_id) = access_view_bounds_check(svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); + + // Wait for A, B to reside in scratch memory + member.team_barrier(); + + // Each thread calculates a single dot product in chunks of size __tile_k + for (kk = 0; kk < __k - __tile_k; kk += __tile_k) { + int k_tile_offset = kk + __tile_k; + + // Get this threads next __tile_k entries from global memory + // Each thread has its own copy of prefetch_reg_b. 
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_N; ++i) + prefetch_reg_b[i] = access_view_bounds_check(svB, thread_id + k_tile_offset, vlane_offset + i * STRIDE_N, __ei.__bounds_check_tag); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_M; ++i) + prefetch_reg_a[i] = access_view_bounds_check(svA, thread_offset + i * STRIDE_M, vlane_id + k_tile_offset, __ei.__bounds_check_tag); + + // Multiply +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (unsigned k = 0; k < __tile_k; ++k) { + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) + reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; + } + } + + // Wait for: + // 1. prefetch_regs to be populated + // 2. for shmem to no longer be read from + member.team_barrier(); + + // populate shmem from prefetch registers. Each thread has its own copy of prefetch_reg_b. +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_N; ++i) + svB_scr(thread_id, vlane_id + i * STRIDE_N) = prefetch_reg_b[i]; + + // populate shmem from prefetch registers. Each thread has its own copy of prefetch_reg_a. 
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_M; ++i) + svA_scr(thread_id + i * STRIDE_M, vlane_id) = prefetch_reg_a[i]; + + // Wait for shmem stores to land before performing next __tile_k multiply + member.team_barrier(); + } // end n_tile_k_tiles loop + + // Multiply last tile, may be a partial tile +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (unsigned k = 0; k < __k - kk; ++k) { + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) + reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; + } + } + + // store results back to global memory + if (__beta == 0.0F) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) { + int cm = thread_offset + m * STRIDE_M; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) { + int cn = vlane_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __ei.__bounds_check_tag); + } + } + } else { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) { + int cm = thread_offset + m * STRIDE_M; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) { + int cn = vlane_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, __ei.__bounds_check_tag); + } + } + } + }); + }); + } +#endif }; }; /********************* END non-functor-level routines *********************/ From d549e7851bdc9ddc378e9d330822a9b4e8db3000 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 6 Dec 2021 23:01:52 -0700 Subject: [PATCH 16/18] Use constexpr for tile sizes --- .../dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index be51f06505..1302d56c98 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -127,7 +127,7 @@ class BatchedDblBufGemm { constexpr int stride_n = TILE_N / reg_n; using functor_type = Functor; - functor_type functor(*this, __A, __B, __C, TILE_M, TILE_N, TILE_K); + functor_type functor(*this, __A, __B, __C); if (__handle->enableDebug) { std::cout << "algo_type:" << __handle->get_kernel_algo_type() << std::endl @@ -204,7 +204,10 @@ class BatchedDblBufGemm { ScalarType __alpha, __beta; int __k; size_t __n_tile_k_tiles, __n_sub_tiles; - unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row; + static constexpr unsigned __tile_m = TILE_M; + static constexpr unsigned __tile_n = TILE_M; + static constexpr unsigned __tile_k = TILE_K; + unsigned 
__tiles_per_col, __tiles_per_row; public: size_t get_n_sub_tiles() { return __n_sub_tiles; } @@ -218,10 +221,7 @@ class BatchedDblBufGemm { : __ei(ei), __A(A), __B(B), - __C(C), - __tile_m(tile_m), - __tile_n(tile_n), - __tile_k(tile_k) { + __C(C) { if (std::is_same::value) { ei.__c_batch_size = ei.__C.extent_int(0); ei.__c_m = ei.__C.extent_int(1); From 03fce87891bb2436b6fab246bacf2a6b73fdad6c Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Mon, 6 Dec 2021 23:28:54 -0700 Subject: [PATCH 17/18] Apply clang-format 9.0.0 --- src/batched/KokkosBatched_Util.hpp | 3 +- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 10 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 218 ++++++++++-------- 3 files changed, 128 insertions(+), 103 deletions(-) diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 89d70cb2df..6177b90c18 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -825,7 +825,8 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( template KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { - return v(KOKKOSKERNELS_MACRO_MIN(m,v.extent_int(0)), KOKKOSKERNELS_MACRO_MIN(n,v.extent_int(1))); + return v(KOKKOSKERNELS_MACRO_MIN(m, v.extent_int(0)), + KOKKOSKERNELS_MACRO_MIN(n, v.extent_int(1))); //// TODO: use compile-time extents //// if (m > scr.extent(0) || n > scr.extent(1)) //// return 0; diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index ba154326bf..d3efbbe4b6 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -463,9 +463,10 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { // // TODO: invoke TeamShmem // } else - if (on_gpu && - ((std::is_same::value) - ? (c_m >= 16) : (c_m >= 24))) {//Vinh's note: use this condition for now, might need to revisit + if (on_gpu && ((std::is_same::value) + ? (c_m >= 16) + : (c_m >= 24))) { // Vinh's note: use this condition + // for now, might need to revisit handle->teamSz = handle->vecLen = 8; constexpr int tile_m = 32, tile_n = 32, tile_k = 8; if (c_m % 32 == 0) { // No bounds checking @@ -476,8 +477,7 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, BoundsCheck::No, tile_m, tile_n, tile_k>( handle, alpha, A, B, beta, C) .invoke(); - } - else { + } else { ret = Impl::BatchedDblBufGemm::value) { ei.__c_batch_size = ei.__C.extent_int(0); ei.__c_m = ei.__C.extent_int(1); @@ -585,161 +582,188 @@ class BatchedDblBufGemm { view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_m, __tile_k); view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, __tile_n); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, STRIDE_M), [&](const int &thread_id) { - int thread_offset = thread_id + start_m; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, STRIDE_M), + [&](const int &thread_id) { + int thread_offset = thread_id + start_m; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 0, STRIDE_N), [&](const int &vlane_id) { - int vlane_offset = vlane_id + start_n; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, 0, STRIDE_N), + [&](const int &vlane_id) { + int vlane_offset = vlane_id + start_n; - // Here we populate scratch memory with one or more "k" tiles for every thread of the team! 
+ // Here we populate scratch memory with one or more "k" tiles for + // every thread of the team! #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id, vlane_id + i) = access_view_bounds_check(svB, thread_id, vlane_offset + i, __ei.__bounds_check_tag); + for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) + svB_scr(thread_id, vlane_id + i) = + access_view_bounds_check( + svB, thread_id, vlane_offset + i, + __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(thread_id + i, vlane_id) = access_view_bounds_check(svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id + i, vlane_id) = + access_view_bounds_check( + svA, thread_offset + i, vlane_id, + __ei.__bounds_check_tag); - // Wait for A, B to reside in scratch memory - member.team_barrier(); + // Wait for A, B to reside in scratch memory + member.team_barrier(); - // Each thread calculates a single dot product in chunks of size __tile_k - for (kk = 0; kk < __k - __tile_k; kk += __tile_k) { - int k_tile_offset = kk + __tile_k; + // Each thread calculates a single dot product in chunks of + // size __tile_k + for (kk = 0; kk < __k - __tile_k; kk += __tile_k) { + int k_tile_offset = kk + __tile_k; - // Get this threads next __tile_k entries from global memory - // Each thread has its own copy of prefetch_reg_b. + // Get this threads next __tile_k entries from global memory + // Each thread has its own copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - prefetch_reg_b[i] = access_view_bounds_check(svB, thread_id + k_tile_offset, vlane_offset + i * STRIDE_N, __ei.__bounds_check_tag); + for (int i = 0; i < REG_N; ++i) + prefetch_reg_b[i] = + access_view_bounds_check( + svB, thread_id + k_tile_offset, + vlane_offset + i * STRIDE_N, + __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - prefetch_reg_a[i] = access_view_bounds_check(svA, thread_offset + i * STRIDE_M, vlane_id + k_tile_offset, __ei.__bounds_check_tag); + for (int i = 0; i < REG_M; ++i) + prefetch_reg_a[i] = + access_view_bounds_check( + svA, thread_offset + i * STRIDE_M, + vlane_id + k_tile_offset, + __ei.__bounds_check_tag); - // Multiply + // Multiply #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (unsigned k = 0; k < __tile_k; ++k) { + for (unsigned k = 0; k < __tile_k; ++k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + for (int m = 0; m < REG_M; ++m) + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + for (int n = 0; n < REG_N; ++n) + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // 
KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { + for (int m = 0; m < REG_M; ++m) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; - } - } + for (int n = 0; n < REG_N; ++n) + reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; + } + } - // Wait for: - // 1. prefetch_regs to be populated - // 2. for shmem to no longer be read from - member.team_barrier(); + // Wait for: + // 1. prefetch_regs to be populated + // 2. for shmem to no longer be read from + member.team_barrier(); - // populate shmem from prefetch registers. Each thread has its own copy of prefetch_reg_b. + // populate shmem from prefetch registers. Each thread has its own + // copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id, vlane_id + i * STRIDE_N) = prefetch_reg_b[i]; + for (int i = 0; i < REG_N; ++i) + svB_scr(thread_id, vlane_id + i * STRIDE_N) = + prefetch_reg_b[i]; - // populate shmem from prefetch registers. Each thread has its own copy of prefetch_reg_a. + // populate shmem from prefetch registers. Each thread has its own + // copy of prefetch_reg_a. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - svA_scr(thread_id + i * STRIDE_M, vlane_id) = prefetch_reg_a[i]; + for (int i = 0; i < REG_M; ++i) + svA_scr(thread_id + i * STRIDE_M, vlane_id) = + prefetch_reg_a[i]; - // Wait for shmem stores to land before performing next __tile_k multiply - member.team_barrier(); - } // end n_tile_k_tiles loop + // Wait for shmem stores to land before performing next + // __tile_k multiply + member.team_barrier(); + } // end n_tile_k_tiles loop - // Multiply last tile, may be a partial tile + // Multiply last tile, may be a partial tile #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (unsigned k = 0; k < __k - kk; ++k) { + for (unsigned k = 0; k < __k - kk; ++k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + for (int m = 0; m < REG_M; ++m) + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + for (int n = 0; n < REG_N; ++n) + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { + for (int m = 0; m < REG_M; ++m) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; - } - } + for (int n = 0; n < REG_N; ++n) + reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; + } + } - // store results back to global memory - if (__beta == 0.0F) { + // store results back to global memory + if (__beta == 0.0F) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = thread_offset 
+ m * STRIDE_M; + for (int m = 0; m < REG_M; ++m) { + int cm = thread_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = vlane_offset + n * STRIDE_N; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __ei.__bounds_check_tag); - } - } - } else { + for (int n = 0; n < REG_N; ++n) { + int cn = vlane_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], + __ei.__bounds_check_tag); + } + } + } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = thread_offset + m * STRIDE_M; + for (int m = 0; m < REG_M; ++m) { + int cm = thread_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = vlane_offset + n * STRIDE_N; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, __ei.__bounds_check_tag); + for (int n = 0; n < REG_N; ++n) { + int cn = vlane_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, + __ei.__bounds_check_tag); + } + } } - } - } + }); }); - }); } #endif }; From 2602b97f4045c190c2b61b30183cf0134b71a822 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 7 Dec 2021 12:57:36 -0700 Subject: [PATCH 18/18] Remove unused parameters tile_m, tile_n, tile_k --- src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 13e30117bf..139f30fb6c 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -216,8 +216,7 @@ class BatchedDblBufGemm { // below. If those are used, we get an invalid memory error from cuda. I // suspect this is due the values not being copied to device and then // runtime resolution of the host address &__ei. - Functor(BatchedDblBufGemm &ei, AViewType A, BViewType B, CViewType C, - unsigned tile_m = 1, unsigned tile_n = 1, unsigned tile_k = 1) + Functor(BatchedDblBufGemm &ei, AViewType A, BViewType B, CViewType C) : __ei(ei), __A(A), __B(B), __C(C) { if (std::is_same::value) { ei.__c_batch_size = ei.__C.extent_int(0);
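Taken together with the constexpr tile sizes introduced two patches earlier, the functor no longer carries any runtime tile parameters. A minimal sketch of that end state, in plain C++ with hypothetical names rather than the actual KokkosBatched interface: tile sizes travel as template parameters and are exposed as static constexpr members, so the constructor needs no tile arguments.

#include <cstdio>

// Hypothetical stand-in for the functor: tile sizes are template parameters
// exposed as compile-time constants instead of constructor arguments.
template <int TILE_M, int TILE_N, int TILE_K>
struct TileFunctor {
  static constexpr unsigned tile_m = TILE_M;
  static constexpr unsigned tile_n = TILE_N;
  static constexpr unsigned tile_k = TILE_K;
};

int main() {
  TileFunctor<32, 32, 8> f;  // same 32x32x8 tiling selected in Gemm_Decl above
  std::printf("tile: %u x %u x %u\n", f.tile_m, f.tile_n, f.tile_k);
  return 0;
}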