From b01db1e7324b052766420b3692bfe39b2e28db11 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 15 Oct 2021 16:10:35 -0600 Subject: [PATCH 01/18] batched/dense: Rework BatchedDblBufGemm - Rework handling of partial rows/cols. Rather than increasing the league_size, use 1 more thread per team for each REG_M/REG_N overstep of tile_m/tile_n. --- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 7 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 127 ++++++++++++------ 2 files changed, 86 insertions(+), 48 deletions(-) diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index 0f5afcc6aa..f3c5ba5b5a 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -463,10 +463,9 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { // // TODO: invoke TeamShmem // } else - if (on_gpu && - ((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || (c_m >= 45 && c_m <= 64))) { + if (on_gpu && ((std::is_same::value) + ? (c_m >= 16) + : (c_m >= 24))) { handle->teamSz = handle->vecLen = 8; constexpr int tile_m = 32, tile_n = 32, tile_k = 8; if (c_m % 32 == 0) // No bounds checking diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index f8fc55f5b2..ae3a1b61a2 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -130,10 +130,15 @@ class BatchedDblBufGemm { } // Each team solves a single tile. Within each tile, the team solves - // all __n_tile_k_tiles one at a time. + // all n_sub_tiles, one at a time. size_t league_size = __c_batch_size * functor.get_n_sub_tiles(); - int team_size = stride_m; - int vector_len = stride_n; + // TODO: determine max_team_size and max_vector_len here instead of using 32 + // and 16 + int team_size = + std::min(stride_m + functor.n_extra_threads(), (unsigned)32); + // TODO: why are 2x vector lanes needed rather than just 1 more? + int vector_len = + std::min(stride_n * (functor.n_extra_vlanes() + 1), (unsigned)16); const int max_team_size = policy_type(league_size, Kokkos::AUTO, vector_len) @@ -165,14 +170,18 @@ class BatchedDblBufGemm { << " team_size:" << team_size << std::endl << "max_vector_len:" << max_vector_len << " vector_len:" << vector_len << std::endl + << " league_size:" << league_size << std::endl << "TILE_M:" << TILE_M << std::endl << "TILE_N:" << TILE_N << std::endl << "TILE_K:" << TILE_K << std::endl; } - // TODO: Use statically allocated shmem - int shmem_size = view_type_2d_scratch::shmem_size(TILE_M, TILE_K) + - view_type_2d_scratch::shmem_size(TILE_K, TILE_N); + // NOTE: All but shmem_size args but partial tile sizes can be determined at + // compile time. 
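// A worked example of the sizing above and below (illustrative values only,
// not fixed by this patch): with the 32x32x8 tiles chosen in
// KokkosBatched_Gemm_Decl.hpp, REG_M = REG_N = 4, stride_m = stride_n = 8,
// and a 42 x 42 C block:
//   partial_tile_m  = REG_M * ((42 - 32) / REG_M + 1)   = 12
//   n_extra_threads = partial_tile_m / REG_M            = 3
//   team_size       = min(stride_m + 3, 32)             = 11
//   vector_len      = min(stride_n * (3 + 1), 16)       = 16
//   svA scratch     = (TILE_M + 12) x TILE_K = 44 x 8 elements
//   svB scratch     = TILE_K x (TILE_N + 12) =  8 x 44 elements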
+ int shmem_size = view_type_2d_scratch::shmem_size( + TILE_M + functor.get_partial_tile_m(), TILE_K) + + view_type_2d_scratch::shmem_size( + TILE_K, TILE_N + functor.get_partial_tile_n()); // Each member solves a portion of TILE_K in parallel with other members policy_type team_policy(league_size, team_size, vector_len); @@ -194,10 +203,15 @@ class BatchedDblBufGemm { ScalarType __alpha, __beta; int __k; size_t __n_tile_k_tiles, __n_sub_tiles; - unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row; + unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row, + __partial_tile_m, __partial_tile_n, __stride_m, __stride_n, __ts, __vl; public: size_t get_n_sub_tiles() { return __n_sub_tiles; } + unsigned get_partial_tile_m() { return __partial_tile_m; } + unsigned get_partial_tile_n() { return __partial_tile_n; } + unsigned n_extra_threads() { return __partial_tile_m / REG_M; } + unsigned n_extra_vlanes() { return __partial_tile_n / REG_N; } // NOTE: We cannot use __ei.{__A,__B,__C,__beta,__alpha,__k} in the operator // below. If those are used, we get an invalid memory error from cuda. I @@ -231,17 +245,44 @@ class BatchedDblBufGemm { } __beta = ei.__beta; // Copy to device __alpha = ei.__alpha; // Copy to device - // To handle truncation of tiles per row/col, round up to one extra tile - // with '!!'. This extra tile will hang off the edge of the 2-rank matrix. - // For cases where tiles hang off the edge, we over-compute 0s within - // registers via a conditional bounds check selected at compile-time. - __tiles_per_row = ei.__c_m / __tile_m + !!((unsigned)ei.__c_m % __tile_m); - __tiles_per_col = ei.__c_n / __tile_n + !!((unsigned)ei.__c_n % __tile_n); + // To handle cases where ei.__c_{m,n} % __tile_{m,n} != 0 without simply + // multiplying by zeros, we can adjust the scratch space size in + // increments of REG_M / REG_M as well as the team / vector sizes up to 2 + // * cur_team_size - 1. For each additional thread, we must increase + // STRIDE_M by 1. For each additional vlane, we must increase STRIDE_N by + // 1. Note that each thead can handle REG_M extra rows and each vlane can + // handle REG_N extra cols. + __tiles_per_row = ei.__c_m / __tile_m; + __tiles_per_col = ei.__c_n / __tile_n; + + // Each partial_tile is a multiple of REG_{M,N} since each thread holds + // register buffers of size REG_{M,N}. + __partial_tile_m = ei.__c_m % __tile_m && ei.__c_m > __tile_m + ? REG_M * ((ei.__c_m - __tile_m) / REG_M + 1) + : 0; + __partial_tile_n = ei.__c_n % __tile_n && ei.__c_n > __tile_n + ? REG_N * ((ei.__c_n - __tile_n) / REG_N + 1) + : 0; + + __stride_m = STRIDE_M + n_extra_threads(); + __stride_n = STRIDE_N + n_extra_vlanes(); + + __ts = __tile_n / REG_N + n_extra_threads(); + __vl = __tile_n / REG_N + n_extra_vlanes(); // To handle truncation of __n_tile_k_tile, we have logic within the // operator for handling a partial __tile_k tile. 
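// For instance (illustrative numbers only): with __k = 20 and __tile_k = 8,
//   __n_tile_k_tiles = 20 / 8 = 2
// and the remaining 20 - 2 * 8 = 4 k-iterations are picked up by that
// partial-__tile_k logic rather than by a third full k tile.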
__n_tile_k_tiles = __k / __tile_k; __n_sub_tiles = __tiles_per_row * __tiles_per_col; + + if (ei.__handle->enableDebug) { + std::cout << "__partial_tile_m:" << __partial_tile_m << std::endl + << "__partial_tile_n:" << __partial_tile_n << std::endl + << "__stride_m:" << __stride_m << std::endl + << "__stride_n:" << __stride_n << std::endl + << "__ts:" << __ts << std::endl + << "__vl:" << __vl << std::endl; + } } KOKKOS_INLINE_FUNCTION @@ -253,10 +294,9 @@ class BatchedDblBufGemm { view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, tile_m / REG_M), - [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, tile_n / REG_N), + Kokkos::ThreadVectorRange(member, 0, __vl), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -266,13 +306,13 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(k, thread_id + m * STRIDE_M); + reg_a[m] = svA_scr(k, thread_id + m * __stride_m); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + reg_b[n] = svB_scr(k, vlane_id + n * __stride_n); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -314,14 +354,15 @@ class BatchedDblBufGemm { __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching - view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, __tile_m); - view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, __tile_n); + view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, + __tile_m + __partial_tile_m); + view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, + __tile_n + __partial_tile_n); // Here we populate scratch memory with one or more "k" tiles for every // thread of the team! Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), - [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), @@ -329,7 +370,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) + for (int i = 0; i < REG_N * __stride_n; i += __stride_n) svB_scr(vlane_id, thread_id + i) = access_view_bounds_check( svB, vlane_id, thread_offset + i, @@ -337,8 +378,7 @@ class BatchedDblBufGemm { }); }); Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), - [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), @@ -346,7 +386,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + for (int i = 0; i < REG_M * __stride_m; i += __stride_m) svA_scr(vlane_id, thread_id + i) = access_view_bounds_check( svA, thread_offset + i, vlane_id, @@ -373,7 +413,7 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. 
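// In outline, each pass of this k-tile loop does the following (a paraphrase
// of the surrounding code, not new logic):
//   1. prefetch the next k tile of B and A from global memory into
//      per-thread registers (bounds-checked loads);
//   2. __rshmem_and_mult: multiply the k tile currently held in scratch
//      into reg_c;
//   3. member.team_barrier() so no one is still reading scratch;
//   4. flush prefetch_reg_b / prefetch_reg_a into svB_scr / svA_scr;
//   5. member.team_barrier() so scratch now holds the next tile;
// after the loop, one more __rshmem_and_mult covers the last k tile
// (including the partial-__tile_k case) before reg_c is written to C.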
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( @@ -386,7 +426,7 @@ class BatchedDblBufGemm { prefetch_reg_b[i] = access_view_bounds_check( svB, vlane_id + k_tile_offset, - thread_offset + i * STRIDE_N, + thread_offset + i * __stride_n, __ei.__bounds_check_tag); }); }); @@ -395,7 +435,7 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id + start_m; Kokkos::parallel_for( @@ -407,7 +447,7 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + i * STRIDE_M, + svA, thread_offset + i * __stride_m, vlane_id + k_tile_offset, __ei.__bounds_check_tag); }); @@ -424,7 +464,7 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_a. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( @@ -434,7 +474,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(vlane_id, thread_offset + i * STRIDE_N) = + svB_scr(vlane_id, thread_offset + i * __stride_n) = prefetch_reg_b[i]; }); }); @@ -442,7 +482,7 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_b. 
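// To make the STRIDE-based ownership concrete (illustrative values, using the
// 32x32x8 tiles from above, REG_M = 4, STRIDE_M = 8, and no partial tile):
//   tile row of thread t, register entry m:  t + m * STRIDE_M
// so thread 0 owns tile rows {0, 8, 16, 24}, thread 1 owns {1, 9, 17, 25},
// ..., thread 7 owns {7, 15, 23, 31}; vector lanes are interleaved over the
// tile columns the same way via STRIDE_N. The same pattern is used by the
// register prefetches above and by the reads in __rshmem_and_mult.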
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( @@ -452,7 +492,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(vlane_id, thread_offset + i * STRIDE_M) = + svA_scr(vlane_id, thread_offset + i * __stride_m) = prefetch_reg_a[i]; }); }); @@ -468,26 +508,26 @@ class BatchedDblBufGemm { __rshmem_and_mult(member, partial_tile_k, __tile_m, __tile_n, reg_a, reg_b, reg_c, svA_scr, svB_scr); + // store results back to global memory if (__beta == 0.0F) { - // store results back to global memory Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), + Kokkos::ThreadVectorRange(member, 0, __vl), [&](const int &vlane_id) { auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) { - int cm = thread_m_offset + m * STRIDE_M; + int cm = thread_m_offset + m * __stride_m; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) { - int cn = thread_n_offset + n * STRIDE_N; + int cn = thread_n_offset + n * __stride_n; fma_bounds_check(svC, cm, cn, reg_c[m][n], __ei.__bounds_check_tag); } @@ -495,25 +535,24 @@ class BatchedDblBufGemm { }); }); } else { - // store results back to global memory Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), + Kokkos::ThreadVectorRange(member, 0, __vl), [&](const int &vlane_id) { auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) { - int cm = thread_m_offset + m * STRIDE_M; + int cm = thread_m_offset + m * __stride_m; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) { - int cn = thread_n_offset + n * STRIDE_N; + int cn = thread_n_offset + n * __stride_n; fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, __ei.__bounds_check_tag); } From 7dc1d60d3613924eac85fca331d87bab92cbd303 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 21 Oct 2021 13:11:19 -0600 Subject: [PATCH 02/18] Revert "batched/dense: Rework BatchedDblBufGemm" This reverts commit b01db1e7324b052766420b3692bfe39b2e28db11. --- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 7 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 127 ++++++------------ 2 files changed, 48 insertions(+), 86 deletions(-) diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index f3c5ba5b5a..0f5afcc6aa 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -463,9 +463,10 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { // // TODO: invoke TeamShmem // } else - if (on_gpu && ((std::is_same::value) - ? 
(c_m >= 16) - : (c_m >= 24))) { + if (on_gpu && + ((std::is_same::value) + ? (c_m >= 16) + : (c_m >= 24 && c_m <= 32) || (c_m >= 45 && c_m <= 64))) { handle->teamSz = handle->vecLen = 8; constexpr int tile_m = 32, tile_n = 32, tile_k = 8; if (c_m % 32 == 0) // No bounds checking diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index ae3a1b61a2..f8fc55f5b2 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -130,15 +130,10 @@ class BatchedDblBufGemm { } // Each team solves a single tile. Within each tile, the team solves - // all n_sub_tiles, one at a time. + // all __n_tile_k_tiles one at a time. size_t league_size = __c_batch_size * functor.get_n_sub_tiles(); - // TODO: determine max_team_size and max_vector_len here instead of using 32 - // and 16 - int team_size = - std::min(stride_m + functor.n_extra_threads(), (unsigned)32); - // TODO: why are 2x vector lanes needed rather than just 1 more? - int vector_len = - std::min(stride_n * (functor.n_extra_vlanes() + 1), (unsigned)16); + int team_size = stride_m; + int vector_len = stride_n; const int max_team_size = policy_type(league_size, Kokkos::AUTO, vector_len) @@ -170,18 +165,14 @@ class BatchedDblBufGemm { << " team_size:" << team_size << std::endl << "max_vector_len:" << max_vector_len << " vector_len:" << vector_len << std::endl - << " league_size:" << league_size << std::endl << "TILE_M:" << TILE_M << std::endl << "TILE_N:" << TILE_N << std::endl << "TILE_K:" << TILE_K << std::endl; } - // NOTE: All but shmem_size args but partial tile sizes can be determined at - // compile time. - int shmem_size = view_type_2d_scratch::shmem_size( - TILE_M + functor.get_partial_tile_m(), TILE_K) + - view_type_2d_scratch::shmem_size( - TILE_K, TILE_N + functor.get_partial_tile_n()); + // TODO: Use statically allocated shmem + int shmem_size = view_type_2d_scratch::shmem_size(TILE_M, TILE_K) + + view_type_2d_scratch::shmem_size(TILE_K, TILE_N); // Each member solves a portion of TILE_K in parallel with other members policy_type team_policy(league_size, team_size, vector_len); @@ -203,15 +194,10 @@ class BatchedDblBufGemm { ScalarType __alpha, __beta; int __k; size_t __n_tile_k_tiles, __n_sub_tiles; - unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row, - __partial_tile_m, __partial_tile_n, __stride_m, __stride_n, __ts, __vl; + unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row; public: size_t get_n_sub_tiles() { return __n_sub_tiles; } - unsigned get_partial_tile_m() { return __partial_tile_m; } - unsigned get_partial_tile_n() { return __partial_tile_n; } - unsigned n_extra_threads() { return __partial_tile_m / REG_M; } - unsigned n_extra_vlanes() { return __partial_tile_n / REG_N; } // NOTE: We cannot use __ei.{__A,__B,__C,__beta,__alpha,__k} in the operator // below. If those are used, we get an invalid memory error from cuda. I @@ -245,44 +231,17 @@ class BatchedDblBufGemm { } __beta = ei.__beta; // Copy to device __alpha = ei.__alpha; // Copy to device - // To handle cases where ei.__c_{m,n} % __tile_{m,n} != 0 without simply - // multiplying by zeros, we can adjust the scratch space size in - // increments of REG_M / REG_M as well as the team / vector sizes up to 2 - // * cur_team_size - 1. For each additional thread, we must increase - // STRIDE_M by 1. For each additional vlane, we must increase STRIDE_N by - // 1. 
Note that each thead can handle REG_M extra rows and each vlane can - // handle REG_N extra cols. - __tiles_per_row = ei.__c_m / __tile_m; - __tiles_per_col = ei.__c_n / __tile_n; - - // Each partial_tile is a multiple of REG_{M,N} since each thread holds - // register buffers of size REG_{M,N}. - __partial_tile_m = ei.__c_m % __tile_m && ei.__c_m > __tile_m - ? REG_M * ((ei.__c_m - __tile_m) / REG_M + 1) - : 0; - __partial_tile_n = ei.__c_n % __tile_n && ei.__c_n > __tile_n - ? REG_N * ((ei.__c_n - __tile_n) / REG_N + 1) - : 0; - - __stride_m = STRIDE_M + n_extra_threads(); - __stride_n = STRIDE_N + n_extra_vlanes(); - - __ts = __tile_n / REG_N + n_extra_threads(); - __vl = __tile_n / REG_N + n_extra_vlanes(); + // To handle truncation of tiles per row/col, round up to one extra tile + // with '!!'. This extra tile will hang off the edge of the 2-rank matrix. + // For cases where tiles hang off the edge, we over-compute 0s within + // registers via a conditional bounds check selected at compile-time. + __tiles_per_row = ei.__c_m / __tile_m + !!((unsigned)ei.__c_m % __tile_m); + __tiles_per_col = ei.__c_n / __tile_n + !!((unsigned)ei.__c_n % __tile_n); // To handle truncation of __n_tile_k_tile, we have logic within the // operator for handling a partial __tile_k tile. __n_tile_k_tiles = __k / __tile_k; __n_sub_tiles = __tiles_per_row * __tiles_per_col; - - if (ei.__handle->enableDebug) { - std::cout << "__partial_tile_m:" << __partial_tile_m << std::endl - << "__partial_tile_n:" << __partial_tile_n << std::endl - << "__stride_m:" << __stride_m << std::endl - << "__stride_n:" << __stride_n << std::endl - << "__ts:" << __ts << std::endl - << "__vl:" << __vl << std::endl; - } } KOKKOS_INLINE_FUNCTION @@ -294,9 +253,10 @@ class BatchedDblBufGemm { view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, tile_m / REG_M), + [&](const int &thread_id) { Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __vl), + Kokkos::ThreadVectorRange(member, 0, tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -306,13 +266,13 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(k, thread_id + m * __stride_m); + reg_a[m] = svA_scr(k, thread_id + m * STRIDE_M); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * __stride_n); + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -354,15 +314,14 @@ class BatchedDblBufGemm { __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching - view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, - __tile_m + __partial_tile_m); - view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, - __tile_n + __partial_tile_n); + view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, __tile_m); + view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, __tile_n); // Here we populate scratch memory with one or more "k" tiles for every // thread of the team! 
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), @@ -370,7 +329,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * __stride_n; i += __stride_n) + for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) svB_scr(vlane_id, thread_id + i) = access_view_bounds_check( svB, vlane_id, thread_offset + i, @@ -378,7 +337,8 @@ class BatchedDblBufGemm { }); }); Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), [&](const int &thread_id) { + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), + [&](const int &thread_id) { auto thread_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), @@ -386,7 +346,7 @@ class BatchedDblBufGemm { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * __stride_m; i += __stride_m) + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) svA_scr(vlane_id, thread_id + i) = access_view_bounds_check( svA, thread_offset + i, vlane_id, @@ -413,7 +373,7 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( @@ -426,7 +386,7 @@ class BatchedDblBufGemm { prefetch_reg_b[i] = access_view_bounds_check( svB, vlane_id + k_tile_offset, - thread_offset + i * __stride_n, + thread_offset + i * STRIDE_N, __ei.__bounds_check_tag); }); }); @@ -435,7 +395,7 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { auto thread_offset = thread_id + start_m; Kokkos::parallel_for( @@ -447,7 +407,7 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + i * __stride_m, + svA, thread_offset + i * STRIDE_M, vlane_id + k_tile_offset, __ei.__bounds_check_tag); }); @@ -464,7 +424,7 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_a. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( @@ -474,7 +434,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(vlane_id, thread_offset + i * __stride_n) = + svB_scr(vlane_id, thread_offset + i * STRIDE_N) = prefetch_reg_b[i]; }); }); @@ -482,7 +442,7 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_b. 
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( @@ -492,7 +452,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(vlane_id, thread_offset + i * __stride_m) = + svA_scr(vlane_id, thread_offset + i * STRIDE_M) = prefetch_reg_a[i]; }); }); @@ -508,26 +468,26 @@ class BatchedDblBufGemm { __rshmem_and_mult(member, partial_tile_k, __tile_m, __tile_n, reg_a, reg_b, reg_c, svA_scr, svB_scr); - // store results back to global memory if (__beta == 0.0F) { + // store results back to global memory Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __vl), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) { - int cm = thread_m_offset + m * __stride_m; + int cm = thread_m_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) { - int cn = thread_n_offset + n * __stride_n; + int cn = thread_n_offset + n * STRIDE_N; fma_bounds_check(svC, cm, cn, reg_c[m][n], __ei.__bounds_check_tag); } @@ -535,24 +495,25 @@ class BatchedDblBufGemm { }); }); } else { + // store results back to global memory Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __ts), + Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __vl), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) { - int cm = thread_m_offset + m * __stride_m; + int cm = thread_m_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) { - int cn = thread_n_offset + n * __stride_n; + int cn = thread_n_offset + n * STRIDE_N; fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, __ei.__bounds_check_tag); } From f6ac57d615801e3d8fa8d7b1cab9cee13ace75da Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 21 Oct 2021 13:08:08 -0600 Subject: [PATCH 03/18] some experiments for improving dram utilization --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index f8fc55f5b2..384386113f 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -83,6 +83,7 @@ class BatchedDblBufGemm { typename execution_space_type::scratch_memory_space; using view_type_2d_scratch = Kokkos::View; + // TODO: add compile-time extents public: BatchedDblBufGemm(HandleType *const handle, ScalarType alpha, AViewType A, @@ -261,12 +262,16 @@ class BatchedDblBufGemm { #if 
defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (unsigned k = 0; k < nk; ++k) { + for (unsigned k = 0; k < nk; + ++k) { // TODO: would have to invert this for + // threadVectorRange copy TODOs below #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) + for (int m = 0; m < REG_M; + ++m) // TODO: this could be a threadVectorRange copy reg_a[m] = svA_scr(k, thread_id + m * STRIDE_M); + // TODO: reg_a could be a thread shared buffer #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -297,6 +302,9 @@ class BatchedDblBufGemm { // Allocate registers used for FMAs view_value_type reg_a[REG_M] = {0}, reg_b[REG_N] = {0}, reg_c[REG_M][REG_N] = {{0}}; + // TODO: look at local loads and stores via nvprof + // TODO: look at GPU trace in nvprof to find out how many registers are + // used. unsigned batch_idx = member.league_rank() / __n_sub_tiles; @@ -334,6 +342,13 @@ class BatchedDblBufGemm { access_view_bounds_check( svB, vlane_id, thread_offset + i, __ei.__bounds_check_tag); + // TODO: Use LayoutLeft here for contiguous access across + // vlanes?? + // or change indexing like this: + // access_view_bounds_check( + // svB, i, thread_offset + + // vlane_id, + // __ei.__bounds_check_tag); }); }); Kokkos::parallel_for( @@ -351,6 +366,7 @@ class BatchedDblBufGemm { access_view_bounds_check( svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); + // TODO: might be able to use local deep copy here. }); }); From d15eb7de80385aa925b7dcdc80ba4651cc3ed763 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 25 Oct 2021 14:54:22 -0600 Subject: [PATCH 04/18] Add some notes from talk with Vinh --- src/batched/KokkosBatched_Util.hpp | 4 ++++ src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 3 +++ 2 files changed, 7 insertions(+) diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index f13db02e62..e59f2146f4 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -826,6 +826,10 @@ KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { if (m < v.extent_int(0) && n < v.extent_int(1)) return v(m, n); return (ViewValueType)0.0F; + // TODO: use compile-time extents + // if (m > scr.extent(0) || n > scr.extent(1)) + // return 0; + // return v(m, n); } template diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 384386113f..52d936f453 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -296,6 +296,8 @@ class BatchedDblBufGemm { KOKKOS_INLINE_FUNCTION void operator()(const MemberType &member) const { + // TODO: use Kokkos view with compile-time size to allocating register?? + // Then we can use local deep copy for prefetch_reg population. // Allocate registers used for prefetching view_value_type prefetch_reg_a[REG_M] = {0}, prefetch_reg_b[REG_N] = {0}; @@ -365,6 +367,7 @@ class BatchedDblBufGemm { svA_scr(vlane_id, thread_id + i) = access_view_bounds_check( svA, thread_offset + i, vlane_id, + // svB, vlane_id, thread_offset + i, __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. 
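// One possible shape for that TODO, as a sketch only (the names below are
// illustrative, and svA_scr is stored k-major here, so the copy would need a
// transposed scratch layout or an explicit transpose):
//   auto svA_blk = Kokkos::subview(
//       svA, Kokkos::make_pair(start_m, start_m + (int)__tile_m),
//       Kokkos::make_pair(0, (int)__tile_k));
//   Kokkos::Experimental::local_deep_copy(member, svA_scr, svA_blk);
// i.e. one team-collective copy in place of the hand-written loop nest.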
}); From e35709b217c8b3810bb6b4c63e9e0126b3c23fb7 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 25 Oct 2021 14:58:16 -0600 Subject: [PATCH 05/18] Add some notes from talk with Vinh last week --- .../dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 52d936f453..f95f4517d5 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -58,6 +58,15 @@ namespace Impl { /// CT/NT, NT/CT, CT/CT /// +// TODO - scaling between (32x32, 64x64) +// Option 0: Increase number of tiles and figure out how to map kokkos teams +// into cuda grid. Keep team size and vector lanes constant. +// TODO: write up small example and ask Christian. [DONE, +// MdRangePolicy not applicable here] +// Option 1: Increase register sizes to handle rows/cols past tile size +// Option 2: Fix league_size and have single team solve full tile followed +// by same team solving extra rows/cols (without multiplying by the +// zero rows/cols) template Date: Thu, 28 Oct 2021 13:36:44 -0600 Subject: [PATCH 06/18] update strides to be 1 --- src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index f95f4517d5..54941355d1 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -123,8 +123,8 @@ class BatchedDblBufGemm { // TODO: check these expressions for all tile_m, tile_n, tile_k in Z+. constexpr int reg_m = TILE_M / TILE_K; constexpr int reg_n = TILE_N / TILE_K + 2 * !!(TILE_N % TILE_K); - constexpr int stride_m = TILE_K; - constexpr int stride_n = TILE_N / reg_n; + constexpr int stride_m = 1; + constexpr int stride_n = 1; using functor_type = Functor; functor_type functor(*this, __A, __B, __C, TILE_M, TILE_N, TILE_K); @@ -142,8 +142,8 @@ class BatchedDblBufGemm { // Each team solves a single tile. Within each tile, the team solves // all __n_tile_k_tiles one at a time. size_t league_size = __c_batch_size * functor.get_n_sub_tiles(); - int team_size = stride_m; - int vector_len = stride_n; + int team_size = TILE_K; + int vector_len = TILE_N / reg_n; const int max_team_size = policy_type(league_size, Kokkos::AUTO, vector_len) From fdb5a6ba8544f4bcf80223c0944024a5619f92f5 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 28 Oct 2021 13:58:44 -0600 Subject: [PATCH 07/18] Update inital B scratch pre-fetch --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 54941355d1..84c516e6b5 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -339,27 +339,22 @@ class BatchedDblBufGemm { // Here we populate scratch memory with one or more "k" tiles for every // thread of the team! 
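// Context for the rework below (part of this series' dram-utilization
// experiments): moving the ThreadVectorRange from the k index to the n index
// keeps a thread's vector lanes within a row of B rather than spread down a
// column, which is the friendlier pattern for a row-major (LayoutRight) B.
// Illustrative picture only, not code from the patch:
//   before: lane v loads svB(v, n)              -- lanes span __tile_k rows
//   after : lane v loads svB(k, n0 + v * REG_N) -- lanes stay within row k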
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_k), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(vlane_id, thread_id + i) = + svB_scr(thread_id, vlane_id * REG_N + i) = access_view_bounds_check( - svB, vlane_id, thread_offset + i, + svB, thread_offset, vlane_id * REG_N + i, __ei.__bounds_check_tag); - // TODO: Use LayoutLeft here for contiguous access across - // vlanes?? - // or change indexing like this: - // access_view_bounds_check( - // svB, i, thread_offset + - // vlane_id, - // __ei.__bounds_check_tag); + // TODO: use svB_scr(thread_id + i, vlane_id) to stride + // accesses to shared memory }); }); Kokkos::parallel_for( From 0432f347e5e52f596b57414aa549b0269822bccd Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 28 Oct 2021 14:42:08 -0600 Subject: [PATCH 08/18] WIP: Update initial A scratch pre-fetch --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 84c516e6b5..a2ad548605 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -333,7 +333,7 @@ class BatchedDblBufGemm { __ei.__batch_layout_tag); // Allocate scratch memory buffers used for prefetching - view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_k, __tile_m); + view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_m, __tile_k); view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, __tile_n); // Here we populate scratch memory with one or more "k" tiles for every @@ -368,14 +368,25 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(vlane_id, thread_id + i) = + svA_scr(thread_id EXPR, (vlane_id % 2) * REG_M + i) = access_view_bounds_check( - svA, thread_offset + i, vlane_id, - // svB, vlane_id, thread_offset + i, + svA, thread_offset EXPR, (vlane_id % 2) * REG_M + i, __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. 
}); }); + // Kokkos::parallel_for( + // Kokkos::TeamVectorRange(member, 0, (__tile_m / REG_M) * + // __tile_k), + // [&](const int vlane_id) { + // for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) { + // svA_scr(vlane_id / __tile_k, vlane_id / REG_M + i) = + // access_view_bounds_check( + // svA, vlane_id / __tile_m + start_m, vlane_id / + // REG_M + i, + // __ei.__bounds_check_tag); + // } + // }); // Check whether we have a partial tile unsigned partial_tile_k = __k - (__n_tile_k_tiles * __tile_k); From d86006cdc02e35eb8285eb67cee27ee619d898ea Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Fri, 29 Oct 2021 16:52:04 -0600 Subject: [PATCH 09/18] DblBuf: Checkpoint, contig indexing --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 82 +++++++++---------- 1 file changed, 41 insertions(+), 41 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index a2ad548605..0467b8e44f 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -262,31 +262,35 @@ class BatchedDblBufGemm { view_value_type reg_c[REG_M][REG_N], view_type_2d_scratch &svA_scr, view_type_2d_scratch &svB_scr) const { + // view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_m, + // __tile_k); view_type_2d_scratch svB_scr(member.team_scratch(0), + // __tile_k, __tile_n); Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, tile_m / REG_M), [&](const int &thread_id) { + auto thread_offset = thread_id * REG_M; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (unsigned k = 0; k < nk; - ++k) { // TODO: would have to invert this for - // threadVectorRange copy TODOs below + // TODO: would have to invert this for + // threadVectorRange copy TODOs below + for (unsigned k = 0; k < nk; ++k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; - ++m) // TODO: this could be a threadVectorRange copy - reg_a[m] = svA_scr(k, thread_id + m * STRIDE_M); + for (int m = 0; m < REG_M; ++m) + // TODO: this could be a threadVectorRange copy + reg_a[m] = svA_scr(thread_offset + m * STRIDE_M, k); // TODO: reg_a could be a thread shared buffer #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + reg_b[n] = svB_scr(k, vlane_id * REG_N + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -349,7 +353,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id, vlane_id * REG_N + i) = + svB_scr(thread_id + i, vlane_id * REG_N + i) = access_view_bounds_check( svB, thread_offset, vlane_id * REG_N + i, __ei.__bounds_check_tag); @@ -360,7 +364,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id + start_m; + auto thread_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { @@ -368,25 +372,21 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - 
svA_scr(thread_id EXPR, (vlane_id % 2) * REG_M + i) = + svA_scr(thread_id * REG_M + (vlane_id / 2), + (vlane_id % 2) * REG_M + i) = access_view_bounds_check( - svA, thread_offset EXPR, (vlane_id % 2) * REG_M + i, + svA, thread_offset + (vlane_id / 2), + (vlane_id % 2) * REG_M + i, __ei.__bounds_check_tag); + // svA_scr(thread_id, (vlane_id % 2) * + // REG_M + i) = + // access_view_bounds_check( + // svA, thread_offset, (vlane_id % + // 2) * REG_M + i, + // __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. }); }); - // Kokkos::parallel_for( - // Kokkos::TeamVectorRange(member, 0, (__tile_m / REG_M) * - // __tile_k), - // [&](const int vlane_id) { - // for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) { - // svA_scr(vlane_id / __tile_k, vlane_id / REG_M + i) = - // access_view_bounds_check( - // svA, vlane_id / __tile_m + start_m, vlane_id / - // REG_M + i, - // __ei.__bounds_check_tag); - // } - // }); // Check whether we have a partial tile unsigned partial_tile_k = __k - (__n_tile_k_tiles * __tile_k); @@ -407,11 +407,11 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { auto thread_offset = thread_id + start_n; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_k), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -419,9 +419,9 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_N; ++i) prefetch_reg_b[i] = access_view_bounds_check( - svB, vlane_id + k_tile_offset, - thread_offset + i * STRIDE_N, - __ei.__bounds_check_tag); + svB, thread_offset + k_tile_offset, + vlane_id * REG_N + i, __ei.__bounds_check_tag); + // TODO: use svB_scr(thread_id + i, vlane_id) to stride }); }); @@ -431,7 +431,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id + start_m; + auto thread_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { @@ -441,8 +441,8 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + i * STRIDE_M, - vlane_id + k_tile_offset, + svA, thread_offset + (vlane_id / 2), + (vlane_id % 2) * REG_M + i + k_tile_offset, __ei.__bounds_check_tag); }); }); @@ -458,17 +458,17 @@ class BatchedDblBufGemm { // populate shmem from prefetch registers. Each thread has its own copy // of prefetch_reg_a. 
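// The two indexing schemes this series moves between, in terms of which tile
// rows a thread's REG_M register entries map to (illustrative values,
// REG_M = 4, STRIDE_M = 8, __tile_m = 32):
//   blocked    : row = thread_id * REG_M + m     -> thread 1 owns rows 4..7
//   interleaved: row = thread_id + m * STRIDE_M  -> thread 1 owns rows 1, 9, 17, 25
// (the later "Restore strides" commits return to the interleaved form).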
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_n / REG_N), + Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { auto thread_offset = thread_id; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_k), + Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(vlane_id, thread_offset + i * STRIDE_N) = + svB_scr(thread_id, vlane_id * REG_N + i) = prefetch_reg_b[i]; }); }); @@ -478,7 +478,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id; + auto thread_offset = thread_id * REG_M; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { @@ -486,8 +486,8 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(vlane_id, thread_offset + i * STRIDE_M) = - prefetch_reg_a[i]; + svA_scr(thread_offset + (vlane_id / 2), + (vlane_id % 2) * REG_M + i) = prefetch_reg_a[i]; }); }); @@ -507,11 +507,11 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_m_offset = thread_id + start_m; + auto thread_m_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto thread_n_offset = vlane_id + start_n; + auto thread_n_offset = vlane_id * REG_N + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -533,11 +533,11 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_m_offset = thread_id + start_m; + auto thread_m_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto thread_n_offset = vlane_id + start_n; + auto thread_n_offset = vlane_id * REG_N + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL From bd7e63a6b24d54153e864d1fbc02d4a48702e4e4 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 2 Nov 2021 12:17:46 -0600 Subject: [PATCH 10/18] DblBuf: Checkpoint, fix typo & cleanup --- .../dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 0467b8e44f..80f97d9946 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -353,12 +353,10 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id + i, vlane_id * REG_N + i) = + svB_scr(thread_id, vlane_id * REG_N + i) = access_view_bounds_check( svB, thread_offset, vlane_id * REG_N + i, __ei.__bounds_check_tag); - // TODO: use svB_scr(thread_id + i, vlane_id) to stride - // accesses to shared memory }); }); Kokkos::parallel_for( @@ -378,12 +376,6 @@ class BatchedDblBufGemm { svA, thread_offset + (vlane_id / 2), (vlane_id % 2) * REG_M + i, __ei.__bounds_check_tag); - // svA_scr(thread_id, (vlane_id 
% 2) * - // REG_M + i) = - // access_view_bounds_check( - // svA, thread_offset, (vlane_id % - // 2) * REG_M + i, - // __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. }); }); @@ -421,7 +413,6 @@ class BatchedDblBufGemm { access_view_bounds_check( svB, thread_offset + k_tile_offset, vlane_id * REG_N + i, __ei.__bounds_check_tag); - // TODO: use svB_scr(thread_id + i, vlane_id) to stride }); }); @@ -460,7 +451,6 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { - auto thread_offset = thread_id; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { From 27ecd4db6538ba79943ef1699e70cbdbe473b846 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Nov 2021 09:25:47 -0600 Subject: [PATCH 11/18] DblBuf: Fix start_n usage - Also refactor pre-fetching loops to avoid duplicate index calculations. --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 80f97d9946..0d8f2a2ff3 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -345,17 +345,17 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_k), [&](const int &thread_id) { - auto thread_offset = thread_id + start_n; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { + auto vlane_offset = vlane_id * REG_N + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) svB_scr(thread_id, vlane_id * REG_N + i) = access_view_bounds_check( - svB, thread_offset, vlane_id * REG_N + i, + svB, thread_id, vlane_offset + i, __ei.__bounds_check_tag); }); }); @@ -366,6 +366,8 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { + auto vld = (vlane_id / 2); + auto vlane_offset = (vlane_id % 2) * REG_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -373,8 +375,7 @@ class BatchedDblBufGemm { svA_scr(thread_id * REG_M + (vlane_id / 2), (vlane_id % 2) * REG_M + i) = access_view_bounds_check( - svA, thread_offset + (vlane_id / 2), - (vlane_id % 2) * REG_M + i, + svA, thread_offset + vld, vlane_offset + i, __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. }); @@ -399,20 +400,21 @@ class BatchedDblBufGemm { // Each thread has its own copy of prefetch_reg_b. TeamThreadRange runs // over all threads in the team. 
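// Note on the refactor below: Kokkos::TeamThreadRange also takes a
// (begin, end) pair, so the k offset can be folded into the range itself.
// Illustrative values only (__tile_k = 8, k_tile_offset = 24):
//   Kokkos::TeamThreadRange(member, 24, 32)  // thread_offset runs over 24..31,
//                                            // so no "+ k_tile_offset" is
//                                            // needed inside the lambda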
Kokkos::parallel_for( - Kokkos::TeamThreadRange(member, 0, __tile_k), - [&](const int &thread_id) { - auto thread_offset = thread_id + start_n; + Kokkos::TeamThreadRange(member, k_tile_offset, + k_tile_offset + __tile_k), + [&](const int &thread_offset) { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { + auto vlane_offset = vlane_id * REG_N + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) prefetch_reg_b[i] = access_view_bounds_check( - svB, thread_offset + k_tile_offset, - vlane_id * REG_N + i, __ei.__bounds_check_tag); + svB, thread_offset, vlane_offset + i, + __ei.__bounds_check_tag); }); }); @@ -426,14 +428,15 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { + auto vld = (vlane_id / 2); + auto vlane_offset = (vlane_id % 2) * REG_M + k_tile_offset; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + (vlane_id / 2), - (vlane_id % 2) * REG_M + i + k_tile_offset, + svA, thread_offset + vld, vlane_offset + i, __ei.__bounds_check_tag); }); }); From 8df38fcc7e092d4f61de21b6680f73cfb74d899f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Nov 2021 14:00:43 -0600 Subject: [PATCH 12/18] DblBuf: Restore strides. - Initial pass at pre-fetch loops for 0th k-tile. --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 31 +++++++++---------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 0d8f2a2ff3..374e2e66ab 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -123,8 +123,8 @@ class BatchedDblBufGemm { // TODO: check these expressions for all tile_m, tile_n, tile_k in Z+. constexpr int reg_m = TILE_M / TILE_K; constexpr int reg_n = TILE_N / TILE_K + 2 * !!(TILE_N % TILE_K); - constexpr int stride_m = 1; - constexpr int stride_n = 1; + constexpr int stride_m = TILE_K; + constexpr int stride_n = TILE_N / reg_n; using functor_type = Functor; functor_type functor(*this, __A, __B, __C, TILE_M, TILE_N, TILE_K); @@ -142,8 +142,8 @@ class BatchedDblBufGemm { // Each team solves a single tile. Within each tile, the team solves // all __n_tile_k_tiles one at a time. 
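// A worked example of the launch geometry below (illustrative sizes only):
// a batch of 1000 matrices with c_m = c_n = 64 and the 32x32x8 tiles gives
//   tiles_per_row = tiles_per_col = 64 / 32 = 2,   n_sub_tiles = 4,
//   league_size   = 1000 * 4 = 4000 teams,
// each with team_size = stride_m (= TILE_K = 8) and vector_len = stride_n
// (= TILE_N / reg_n = 8) as restored by this commit.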
size_t league_size = __c_batch_size * functor.get_n_sub_tiles(); - int team_size = TILE_K; - int vector_len = TILE_N / reg_n; + int team_size = stride_m; + int vector_len = stride_n; const int max_team_size = policy_type(league_size, Kokkos::AUTO, vector_len) @@ -348,12 +348,12 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto vlane_offset = vlane_id * REG_N + start_n; + auto vlane_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id, vlane_id * REG_N + i) = + svB_scr(thread_id, vlane_id + i) = access_view_bounds_check( svB, thread_id, vlane_offset + i, __ei.__bounds_check_tag); @@ -366,16 +366,13 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { - auto vld = (vlane_id / 2); - auto vlane_offset = (vlane_id % 2) * REG_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(thread_id * REG_M + (vlane_id / 2), - (vlane_id % 2) * REG_M + i) = + for (int i = 0; i < REG_M; i++) + svA_scr(thread_id * REG_M + i, vlane_id) = access_view_bounds_check( - svA, thread_offset + vld, vlane_offset + i, + svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); // TODO: might be able to use local deep copy here. }); @@ -413,7 +410,7 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_N; ++i) prefetch_reg_b[i] = access_view_bounds_check( - svB, thread_offset, vlane_offset + i, + svB, thread_offset, vlane_offset + i * STRIDE_N, __ei.__bounds_check_tag); }); }); @@ -436,7 +433,8 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + vld, vlane_offset + i, + svA, thread_offset + vld, + vlane_offset + i * STRIDE_N, __ei.__bounds_check_tag); }); }); @@ -461,7 +459,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id, vlane_id * REG_N + i) = + svB_scr(thread_id, vlane_id * REG_N + i * STRIDE_N) = prefetch_reg_b[i]; }); }); @@ -480,7 +478,8 @@ class BatchedDblBufGemm { #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) svA_scr(thread_offset + (vlane_id / 2), - (vlane_id % 2) * REG_M + i) = prefetch_reg_a[i]; + (vlane_id % 2) * REG_M + i * STRIDE_M) = + prefetch_reg_a[i]; }); }); From aded516c278dce2d837c505e3b1c2df486f25d1f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 3 Nov 2021 15:33:21 -0600 Subject: [PATCH 13/18] DblBuf: Restore strides. - Second pass at pre-fetch loops for all k tiles. 
--- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 374e2e66ab..0d33f9581a 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -283,14 +283,14 @@ class BatchedDblBufGemm { #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) // TODO: this could be a threadVectorRange copy - reg_a[m] = svA_scr(thread_offset + m * STRIDE_M, k); + reg_a[m] = svA_scr(thread_offset + m, k); // TODO: reg_a could be a thread shared buffer #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id * REG_N + n * STRIDE_N); + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll @@ -423,18 +423,16 @@ class BatchedDblBufGemm { [&](const int &thread_id) { auto thread_offset = thread_id * REG_M + start_m; Kokkos::parallel_for( - Kokkos::ThreadVectorRange(member, 0, __tile_k), + Kokkos::ThreadVectorRange(member, k_tile_offset, + k_tile_offset + __tile_k), [&](const int &vlane_id) { - auto vld = (vlane_id / 2); - auto vlane_offset = (vlane_id % 2) * REG_M + k_tile_offset; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + vld, - vlane_offset + i * STRIDE_N, + svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); }); }); @@ -459,7 +457,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id, vlane_id * REG_N + i * STRIDE_N) = + svB_scr(thread_id, vlane_id + i * STRIDE_N) = prefetch_reg_b[i]; }); }); @@ -477,9 +475,7 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(thread_offset + (vlane_id / 2), - (vlane_id % 2) * REG_M + i * STRIDE_M) = - prefetch_reg_a[i]; + svA_scr(thread_offset + i, vlane_id) = prefetch_reg_a[i]; }); }); From d9252dbe49fa0885803fac7e1dcda151c4baf2ed Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 4 Nov 2021 15:12:42 -0600 Subject: [PATCH 14/18] DblBuf: Restore strides. - Third pass at pre-fetch loops for all k tiles. --- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 0d33f9581a..14301ebc59 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -268,7 +268,6 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id * REG_M; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, tile_n / REG_N), [&](const int &vlane_id) { @@ -277,15 +276,20 @@ class BatchedDblBufGemm { #endif // KOKKOS_ENABLE_PRAGMA_UNROLL // TODO: would have to invert this for // threadVectorRange copy TODOs below - for (unsigned k = 0; k < nk; ++k) { + for (unsigned k = 0; k < nk; + ++k) { // TODO: svA_scr coalesced access. All vlanes are + // readying the same data from svA scr. 
#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int m = 0; m < REG_M; ++m) // TODO: this could be a threadVectorRange copy - reg_a[m] = svA_scr(thread_offset + m, k); + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); // TODO: reg_a could be a thread shared buffer +// view_type_2d_scratch svA_scr(member.team_scratch(0), +// __tile_m, __tile_k); view_type_2d_scratch +// svB_scr(member.team_scratch(0), __tile_k, __tile_n); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -362,15 +366,15 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id * REG_M + start_m; + auto thread_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; i++) - svA_scr(thread_id * REG_M + i, vlane_id) = + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id + i, vlane_id) = access_view_bounds_check( svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); @@ -403,7 +407,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto vlane_offset = vlane_id * REG_N + start_n; + auto vlane_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -421,7 +425,7 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id * REG_M + start_m; + auto thread_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, k_tile_offset, k_tile_offset + __tile_k), @@ -432,7 +436,7 @@ class BatchedDblBufGemm { for (int i = 0; i < REG_M; ++i) prefetch_reg_a[i] = access_view_bounds_check( - svA, thread_offset + i, vlane_id, + svA, thread_offset + i * STRIDE_M, vlane_id, __ei.__bounds_check_tag); }); }); @@ -467,7 +471,6 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_offset = thread_id * REG_M; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_k), [&](const int &vlane_id) { @@ -475,7 +478,8 @@ class BatchedDblBufGemm { #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL for (int i = 0; i < REG_M; ++i) - svA_scr(thread_offset + i, vlane_id) = prefetch_reg_a[i]; + svA_scr(thread_id + i * STRIDE_M, vlane_id) = + prefetch_reg_a[i]; }); }); @@ -495,11 +499,11 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_m_offset = thread_id * REG_M + start_m; + auto thread_m_offset = thread_id + start_m; Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto thread_n_offset = vlane_id * REG_N + start_n; + auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL @@ -521,11 +525,11 @@ class BatchedDblBufGemm { Kokkos::parallel_for( Kokkos::TeamThreadRange(member, 0, __tile_m / REG_M), [&](const int &thread_id) { - auto thread_m_offset = thread_id * REG_M + start_m; + auto thread_m_offset = thread_id + start_m; 
Kokkos::parallel_for( Kokkos::ThreadVectorRange(member, 0, __tile_n / REG_N), [&](const int &vlane_id) { - auto thread_n_offset = vlane_id * REG_N + start_n; + auto thread_n_offset = vlane_id + start_n; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) #pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL From 81d7f94f7dd04bfae43a5103cc46a79ee36bc5ef Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 6 Dec 2021 14:00:37 -0700 Subject: [PATCH 15/18] Use one pair of parallel-for for TeamThreadRange and ThreadVectorRange in double buffering batched gemm --- src/batched/KokkosBatched_Util.hpp | 12 +- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 9 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 194 ++++++++++++++++++ 3 files changed, 205 insertions(+), 10 deletions(-) diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index e59f2146f4..89d70cb2df 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -23,6 +23,7 @@ #include "Kokkos_Timer.hpp" #include "KokkosKernels_config.h" +#include "KokkosKernels_Utils.hpp" // TPL macros #if defined(KOKKOSKERNELS_ENABLE_TPL_MKL) @@ -824,12 +825,11 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( template KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { - if (m < v.extent_int(0) && n < v.extent_int(1)) return v(m, n); - return (ViewValueType)0.0F; - // TODO: use compile-time extents - // if (m > scr.extent(0) || n > scr.extent(1)) - // return 0; - // return v(m, n); + return v(KOKKOSKERNELS_MACRO_MIN(m,v.extent_int(0)), KOKKOSKERNELS_MACRO_MIN(n,v.extent_int(1))); + //// TODO: use compile-time extents + //// if (m > scr.extent(0) || n > scr.extent(1)) + //// return 0; + //// return v(m, n); } template diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index 0f5afcc6aa..ba154326bf 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -465,11 +465,10 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // } else if (on_gpu && ((std::is_same::value) - ? (c_m >= 16) - : (c_m >= 24 && c_m <= 32) || (c_m >= 45 && c_m <= 64))) { + ? (c_m >= 16) : (c_m >= 24))) {//Vinh's note: use this condition for now, might need to revisit handle->teamSz = handle->vecLen = 8; constexpr int tile_m = 32, tile_n = 32, tile_k = 8; - if (c_m % 32 == 0) // No bounds checking + if (c_m % 32 == 0) { // No bounds checking ret = Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); - else + } + else { ret = Impl::BatchedDblBufGemm( handle, alpha, A, B, beta, C) .invoke(); + } } else { ret = Impl::BatchedSerialGemm(svB, thread_id, vlane_offset + i, __ei.__bounds_check_tag); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id + i, vlane_id) = access_view_bounds_check(svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); + + // Wait for A, B to reside in scratch memory + member.team_barrier(); + + // Each thread calculates a single dot product in chunks of size __tile_k + for (kk = 0; kk < __k - __tile_k; kk += __tile_k) { + int k_tile_offset = kk + __tile_k; + + // Get this threads next __tile_k entries from global memory + // Each thread has its own copy of prefetch_reg_b. 
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_N; ++i) + prefetch_reg_b[i] = access_view_bounds_check(svB, thread_id + k_tile_offset, vlane_offset + i * STRIDE_N, __ei.__bounds_check_tag); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_M; ++i) + prefetch_reg_a[i] = access_view_bounds_check(svA, thread_offset + i * STRIDE_M, vlane_id + k_tile_offset, __ei.__bounds_check_tag); + + // Multiply +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (unsigned k = 0; k < __tile_k; ++k) { + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) + reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; + } + } + + // Wait for: + // 1. prefetch_regs to be populated + // 2. for shmem to no longer be read from + member.team_barrier(); + + // populate shmem from prefetch registers. Each thread has its own copy of prefetch_reg_b. +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_N; ++i) + svB_scr(thread_id, vlane_id + i * STRIDE_N) = prefetch_reg_b[i]; + + // populate shmem from prefetch registers. Each thread has its own copy of prefetch_reg_a. 
+#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int i = 0; i < REG_M; ++i) + svA_scr(thread_id + i * STRIDE_M, vlane_id) = prefetch_reg_a[i]; + + // Wait for shmem stores to land before performing next __tile_k multiply + member.team_barrier(); + } // end n_tile_k_tiles loop + + // Multiply last tile, may be a partial tile +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (unsigned k = 0; k < __k - kk; ++k) { + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) + reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; + } + } + + // store results back to global memory + if (__beta == 0.0F) { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) { + int cm = thread_offset + m * STRIDE_M; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) { + int cn = vlane_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __ei.__bounds_check_tag); + } + } + } else { +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int m = 0; m < REG_M; ++m) { + int cm = thread_offset + m * STRIDE_M; +#if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) + #pragma unroll +#endif // KOKKOS_ENABLE_PRAGMA_UNROLL + for (int n = 0; n < REG_N; ++n) { + int cn = vlane_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, __ei.__bounds_check_tag); + } + } + } + }); + }); + } +#endif }; }; /********************* END non-functor-level routines *********************/ From d549e7851bdc9ddc378e9d330822a9b4e8db3000 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Mon, 6 Dec 2021 23:01:52 -0700 Subject: [PATCH 16/18] Use constexpr for tile sizes --- .../dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index be51f06505..1302d56c98 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -127,7 +127,7 @@ class BatchedDblBufGemm { constexpr int stride_n = TILE_N / reg_n; using functor_type = Functor; - functor_type functor(*this, __A, __B, __C, TILE_M, TILE_N, TILE_K); + functor_type functor(*this, __A, __B, __C); if (__handle->enableDebug) { std::cout << "algo_type:" << __handle->get_kernel_algo_type() << std::endl @@ -204,7 +204,10 @@ class BatchedDblBufGemm { ScalarType __alpha, __beta; int __k; size_t __n_tile_k_tiles, __n_sub_tiles; - unsigned __tile_m, __tile_n, __tile_k, __tiles_per_col, __tiles_per_row; + static constexpr unsigned __tile_m = TILE_M; + static constexpr unsigned __tile_n = TILE_M; + static constexpr unsigned __tile_k = TILE_K; + unsigned 
__tiles_per_col, __tiles_per_row; public: size_t get_n_sub_tiles() { return __n_sub_tiles; } @@ -218,10 +221,7 @@ class BatchedDblBufGemm { : __ei(ei), __A(A), __B(B), - __C(C), - __tile_m(tile_m), - __tile_n(tile_n), - __tile_k(tile_k) { + __C(C) { if (std::is_same::value) { ei.__c_batch_size = ei.__C.extent_int(0); ei.__c_m = ei.__C.extent_int(1); From 03fce87891bb2436b6fab246bacf2a6b73fdad6c Mon Sep 17 00:00:00 2001 From: "Vinh Quang Dang (-EXP)" Date: Mon, 6 Dec 2021 23:28:54 -0700 Subject: [PATCH 17/18] Apply clang-format 9.0.0 --- src/batched/KokkosBatched_Util.hpp | 3 +- src/batched/dense/KokkosBatched_Gemm_Decl.hpp | 10 +- .../impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 218 ++++++++++-------- 3 files changed, 128 insertions(+), 103 deletions(-) diff --git a/src/batched/KokkosBatched_Util.hpp b/src/batched/KokkosBatched_Util.hpp index 89d70cb2df..6177b90c18 100644 --- a/src/batched/KokkosBatched_Util.hpp +++ b/src/batched/KokkosBatched_Util.hpp @@ -825,7 +825,8 @@ KOKKOS_INLINE_FUNCTION auto subview_wrapper( template KOKKOS_INLINE_FUNCTION ViewValueType access_view_bounds_check(ViewType v, int m, int n, const BoundsCheck::Yes &) { - return v(KOKKOSKERNELS_MACRO_MIN(m,v.extent_int(0)), KOKKOSKERNELS_MACRO_MIN(n,v.extent_int(1))); + return v(KOKKOSKERNELS_MACRO_MIN(m, v.extent_int(0)), + KOKKOSKERNELS_MACRO_MIN(n, v.extent_int(1))); //// TODO: use compile-time extents //// if (m > scr.extent(0) || n > scr.extent(1)) //// return 0; diff --git a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp index ba154326bf..d3efbbe4b6 100644 --- a/src/batched/dense/KokkosBatched_Gemm_Decl.hpp +++ b/src/batched/dense/KokkosBatched_Gemm_Decl.hpp @@ -463,9 +463,10 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, // (alpha == 1.0F && beta == 0.0F) ? c_m <= 24 : c_m <= 21) { // // TODO: invoke TeamShmem // } else - if (on_gpu && - ((std::is_same::value) - ? (c_m >= 16) : (c_m >= 24))) {//Vinh's note: use this condition for now, might need to revisit + if (on_gpu && ((std::is_same::value) + ? (c_m >= 16) + : (c_m >= 24))) { // Vinh's note: use this condition + // for now, might need to revisit handle->teamSz = handle->vecLen = 8; constexpr int tile_m = 32, tile_n = 32, tile_k = 8; if (c_m % 32 == 0) { // No bounds checking @@ -476,8 +477,7 @@ int BatchedGemm(BatchedGemmHandleType *const handle, const ScalarType alpha, BoundsCheck::No, tile_m, tile_n, tile_k>( handle, alpha, A, B, beta, C) .invoke(); - } - else { + } else { ret = Impl::BatchedDblBufGemm::value) { ei.__c_batch_size = ei.__C.extent_int(0); ei.__c_m = ei.__C.extent_int(1); @@ -585,161 +582,188 @@ class BatchedDblBufGemm { view_type_2d_scratch svA_scr(member.team_scratch(0), __tile_m, __tile_k); view_type_2d_scratch svB_scr(member.team_scratch(0), __tile_k, __tile_n); - Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, STRIDE_M), [&](const int &thread_id) { - int thread_offset = thread_id + start_m; + Kokkos::parallel_for( + Kokkos::TeamThreadRange(member, 0, STRIDE_M), + [&](const int &thread_id) { + int thread_offset = thread_id + start_m; - Kokkos::parallel_for(Kokkos::ThreadVectorRange(member, 0, STRIDE_N), [&](const int &vlane_id) { - int vlane_offset = vlane_id + start_n; + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(member, 0, STRIDE_N), + [&](const int &vlane_id) { + int vlane_offset = vlane_id + start_n; - // Here we populate scratch memory with one or more "k" tiles for every thread of the team! 
+ // Here we populate scratch memory with one or more "k" tiles for + // every thread of the team! #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) - svB_scr(thread_id, vlane_id + i) = access_view_bounds_check(svB, thread_id, vlane_offset + i, __ei.__bounds_check_tag); + for (int i = 0; i < REG_N * STRIDE_N; i += STRIDE_N) + svB_scr(thread_id, vlane_id + i) = + access_view_bounds_check( + svB, thread_id, vlane_offset + i, + __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) - svA_scr(thread_id + i, vlane_id) = access_view_bounds_check(svA, thread_offset + i, vlane_id, __ei.__bounds_check_tag); + for (int i = 0; i < REG_M * STRIDE_M; i += STRIDE_M) + svA_scr(thread_id + i, vlane_id) = + access_view_bounds_check( + svA, thread_offset + i, vlane_id, + __ei.__bounds_check_tag); - // Wait for A, B to reside in scratch memory - member.team_barrier(); + // Wait for A, B to reside in scratch memory + member.team_barrier(); - // Each thread calculates a single dot product in chunks of size __tile_k - for (kk = 0; kk < __k - __tile_k; kk += __tile_k) { - int k_tile_offset = kk + __tile_k; + // Each thread calculates a single dot product in chunks of + // size __tile_k + for (kk = 0; kk < __k - __tile_k; kk += __tile_k) { + int k_tile_offset = kk + __tile_k; - // Get this threads next __tile_k entries from global memory - // Each thread has its own copy of prefetch_reg_b. + // Get this threads next __tile_k entries from global memory + // Each thread has its own copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - prefetch_reg_b[i] = access_view_bounds_check(svB, thread_id + k_tile_offset, vlane_offset + i * STRIDE_N, __ei.__bounds_check_tag); + for (int i = 0; i < REG_N; ++i) + prefetch_reg_b[i] = + access_view_bounds_check( + svB, thread_id + k_tile_offset, + vlane_offset + i * STRIDE_N, + __ei.__bounds_check_tag); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - prefetch_reg_a[i] = access_view_bounds_check(svA, thread_offset + i * STRIDE_M, vlane_id + k_tile_offset, __ei.__bounds_check_tag); + for (int i = 0; i < REG_M; ++i) + prefetch_reg_a[i] = + access_view_bounds_check( + svA, thread_offset + i * STRIDE_M, + vlane_id + k_tile_offset, + __ei.__bounds_check_tag); - // Multiply + // Multiply #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (unsigned k = 0; k < __tile_k; ++k) { + for (unsigned k = 0; k < __tile_k; ++k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + for (int m = 0; m < REG_M; ++m) + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + for (int n = 0; n < REG_N; ++n) + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // 
KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { + for (int m = 0; m < REG_M; ++m) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; - } - } + for (int n = 0; n < REG_N; ++n) + reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; + } + } - // Wait for: - // 1. prefetch_regs to be populated - // 2. for shmem to no longer be read from - member.team_barrier(); + // Wait for: + // 1. prefetch_regs to be populated + // 2. for shmem to no longer be read from + member.team_barrier(); - // populate shmem from prefetch registers. Each thread has its own copy of prefetch_reg_b. + // populate shmem from prefetch registers. Each thread has its own + // copy of prefetch_reg_b. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_N; ++i) - svB_scr(thread_id, vlane_id + i * STRIDE_N) = prefetch_reg_b[i]; + for (int i = 0; i < REG_N; ++i) + svB_scr(thread_id, vlane_id + i * STRIDE_N) = + prefetch_reg_b[i]; - // populate shmem from prefetch registers. Each thread has its own copy of prefetch_reg_a. + // populate shmem from prefetch registers. Each thread has its own + // copy of prefetch_reg_a. #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int i = 0; i < REG_M; ++i) - svA_scr(thread_id + i * STRIDE_M, vlane_id) = prefetch_reg_a[i]; + for (int i = 0; i < REG_M; ++i) + svA_scr(thread_id + i * STRIDE_M, vlane_id) = + prefetch_reg_a[i]; - // Wait for shmem stores to land before performing next __tile_k multiply - member.team_barrier(); - } // end n_tile_k_tiles loop + // Wait for shmem stores to land before performing next + // __tile_k multiply + member.team_barrier(); + } // end n_tile_k_tiles loop - // Multiply last tile, may be a partial tile + // Multiply last tile, may be a partial tile #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (unsigned k = 0; k < __k - kk; ++k) { + for (unsigned k = 0; k < __k - kk; ++k) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) - reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); + for (int m = 0; m < REG_M; ++m) + reg_a[m] = svA_scr(thread_id + m * STRIDE_M, k); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); + for (int n = 0; n < REG_N; ++n) + reg_b[n] = svB_scr(k, vlane_id + n * STRIDE_N); #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { + for (int m = 0; m < REG_M; ++m) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) - reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; - } - } + for (int n = 0; n < REG_N; ++n) + reg_c[m][n] += reg_a[m] * reg_b[n] * __alpha; + } + } - // store results back to global memory - if (__beta == 0.0F) { + // store results back to global memory + if (__beta == 0.0F) { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = thread_offset 
+ m * STRIDE_M; + for (int m = 0; m < REG_M; ++m) { + int cm = thread_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = vlane_offset + n * STRIDE_N; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __ei.__bounds_check_tag); - } - } - } else { + for (int n = 0; n < REG_N; ++n) { + int cn = vlane_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], + __ei.__bounds_check_tag); + } + } + } else { #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int m = 0; m < REG_M; ++m) { - int cm = thread_offset + m * STRIDE_M; + for (int m = 0; m < REG_M; ++m) { + int cm = thread_offset + m * STRIDE_M; #if defined(KOKKOS_ENABLE_PRAGMA_UNROLL) - #pragma unroll +#pragma unroll #endif // KOKKOS_ENABLE_PRAGMA_UNROLL - for (int n = 0; n < REG_N; ++n) { - int cn = vlane_offset + n * STRIDE_N; - fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, __ei.__bounds_check_tag); + for (int n = 0; n < REG_N; ++n) { + int cn = vlane_offset + n * STRIDE_N; + fma_bounds_check(svC, cm, cn, reg_c[m][n], __beta, + __ei.__bounds_check_tag); + } + } } - } - } + }); }); - }); } #endif }; From 2602b97f4045c190c2b61b30183cf0134b71a822 Mon Sep 17 00:00:00 2001 From: Vinh Dang Date: Tue, 7 Dec 2021 12:57:36 -0700 Subject: [PATCH 18/18] Remove unused parameters tile_m, tile_n, tile_k --- src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp index 13e30117bf..139f30fb6c 100644 --- a/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp +++ b/src/batched/dense/impl/KokkosBatched_Gemm_DblBuf_Impl.hpp @@ -216,8 +216,7 @@ class BatchedDblBufGemm { // below. If those are used, we get an invalid memory error from cuda. I // suspect this is due the values not being copied to device and then // runtime resolution of the host address &__ei. - Functor(BatchedDblBufGemm &ei, AViewType A, BViewType B, CViewType C, - unsigned tile_m = 1, unsigned tile_n = 1, unsigned tile_k = 1) + Functor(BatchedDblBufGemm &ei, AViewType A, BViewType B, CViewType C) : __ei(ei), __A(A), __B(B), __C(C) { if (std::is_same::value) { ei.__c_batch_size = ei.__C.extent_int(0);
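Taken together with the constexpr tile sizes introduced two patches earlier, the functor no longer carries any runtime tile parameters. A minimal sketch of that end state, in plain C++ with hypothetical names rather than the actual KokkosBatched interface: tile sizes travel as template parameters and are exposed as static constexpr members, so the constructor needs no tile arguments.

#include <cstdio>

// Hypothetical stand-in for the functor: tile sizes are template parameters
// exposed as compile-time constants instead of constructor arguments.
template <int TILE_M, int TILE_N, int TILE_K>
struct TileFunctor {
  static constexpr unsigned tile_m = TILE_M;
  static constexpr unsigned tile_n = TILE_N;
  static constexpr unsigned tile_k = TILE_K;
};

int main() {
  TileFunctor<32, 32, 8> f;  // same 32x32x8 tiling selected in Gemm_Decl above
  std::printf("tile: %u x %u x %u\n", f.tile_m, f.tile_n, f.tile_k);
  return 0;
}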