Break lock array dependence of Cuda and HIP teams impl
dalg24 committed Jan 26, 2023
1 parent 5d87aa9 commit 5d93865
Showing 6 changed files with 54 additions and 37 deletions.
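The core of the change: instead of relying on the device-global lock arrays (previously pulled in through Kokkos_Cuda_Locks.hpp / Kokkos_HIP_Locks.hpp), each Cuda/HIP instance now owns its scratch-lock array and records its length in a new m_num_scratch_locks member, sized once from concurrency(). Kernels receive the pointer and the count together, so device code no longer reads g_device_cuda_lock_arrays.n or g_device_hip_lock_arrays.n. The host-side lifetime condenses to the sketch below; ScratchLockOwner and concurrency_hint are illustrative names, not Kokkos API, and error checking is reduced to a comment (the real code wraps each call in KOKKOS_IMPL_CUDA_SAFE_CALL).

#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for the per-instance state this commit adds: the instance owns the
// lock array and remembers how many slots it holds, so device code never needs
// a global lock-array symbol to learn the size.
struct ScratchLockOwner {
  std::int32_t* m_scratch_locks     = nullptr;
  std::size_t   m_num_scratch_locks = 0;

  void initialize(std::size_t concurrency_hint) {
    m_num_scratch_locks = concurrency_hint;  // one int32 lock slot per concurrent thread
    // The real code checks these calls via KOKKOS_IMPL_CUDA_SAFE_CALL.
    cudaMalloc(&m_scratch_locks, sizeof(std::int32_t) * m_num_scratch_locks);
    cudaMemset(m_scratch_locks, 0, sizeof(std::int32_t) * m_num_scratch_locks);
  }

  void finalize() {
    cudaFree(m_scratch_locks);
    m_scratch_locks     = nullptr;
    m_num_scratch_locks = 0;
  }
};

int main() {
  ScratchLockOwner instance;
  instance.initialize(/*concurrency_hint=*/110592);  // hypothetical concurrency() value
  std::printf("lock array: %zu slots, %zu bytes\n", instance.m_num_scratch_locks,
              instance.m_num_scratch_locks * sizeof(std::int32_t));
  instance.finalize();
  return 0;
}

For a device whose concurrency() is 110592, that is 110592 * 4 bytes = 442,368 bytes (about 432 KiB) of device memory, allocated once per instance in initialize() and released in finalize().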
9 changes: 5 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -29,7 +29,6 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
- #include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Tools.hpp>
@@ -420,10 +419,11 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
m_team_scratch_ptr[i] = nullptr;
}

+ m_num_scratch_locks = concurrency();
KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaMalloc(&m_scratch_locks, sizeof(int32_t) * concurrency()));
+ cudaMalloc(&m_scratch_locks, sizeof(int32_t) * m_num_scratch_locks));
KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaMemset(m_scratch_locks, 0, sizeof(int32_t) * concurrency()));
+ cudaMemset(m_scratch_locks, 0, sizeof(int32_t) * m_num_scratch_locks));
}

//----------------------------------------------------------------------------
@@ -618,7 +618,8 @@ void CudaInternal::finalize() {
}

KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(m_scratch_locks));
- m_scratch_locks = nullptr;
+ m_scratch_locks     = nullptr;
+ m_num_scratch_locks = 0;
}

//----------------------------------------------------------------------------
3 changes: 2 additions & 1 deletion core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -138,7 +138,8 @@ class CudaInternal {
mutable int64_t m_team_scratch_current_size[10];
mutable void* m_team_scratch_ptr[10];
mutable std::atomic_int m_team_scratch_pool[10];
- std::int32_t* m_scratch_locks;
+ int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;

bool was_initialized = false;
bool was_finalized = false;
32 changes: 19 additions & 13 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -31,7 +31,6 @@
#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
- #include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Team.hpp>
#include <Kokkos_MinMaxClamp.hpp>
#include <Kokkos_Vectorization.hpp>
@@ -408,14 +407,15 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
};

__device__ inline int64_t cuda_get_scratch_index(Cuda::size_type league_size,
- int32_t* scratch_locks) {
+ int32_t* scratch_locks,
+ size_t num_scratch_locks) {
int64_t threadid = 0;
__shared__ int64_t base_thread_id;
if (threadIdx.x == 0 && threadIdx.y == 0) {
int64_t const wraparound_len = Kokkos::max(
- int64_t(1), Kokkos::min(int64_t(league_size),
- (int64_t(g_device_cuda_lock_arrays.n)) /
- (blockDim.x * blockDim.y)));
+ int64_t(1),
+ Kokkos::min(int64_t(league_size),
+ int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)));
threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
threadid *= blockDim.x * blockDim.y;
int done = 0;
@@ -477,6 +477,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
size_t m_scratch_size[2];
int m_scratch_pool_id = -1;
int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;

template <class TagType>
__device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
@@ -497,7 +498,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
// Iterate this block through the league
int64_t threadid = 0;
if (m_scratch_size[1] > 0) {
- threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
+ threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks,
+ m_num_scratch_locks);
}

const int int_league_size = (int)m_league_size;
@@ -668,6 +670,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
size_t m_scratch_size[2];
int m_scratch_pool_id = -1;
int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;
const size_type m_league_size;
int m_team_size;
const size_type m_vector_size;
@@ -690,7 +693,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
__device__ inline void operator()() const {
int64_t threadid = 0;
if (m_scratch_size[1] > 0) {
- threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
+ threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks,
+ m_num_scratch_locks);
}

using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
@@ -926,9 +930,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_shmem_size =
m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
- m_scratch_size[0] = m_shmem_size;
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_shmem_size;
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
@@ -1031,9 +1036,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_shmem_size =
m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
- m_scratch_size[0] = m_shmem_size;
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_shmem_size;
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
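The visible change in cuda_get_scratch_index above is where the wraparound length comes from: it is now computed from the num_scratch_locks argument the launcher passes in, rather than from g_device_cuda_lock_arrays.n. The device-side sketch below restates that computation and pairs it with a CAS-style acquisition over the lock slots; the acquisition loop is an illustration of the scheme (the corresponding lines sit in the collapsed part of the hunk), not a verbatim copy.

#include <cstdint>

// Sketch of per-team scratch-slot selection after this commit: the team hashes
// itself into one of wraparound_len slot groups, then claims a free group by
// spinning on the int32 lock flags. Only the source of the array length
// changes; it now arrives as an argument instead of a global symbol.
__device__ inline int64_t get_scratch_index_sketch(int64_t league_size,
                                                   int32_t* scratch_locks,
                                                   size_t num_scratch_locks) {
  __shared__ int64_t base_thread_id;
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    int64_t const threads_per_team = blockDim.x * blockDim.y;
    // Never hand out more slot groups than the lock array can back.
    int64_t wraparound_len = int64_t(num_scratch_locks) / threads_per_team;
    if (wraparound_len > league_size) wraparound_len = league_size;
    if (wraparound_len < 1) wraparound_len = 1;
    int64_t id = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
    id *= threads_per_team;
    // 0 means free, 1 means taken; on failure advance (with wraparound) until
    // another team releases its flag.
    while (atomicCAS(&scratch_locks[id], 0, 1) != 0) {
      id += threads_per_team;
      if (id >= wraparound_len * threads_per_team) id = 0;
    }
    base_thread_id = id;
  }
  __syncthreads();
  return base_thread_id;
}

A matching release resets the claimed flag back to 0 once the team is done with its level-1 scratch; that path is outside this diff.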
8 changes: 5 additions & 3 deletions core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -180,10 +180,11 @@ void HIPInternal::initialize(hipStream_t stream, bool manage_stream) {
Kokkos::Impl::throw_runtime_exception(msg.str());
}

+ m_num_scratch_locks = concurrency();
KOKKOS_IMPL_HIP_SAFE_CALL(
- hipMalloc(&m_scratch_locks, sizeof(int32_t) * concurrency()));
+ hipMalloc(&m_scratch_locks, sizeof(int32_t) * m_num_scratch_locks));
KOKKOS_IMPL_HIP_SAFE_CALL(
- hipMemset(m_scratch_locks, 0, sizeof(int32_t) * concurrency()));
+ hipMemset(m_scratch_locks, 0, sizeof(int32_t) * m_num_scratch_locks));
}

//----------------------------------------------------------------------------
@@ -363,7 +364,8 @@ void HIPInternal::finalize() {
}

KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(m_scratch_locks));
- m_scratch_locks = nullptr;
+ m_scratch_locks     = nullptr;
+ m_num_scratch_locks = 0;
}

//----------------------------------------------------------------------------
3 changes: 2 additions & 1 deletion core/src/HIP/Kokkos_HIP_Instance.hpp
@@ -104,7 +104,8 @@ class HIPInternal {
mutable int64_t m_team_scratch_current_size[10] = {};
mutable void *m_team_scratch_ptr[10] = {};
mutable std::atomic_int m_team_scratch_pool[10] = {};
- std::int32_t *m_scratch_locks;
+ int32_t *m_scratch_locks = nullptr;
+ size_t m_num_scratch_locks = 0;

bool was_finalized = false;

36 changes: 21 additions & 15 deletions core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
@@ -22,7 +22,6 @@
#if defined(__HIPCC__)

#include <HIP/Kokkos_HIP_KernelLaunch.hpp>
- #include <HIP/Kokkos_HIP_Locks.hpp>
#include <HIP/Kokkos_HIP_Team.hpp>
#include <HIP/Kokkos_HIP_Instance.hpp>
#include <Kokkos_MinMaxClamp.hpp>
@@ -384,14 +383,14 @@ class TeamPolicyInternal<HIP, Properties...>
};

__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size,
- int32_t* scratch_locks) {
+ int32_t* scratch_locks,
+ size_t num_scratch_locks) {
int64_t threadid = 0;
__shared__ int64_t base_thread_id;
if (threadIdx.x == 0 && threadIdx.y == 0) {
int64_t const wraparound_len =
Kokkos::min(int64_t(league_size),
- (int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n)) /
- (blockDim.x * blockDim.y));
+ int64_t(num_scratch_locks) / (blockDim.x * blockDim.y));
threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
threadid *= blockDim.x * blockDim.y;
int done = 0;
@@ -448,6 +447,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
size_t m_scratch_size[2];
int m_scratch_pool_id = -1;
int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;

template <typename TagType>
__device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
@@ -466,7 +466,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
// Iterate this block through the league
int64_t threadid = 0;
if (m_scratch_size[1] > 0) {
- threadid = hip_get_scratch_index(m_league_size, m_scratch_locks);
+ threadid = hip_get_scratch_index(m_league_size, m_scratch_locks,
+ m_num_scratch_locks);
}

int const int_league_size = static_cast<int>(m_league_size);
@@ -514,9 +515,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
m_shmem_size =
(m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
- m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_policy.scratch_size(0, m_team_size);
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;

// Functor's reduce memory, team scan memory, and team shared memory depend
// upon team size.
@@ -618,6 +620,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
size_t m_scratch_size[2];
int m_scratch_pool_id = -1;
int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;
const size_type m_league_size;
int m_team_size;
const size_type m_vector_size;
@@ -656,7 +659,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
__device__ inline void operator()() const {
int64_t threadid = 0;
if (m_scratch_size[1] > 0) {
- threadid = hip_get_scratch_index(m_league_size, m_scratch_locks);
+ threadid = hip_get_scratch_index(m_league_size, m_scratch_locks,
+ m_num_scratch_locks);
}

using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
@@ -842,9 +846,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_shmem_size =
m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
- m_scratch_size[0] = m_shmem_size;
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_shmem_size;
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
@@ -936,9 +941,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_shmem_size =
m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
- m_scratch_size[0] = m_shmem_size;
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_shmem_size;
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
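The remaining hunks are plumbing: each ParallelFor/ParallelReduce launcher copies both the pointer and the count out of the internal space instance (m_scratch_locks and m_num_scratch_locks) and forwards them to the scratch-index helper, always guarded by m_scratch_size[1] > 0. Nothing changes for user code; the kind of kernel that actually exercises this path is a team policy requesting level-1 (device-memory) scratch, as in the minimal example below. The example assumes a standard Kokkos build; sizes and labels are arbitrary.

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using policy_type  = Kokkos::TeamPolicy<>;
    using member_type  = policy_type::member_type;
    using scratch_view = Kokkos::View<double*,
                                      Kokkos::DefaultExecutionSpace::scratch_memory_space,
                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

    const int league_size = 1024;
    const int per_team    = 256;  // doubles of level-1 scratch per team

    policy_type policy(league_size, Kokkos::AUTO);
    // Requesting level-1 scratch is what makes each team grab a scratch lock.
    policy.set_scratch_size(1, Kokkos::PerTeam(scratch_view::shmem_size(per_team)));

    Kokkos::parallel_for(
        "level1_scratch_demo", policy, KOKKOS_LAMBDA(const member_type& team) {
          scratch_view tmp(team.team_scratch(1), per_team);
          Kokkos::parallel_for(Kokkos::TeamThreadRange(team, per_team),
                               [&](const int i) { tmp(i) = static_cast<double>(i); });
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}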
