Break lock array dependence of Cuda and HIP teams impl
dalg24 committed Jan 26, 2023
1 parent 5d87aa9 commit 5d93865
Showing 6 changed files with 54 additions and 37 deletions.
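The core of the change: instead of relying on the device-global lock arrays (previously pulled in through Kokkos_Cuda_Locks.hpp / Kokkos_HIP_Locks.hpp), each Cuda/HIP instance now owns its scratch-lock array and records its length in a new m_num_scratch_locks member, sized once from concurrency(). Kernels receive the pointer and the count together, so device code no longer reads g_device_cuda_lock_arrays.n or g_device_hip_lock_arrays.n. The host-side lifetime condenses to the sketch below; ScratchLockOwner and concurrency_hint are illustrative names, not Kokkos API, and error checking is reduced to a comment (the real code wraps each call in KOKKOS_IMPL_CUDA_SAFE_CALL).

#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>
#include <cstdio>

// Stand-in for the per-instance state this commit adds: the instance owns the
// lock array and remembers how many slots it holds, so device code never needs
// a global lock-array symbol to learn the size.
struct ScratchLockOwner {
  std::int32_t* m_scratch_locks     = nullptr;
  std::size_t   m_num_scratch_locks = 0;

  void initialize(std::size_t concurrency_hint) {
    m_num_scratch_locks = concurrency_hint;  // one int32 lock slot per concurrent thread
    // The real code checks these calls via KOKKOS_IMPL_CUDA_SAFE_CALL.
    cudaMalloc(&m_scratch_locks, sizeof(std::int32_t) * m_num_scratch_locks);
    cudaMemset(m_scratch_locks, 0, sizeof(std::int32_t) * m_num_scratch_locks);
  }

  void finalize() {
    cudaFree(m_scratch_locks);
    m_scratch_locks     = nullptr;
    m_num_scratch_locks = 0;
  }
};

int main() {
  ScratchLockOwner instance;
  instance.initialize(/*concurrency_hint=*/110592);  // hypothetical concurrency() value
  std::printf("lock array: %zu slots, %zu bytes\n", instance.m_num_scratch_locks,
              instance.m_num_scratch_locks * sizeof(std::int32_t));
  instance.finalize();
  return 0;
}

For a device whose concurrency() is 110592, that is 110592 * 4 bytes = 442,368 bytes (about 432 KiB) of device memory, allocated once per instance in initialize() and released in finalize().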
9 changes: 5 additions & 4 deletions core/src/Cuda/Kokkos_Cuda_Instance.cpp
@@ -29,7 +29,6 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <Cuda/Kokkos_Cuda_Instance.hpp>
- #include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_UniqueToken.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Tools.hpp>
@@ -420,10 +419,11 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default
m_team_scratch_ptr[i] = nullptr;
}

+ m_num_scratch_locks = concurrency();
KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaMalloc(&m_scratch_locks, sizeof(int32_t) * concurrency()));
+ cudaMalloc(&m_scratch_locks, sizeof(int32_t) * m_num_scratch_locks));
KOKKOS_IMPL_CUDA_SAFE_CALL(
- cudaMemset(m_scratch_locks, 0, sizeof(int32_t) * concurrency()));
+ cudaMemset(m_scratch_locks, 0, sizeof(int32_t) * m_num_scratch_locks));
}

//----------------------------------------------------------------------------
@@ -618,7 +618,8 @@ void CudaInternal::finalize() {
}

KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(m_scratch_locks));
- m_scratch_locks = nullptr;
+ m_scratch_locks     = nullptr;
+ m_num_scratch_locks = 0;
}

//----------------------------------------------------------------------------
3 changes: 2 additions & 1 deletion core/src/Cuda/Kokkos_Cuda_Instance.hpp
@@ -138,7 +138,8 @@ class CudaInternal {
mutable int64_t m_team_scratch_current_size[10];
mutable void* m_team_scratch_ptr[10];
mutable std::atomic_int m_team_scratch_pool[10];
- std::int32_t* m_scratch_locks;
+ int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;

bool was_initialized = false;
bool was_finalized = false;
32 changes: 19 additions & 13 deletions core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp
@@ -31,7 +31,6 @@
#include <Cuda/Kokkos_Cuda_KernelLaunch.hpp>
#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
- #include <Cuda/Kokkos_Cuda_Locks.hpp>
#include <Cuda/Kokkos_Cuda_Team.hpp>
#include <Kokkos_MinMaxClamp.hpp>
#include <Kokkos_Vectorization.hpp>
@@ -408,14 +407,15 @@ class TeamPolicyInternal<Kokkos::Cuda, Properties...>
};

__device__ inline int64_t cuda_get_scratch_index(Cuda::size_type league_size,
- int32_t* scratch_locks) {
+ int32_t* scratch_locks,
+ size_t num_scratch_locks) {
int64_t threadid = 0;
__shared__ int64_t base_thread_id;
if (threadIdx.x == 0 && threadIdx.y == 0) {
int64_t const wraparound_len = Kokkos::max(
- int64_t(1), Kokkos::min(int64_t(league_size),
- (int64_t(g_device_cuda_lock_arrays.n)) /
- (blockDim.x * blockDim.y)));
+ int64_t(1),
+ Kokkos::min(int64_t(league_size),
+ int64_t(num_scratch_locks) / (blockDim.x * blockDim.y)));
threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
threadid *= blockDim.x * blockDim.y;
int done = 0;
@@ -477,6 +477,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
size_t m_scratch_size[2];
int m_scratch_pool_id = -1;
int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;

template <class TagType>
__device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
@@ -497,7 +498,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
// Iterate this block through the league
int64_t threadid = 0;
if (m_scratch_size[1] > 0) {
- threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
+ threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks,
+ m_num_scratch_locks);
}

const int int_league_size = (int)m_league_size;
@@ -668,6 +670,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
size_t m_scratch_size[2];
int m_scratch_pool_id = -1;
int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;
const size_type m_league_size;
int m_team_size;
const size_type m_vector_size;
@@ -690,7 +693,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
__device__ inline void operator()() const {
int64_t threadid = 0;
if (m_scratch_size[1] > 0) {
- threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks);
+ threadid = cuda_get_scratch_index(m_league_size, m_scratch_locks,
+ m_num_scratch_locks);
}

using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
@@ -926,9 +930,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_shmem_size =
m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
- m_scratch_size[0] = m_shmem_size;
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_shmem_size;
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
@@ -1031,9 +1036,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_shmem_size =
m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
- m_scratch_size[0] = m_shmem_size;
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_shmem_size;
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
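The visible change in cuda_get_scratch_index above is where the wraparound length comes from: it is now computed from the num_scratch_locks argument the launcher passes in, rather than from g_device_cuda_lock_arrays.n. The device-side sketch below restates that computation and pairs it with a CAS-style acquisition over the lock slots; the acquisition loop is an illustration of the scheme (the corresponding lines sit in the collapsed part of the hunk), not a verbatim copy.

#include <cstdint>

// Sketch of per-team scratch-slot selection after this commit: the team hashes
// itself into one of wraparound_len slot groups, then claims a free group by
// spinning on the int32 lock flags. Only the source of the array length
// changes; it now arrives as an argument instead of a global symbol.
__device__ inline int64_t get_scratch_index_sketch(int64_t league_size,
                                                   int32_t* scratch_locks,
                                                   size_t num_scratch_locks) {
  __shared__ int64_t base_thread_id;
  if (threadIdx.x == 0 && threadIdx.y == 0) {
    int64_t const threads_per_team = blockDim.x * blockDim.y;
    // Never hand out more slot groups than the lock array can back.
    int64_t wraparound_len = int64_t(num_scratch_locks) / threads_per_team;
    if (wraparound_len > league_size) wraparound_len = league_size;
    if (wraparound_len < 1) wraparound_len = 1;
    int64_t id = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
    id *= threads_per_team;
    // 0 means free, 1 means taken; on failure advance (with wraparound) until
    // another team releases its flag.
    while (atomicCAS(&scratch_locks[id], 0, 1) != 0) {
      id += threads_per_team;
      if (id >= wraparound_len * threads_per_team) id = 0;
    }
    base_thread_id = id;
  }
  __syncthreads();
  return base_thread_id;
}

A matching release resets the claimed flag back to 0 once the team is done with its level-1 scratch; that path is outside this diff.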
8 changes: 5 additions & 3 deletions core/src/HIP/Kokkos_HIP_Instance.cpp
@@ -180,10 +180,11 @@ void HIPInternal::initialize(hipStream_t stream, bool manage_stream) {
Kokkos::Impl::throw_runtime_exception(msg.str());
}

+ m_num_scratch_locks = concurrency();
KOKKOS_IMPL_HIP_SAFE_CALL(
- hipMalloc(&m_scratch_locks, sizeof(int32_t) * concurrency()));
+ hipMalloc(&m_scratch_locks, sizeof(int32_t) * m_num_scratch_locks));
KOKKOS_IMPL_HIP_SAFE_CALL(
- hipMemset(m_scratch_locks, 0, sizeof(int32_t) * concurrency()));
+ hipMemset(m_scratch_locks, 0, sizeof(int32_t) * m_num_scratch_locks));
}

//----------------------------------------------------------------------------
@@ -363,7 +364,8 @@ void HIPInternal::finalize() {
}

KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(m_scratch_locks));
- m_scratch_locks = nullptr;
+ m_scratch_locks     = nullptr;
+ m_num_scratch_locks = 0;
}

//----------------------------------------------------------------------------
3 changes: 2 additions & 1 deletion core/src/HIP/Kokkos_HIP_Instance.hpp
@@ -104,7 +104,8 @@ class HIPInternal {
mutable int64_t m_team_scratch_current_size[10] = {};
mutable void *m_team_scratch_ptr[10] = {};
mutable std::atomic_int m_team_scratch_pool[10] = {};
- std::int32_t *m_scratch_locks;
+ int32_t *m_scratch_locks = nullptr;
+ size_t m_num_scratch_locks = 0;

bool was_finalized = false;

36 changes: 21 additions & 15 deletions core/src/HIP/Kokkos_HIP_Parallel_Team.hpp
@@ -22,7 +22,6 @@
#if defined(__HIPCC__)

#include <HIP/Kokkos_HIP_KernelLaunch.hpp>
- #include <HIP/Kokkos_HIP_Locks.hpp>
#include <HIP/Kokkos_HIP_Team.hpp>
#include <HIP/Kokkos_HIP_Instance.hpp>
#include <Kokkos_MinMaxClamp.hpp>
@@ -384,14 +383,14 @@ class TeamPolicyInternal<HIP, Properties...>
};

__device__ inline int64_t hip_get_scratch_index(HIP::size_type league_size,
- int32_t* scratch_locks) {
+ int32_t* scratch_locks,
+ size_t num_scratch_locks) {
int64_t threadid = 0;
__shared__ int64_t base_thread_id;
if (threadIdx.x == 0 && threadIdx.y == 0) {
int64_t const wraparound_len =
Kokkos::min(int64_t(league_size),
- (int64_t(Kokkos::Impl::g_device_hip_lock_arrays.n)) /
- (blockDim.x * blockDim.y));
+ int64_t(num_scratch_locks) / (blockDim.x * blockDim.y));
threadid = (blockIdx.x * blockDim.z + threadIdx.z) % wraparound_len;
threadid *= blockDim.x * blockDim.y;
int done = 0;
@@ -448,6 +447,7 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
size_t m_scratch_size[2];
int m_scratch_pool_id = -1;
int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;

template <typename TagType>
__device__ inline std::enable_if_t<std::is_void<TagType>::value> exec_team(
@@ -466,7 +466,8 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
// Iterate this block through the league
int64_t threadid = 0;
if (m_scratch_size[1] > 0) {
- threadid = hip_get_scratch_index(m_league_size, m_scratch_locks);
+ threadid = hip_get_scratch_index(m_league_size, m_scratch_locks,
+ m_num_scratch_locks);
}

int const int_league_size = static_cast<int>(m_league_size);
@@ -514,9 +515,10 @@ class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>, HIP> {
m_shmem_size =
(m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(m_functor, m_team_size));
- m_scratch_size[0] = m_policy.scratch_size(0, m_team_size);
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_policy.scratch_size(0, m_team_size);
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;

// Functor's reduce memory, team scan memory, and team shared memory depend
// upon team size.
@@ -618,6 +620,7 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
size_t m_scratch_size[2];
int m_scratch_pool_id = -1;
int32_t* m_scratch_locks;
+ size_t m_num_scratch_locks;
const size_type m_league_size;
int m_team_size;
const size_type m_vector_size;
@@ -656,7 +659,8 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
__device__ inline void operator()() const {
int64_t threadid = 0;
if (m_scratch_size[1] > 0) {
- threadid = hip_get_scratch_index(m_league_size, m_scratch_locks);
+ threadid = hip_get_scratch_index(m_league_size, m_scratch_locks,
+ m_num_scratch_locks);
}

using ReductionTag = std::conditional_t<UseShflReduction, ShflReductionTag,
@@ -842,9 +846,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_shmem_size =
m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
- m_scratch_size[0] = m_shmem_size;
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_shmem_size;
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
@@ -936,9 +941,10 @@ class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
m_shmem_size =
m_policy.scratch_size(0, m_team_size) +
FunctorTeamShmemSize<FunctorType>::value(arg_functor, m_team_size);
- m_scratch_size[0] = m_shmem_size;
- m_scratch_size[1] = m_policy.scratch_size(1, m_team_size);
- m_scratch_locks = internal_space_instance->m_scratch_locks;
+ m_scratch_size[0]   = m_shmem_size;
+ m_scratch_size[1]   = m_policy.scratch_size(1, m_team_size);
+ m_scratch_locks     = internal_space_instance->m_scratch_locks;
+ m_num_scratch_locks = internal_space_instance->m_num_scratch_locks;
if (m_team_size <= 0) {
m_scratch_ptr[1] = nullptr;
} else {
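The remaining hunks are plumbing: each ParallelFor/ParallelReduce launcher copies both the pointer and the count out of the internal space instance (m_scratch_locks and m_num_scratch_locks) and forwards them to the scratch-index helper, always guarded by m_scratch_size[1] > 0. Nothing changes for user code; the kind of kernel that actually exercises this path is a team policy requesting level-1 (device-memory) scratch, as in the minimal example below. The example assumes a standard Kokkos build; sizes and labels are arbitrary.

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    using policy_type  = Kokkos::TeamPolicy<>;
    using member_type  = policy_type::member_type;
    using scratch_view = Kokkos::View<double*,
                                      Kokkos::DefaultExecutionSpace::scratch_memory_space,
                                      Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

    const int league_size = 1024;
    const int per_team    = 256;  // doubles of level-1 scratch per team

    policy_type policy(league_size, Kokkos::AUTO);
    // Requesting level-1 scratch is what makes each team grab a scratch lock.
    policy.set_scratch_size(1, Kokkos::PerTeam(scratch_view::shmem_size(per_team)));

    Kokkos::parallel_for(
        "level1_scratch_demo", policy, KOKKOS_LAMBDA(const member_type& team) {
          scratch_view tmp(team.team_scratch(1), per_team);
          Kokkos::parallel_for(Kokkos::TeamThreadRange(team, per_team),
                               [&](const int i) { tmp(i) = static_cast<double>(i); });
        });
    Kokkos::fence();
  }
  Kokkos::finalize();
  return 0;
}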
