Skip to content

Commit

Permalink
Merge pull request #5755 from Rombur/hip-fix-global-launch
Browse files Browse the repository at this point in the history
Fix HIP Global Launch with HSA_XNACK=1
  • Loading branch information
crtrott authored Jan 21, 2023
2 parents 6e73a35 + 0db3bd8 commit c304818
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 16 deletions.
32 changes: 27 additions & 5 deletions core/src/HIP/Kokkos_HIP_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -236,25 +236,45 @@ Kokkos::HIP::size_type *HIPInternal::scratch_flags(const std::size_t size) {
return m_scratchFlags;
}

Kokkos::HIP::size_type *HIPInternal::scratch_functor(
const std::size_t size) const {
Kokkos::HIP::size_type *HIPInternal::stage_functor_for_execution(
void const *driver, std::size_t const size) const {
if (verify_is_initialized("scratch_functor") && m_scratchFunctorSize < size) {
m_scratchFunctorSize = size;

using Record = Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPSpace, void>;
using RecordHost =
Kokkos::Impl::SharedAllocationRecord<Kokkos::HIPHostPinnedSpace, void>;

if (m_scratchFunctor)
if (m_scratchFunctor) {
Record::decrement(Record::get_record(m_scratchFunctor));
RecordHost::decrement(RecordHost::get_record(m_scratchFunctorHost));
}

Record *const r =
Record::allocate(Kokkos::HIPSpace(), "Kokkos::InternalScratchFunctor",
m_scratchFunctorSize);
RecordHost *const r_host = RecordHost::allocate(
Kokkos::HIPHostPinnedSpace(), "Kokkos::InternalScratchFunctorHost",
m_scratchFunctorSize);

Record::increment(r);
RecordHost::increment(r_host);

m_scratchFunctor = reinterpret_cast<size_type *>(r->data());
m_scratchFunctor = reinterpret_cast<size_type *>(r->data());
m_scratchFunctorHost = reinterpret_cast<size_type *>(r_host->data());
}

// When using HSA_XNACK=1, it is necessary to copy the driver to the host to
// ensure that the driver is not destroyed before the computation is done.
// Without this fix, all the atomic tests fail. It is not obvious that this
// problem is limited to HSA_XNACK=1 even if all the tests pass when
// HSA_XNACK=0. That's why we always copy the driver.
KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(m_stream));
std::memcpy(m_scratchFunctorHost, driver, size);
KOKKOS_IMPL_HIP_SAFE_CALL(hipMemcpyAsync(m_scratchFunctor,
m_scratchFunctorHost, size,
hipMemcpyDefault, m_stream));

return m_scratchFunctor;
}

Expand Down Expand Up @@ -318,8 +338,10 @@ void HIPInternal::finalize() {
RecordHIP::decrement(RecordHIP::get_record(m_scratchFlags));
RecordHIP::decrement(RecordHIP::get_record(m_scratchSpace));

if (m_scratchFunctorSize > 0)
if (m_scratchFunctorSize > 0) {
RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctor));
RecordHIP::decrement(RecordHIP::get_record(m_scratchFunctorHost));
}
}

for (int i = 0; i < m_n_team_scratch; ++i) {
Expand Down
15 changes: 9 additions & 6 deletions core/src/HIP/Kokkos_HIP_Instance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,9 +87,11 @@ class HIPInternal {
std::size_t m_scratchFlagsCount = 0;
mutable std::size_t m_scratchFunctorSize = 0;

size_type *m_scratchSpace = nullptr;
size_type *m_scratchFlags = nullptr;
mutable size_type *m_scratchFunctor = nullptr;
size_type *m_scratchSpace = nullptr;
size_type *m_scratchFlags = nullptr;
mutable size_type *m_scratchFunctor = nullptr;
mutable size_type *m_scratchFunctorHost = nullptr;
inline static std::mutex scratchFunctorMutex;

hipStream_t m_stream = nullptr;
uint32_t m_instance_id =
Expand Down Expand Up @@ -133,9 +135,10 @@ class HIPInternal {
HIPInternal() = default;

// Resizing of reduction related scratch spaces
size_type *scratch_space(const std::size_t size);
size_type *scratch_flags(const std::size_t size);
size_type *scratch_functor(const std::size_t size) const;
size_type *scratch_space(std::size_t const size);
size_type *scratch_flags(std::size_t const size);
size_type *stage_functor_for_execution(void const *driver,
std::size_t const size) const;
uint32_t impl_get_instance_id() const noexcept;
int acquire_team_scratch_space();
// Resizing of team level 1 scratch
Expand Down
9 changes: 4 additions & 5 deletions core/src/HIP/Kokkos_HIP_KernelLaunch.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -377,12 +377,11 @@ struct HIPParallelLaunchKernelInvoker<DriverType, LaunchBounds,
static void invoke_kernel(DriverType const &driver, dim3 const &grid,
dim3 const &block, int shmem,
HIPInternal const *hip_instance) {
// Wait until the previous kernel that uses m_scratchFuntor is done
std::lock_guard<std::mutex> lock(HIPInternal::scratchFunctorMutex);
DriverType *driver_ptr = reinterpret_cast<DriverType *>(
hip_instance->scratch_functor(sizeof(DriverType)));

hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault,
hip_instance->m_stream);

hip_instance->stage_functor_for_execution(
reinterpret_cast<void const *>(&driver), sizeof(DriverType)));
(base_t::get_kernel_func())<<<grid, block, shmem, hip_instance->m_stream>>>(
driver_ptr);
}
Expand Down

0 comments on commit c304818

Please sign in to comment.