Skip to content

Commit

Permalink
Merge pull request #1906 from e10harvey/issue1860
Browse files Browse the repository at this point in the history
Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (PSGS and Team_PSGS)
  • Loading branch information
e10harvey authored Aug 29, 2023
2 parents efd226e + d398cd9 commit 3c86b5b
Show file tree
Hide file tree
Showing 13 changed files with 1,177 additions and 277 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/osx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,4 @@ jobs:

- name: test
working-directory: kokkos-kernels/build
run: ctest -j2 --output-on-failure --timeout 3600
run: ctest -j2 --output-on-failure --timeout 7200
80 changes: 71 additions & 9 deletions common/src/KokkosKernels_ExecSpaceUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,29 +157,68 @@ inline void kk_get_free_total_memory(size_t& /* free_mem */,
throw std::runtime_error(oss.str());
}

// Host function to determine free and total device memory.
// Will throw if execution space doesn't support this.
template <typename MemorySpace>
inline void kk_get_free_total_memory(size_t& /* free_mem */,
size_t& /* total_mem */,
int /* n_streams */) {
std::ostringstream oss;
oss << "Error: memory space " << MemorySpace::name()
<< " does not support querying free/total memory.";
throw std::runtime_error(oss.str());
}

#ifdef KOKKOS_ENABLE_CUDA
template <>
inline void kk_get_free_total_memory<Kokkos::CudaSpace>(size_t& free_mem,
size_t& total_mem) {
size_t& total_mem,
int n_streams) {
cudaMemGetInfo(&free_mem, &total_mem);
free_mem /= n_streams;
total_mem /= n_streams;
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaSpace>(size_t& free_mem,
size_t& total_mem) {
kk_get_free_total_memory<Kokkos::CudaSpace>(free_mem, total_mem, 1);
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaUVMSpace>(size_t& free_mem,
size_t& total_mem,
int n_streams) {
kk_get_free_total_memory<Kokkos::CudaSpace>(free_mem, total_mem, n_streams);
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaUVMSpace>(size_t& free_mem,
size_t& total_mem) {
cudaMemGetInfo(&free_mem, &total_mem);
kk_get_free_total_memory<Kokkos::CudaUVMSpace>(free_mem, total_mem, 1);
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaHostPinnedSpace>(
size_t& free_mem, size_t& total_mem, int n_streams) {
kk_get_free_total_memory<Kokkos::CudaSpace>(free_mem, total_mem, n_streams);
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaHostPinnedSpace>(
size_t& free_mem, size_t& total_mem) {
cudaMemGetInfo(&free_mem, &total_mem);
kk_get_free_total_memory<Kokkos::CudaHostPinnedSpace>(free_mem, total_mem, 1);
}
#endif

#ifdef KOKKOS_ENABLE_HIP
template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
size_t& free_mem, size_t& total_mem) {
size_t& free_mem, size_t& total_mem, int n_streams) {
KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem));
free_mem /= n_streams;
total_mem /= n_streams;
}
template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
size_t& free_mem, size_t& total_mem) {
kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(free_mem, total_mem,
1);
}
#endif

Expand All @@ -188,7 +227,7 @@ inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU)
template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
size_t& free_mem, size_t& total_mem) {
size_t& free_mem, size_t& total_mem, int n_streams) {
sycl::queue queue;
sycl::device device = queue.get_device();
auto level_zero_handle =
Expand Down Expand Up @@ -220,20 +259,43 @@ inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
total_mem += memory_states.size;
free_mem += memory_states.free;
}
free_mem /= n_streams;
total_mem /= n_streams;
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
size_t& free_mem, size_t& total_mem) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
free_mem, total_mem, 1);
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLHostUSMSpace>(
size_t& free_mem, size_t& total_mem, int n_streams) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
free_mem, total_mem, n_streams);
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLHostUSMSpace>(
size_t& free_mem, size_t& total_mem) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(free_mem,
total_mem);
kk_get_free_total_memory<Kokkos::Experimental::SYCLHostUSMSpace>(
free_mem, total_mem, 1);
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLSharedUSMSpace>(
size_t& free_mem, size_t& total_mem, int n_streams) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
free_mem, total_mem, n_streams);
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLSharedUSMSpace>(
size_t& free_mem, size_t& total_mem) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(free_mem,
total_mem);
kk_get_free_total_memory<Kokkos::Experimental::SYCLSharedUSMSpace>(
free_mem, total_mem, 1);
}
#endif

Expand Down
32 changes: 24 additions & 8 deletions common/src/KokkosKernels_SimpleUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,22 +142,38 @@ inline void kk_exclusive_parallel_prefix_sum(
kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum);
}

/***
* \brief Function performs the inclusive parallel prefix sum. That is each
* entry holds the sum until itself including itself. \param num_elements: size
* of the array \param arr: the array for which the prefix sum will be
* performed.
*/
///
/// \brief Function performs the inclusive parallel prefix sum. That is each
/// entry holds the sum until itself including itself.
/// \param my_exec_space: The execution space instance
/// \param num_elements: size of the array
/// \param arr: the array for which the prefix sum will be performed.
///
template <typename forward_array_type, typename MyExecSpace>
void kk_inclusive_parallel_prefix_sum(
MyExecSpace my_exec_space,
typename forward_array_type::value_type num_elements,
forward_array_type arr) {
typedef Kokkos::RangePolicy<MyExecSpace> my_exec_space;
typedef Kokkos::RangePolicy<MyExecSpace> range_policy_t;
Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum",
my_exec_space(0, num_elements),
range_policy_t(my_exec_space, 0, num_elements),
InclusiveParallelPrefixSum<forward_array_type>(arr));
}

///
/// \brief Function performs the inclusive parallel prefix sum. That is each
/// entry holds the sum until itself including itself.
/// \param num_elements: size of the array
/// \param arr: the array for which the prefix sum will be performed.
///
template <typename forward_array_type, typename MyExecSpace>
void kk_inclusive_parallel_prefix_sum(
typename forward_array_type::value_type num_elements,
forward_array_type arr) {
MyExecSpace my_exec_space;
return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr);
}

template <typename view_t>
struct ReductionFunctor {
view_t array_sum;
Expand Down
Loading

0 comments on commit 3c86b5b

Please sign in to comment.