Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stream support for Gauss-Seidel: Symbolic, Numeric, Apply (PSGS and Team_PSGS) #1906

Merged
merged 30 commits into from
Aug 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
30 commits
Select commit Hold shift + click to select a range
825fff3
sparse/src: Add execution space inst member to GS handle
e10harvey Jul 18, 2023
ec97583
Update GS point symbolic and friends for streams
e10harvey Jul 18, 2023
98d5a24
sparse/impl: Make PSGS non-blocking
e10harvey Jul 26, 2023
0aa320f
Cleanup and use overload pattern
e10harvey Jul 26, 2023
b6e7eb3
Cleanup and use overload pattern
e10harvey Jul 26, 2023
5340ceb
Add GS handle overloads
e10harvey Jul 26, 2023
c0fb396
Account for streams in memory allocs
e10harvey Jul 26, 2023
0a8b20f
Fix build. Add fall-back overload
e10harvey Aug 8, 2023
ad6161c
sparse/unit_test: Add PSGS stream tests
e10harvey Aug 9, 2023
9d958a9
Merge branch 'develop' into issue1860
e10harvey Aug 9, 2023
d9c5ec6
Fix docs build
e10harvey Aug 9, 2023
9a7775a
Move print statements to avoid timing them
e10harvey Aug 9, 2023
423da45
common/src: Correct kk_get_free_total_memory
e10harvey Aug 9, 2023
9b90ffd
sparse/unit_test: Initialize KernelHandle defaults
e10harvey Aug 10, 2023
14cf552
sparse/src: Fix PSGS stream cuda regressions
e10harvey Aug 10, 2023
609e23c
.github/workflows/osx.yml: Double timeout
e10harvey Aug 10, 2023
f05cbf1
sparse/src: Add gauss_seidel_symbolic overload
e10harvey Aug 10, 2023
92ad57f
docs: Add create_gs_handle docs
e10harvey Aug 10, 2023
21f8aae
docs: Improve docs formatting
e10harvey Aug 10, 2023
1adc5e4
sparse/unit_test: Launch nrm2 on stream
e10harvey Aug 14, 2023
2082b66
sparse/src: Add GS numeric overload
e10harvey Aug 15, 2023
dd3fbb5
sparse/src: Add GS apply overload.
e10harvey Aug 15, 2023
5c41061
Various fixes
e10harvey Aug 21, 2023
f5618f8
sparse/src: Update GS apply docs
e10harvey Aug 22, 2023
e8d809f
Pass format through
e10harvey Aug 22, 2023
956e4c8
Use ExecutionSpace for user-facing APIs
e10harvey Aug 22, 2023
c0f1991
error check to avoid undefined behavior
e10harvey Aug 22, 2023
492b9aa
Fix intel19 CI failure
e10harvey Aug 23, 2023
3af7aac
Add runtime checks to PSGS
e10harvey Aug 23, 2023
d398cd9
Implement PR feedback
e10harvey Aug 24, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/osx.yml
Original file line number Diff line number Diff line change
Expand Up @@ -111,4 +111,4 @@ jobs:

- name: test
working-directory: kokkos-kernels/build
run: ctest -j2 --output-on-failure --timeout 3600
run: ctest -j2 --output-on-failure --timeout 7200
80 changes: 71 additions & 9 deletions common/src/KokkosKernels_ExecSpaceUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,29 +157,68 @@ inline void kk_get_free_total_memory(size_t& /* free_mem */,
throw std::runtime_error(oss.str());
}

// Host function to determine free and total device memory.
// Will throw if execution space doesn't support this.
template <typename MemorySpace>
inline void kk_get_free_total_memory(size_t& /* free_mem */,
size_t& /* total_mem */,
int /* n_streams */) {
std::ostringstream oss;
oss << "Error: memory space " << MemorySpace::name()
<< " does not support querying free/total memory.";
throw std::runtime_error(oss.str());
}

#ifdef KOKKOS_ENABLE_CUDA
template <>
inline void kk_get_free_total_memory<Kokkos::CudaSpace>(size_t& free_mem,
size_t& total_mem) {
size_t& total_mem,
int n_streams) {
cudaMemGetInfo(&free_mem, &total_mem);
free_mem /= n_streams;
total_mem /= n_streams;
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaSpace>(size_t& free_mem,
size_t& total_mem) {
kk_get_free_total_memory<Kokkos::CudaSpace>(free_mem, total_mem, 1);
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaUVMSpace>(size_t& free_mem,
size_t& total_mem,
int n_streams) {
kk_get_free_total_memory<Kokkos::CudaSpace>(free_mem, total_mem, n_streams);
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaUVMSpace>(size_t& free_mem,
size_t& total_mem) {
cudaMemGetInfo(&free_mem, &total_mem);
kk_get_free_total_memory<Kokkos::CudaUVMSpace>(free_mem, total_mem, 1);
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaHostPinnedSpace>(
size_t& free_mem, size_t& total_mem, int n_streams) {
kk_get_free_total_memory<Kokkos::CudaSpace>(free_mem, total_mem, n_streams);
}
template <>
inline void kk_get_free_total_memory<Kokkos::CudaHostPinnedSpace>(
size_t& free_mem, size_t& total_mem) {
cudaMemGetInfo(&free_mem, &total_mem);
kk_get_free_total_memory<Kokkos::CudaHostPinnedSpace>(free_mem, total_mem, 1);
}
#endif

#ifdef KOKKOS_ENABLE_HIP
template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
size_t& free_mem, size_t& total_mem) {
size_t& free_mem, size_t& total_mem, int n_streams) {
KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem));
free_mem /= n_streams;
total_mem /= n_streams;
}
template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
size_t& free_mem, size_t& total_mem) {
kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(free_mem, total_mem,
1);
}
#endif

Expand All @@ -188,7 +227,7 @@ inline void kk_get_free_total_memory<Kokkos::Experimental::HIPSpace>(
#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU)
template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
size_t& free_mem, size_t& total_mem) {
size_t& free_mem, size_t& total_mem, int n_streams) {
sycl::queue queue;
sycl::device device = queue.get_device();
auto level_zero_handle =
Expand Down Expand Up @@ -220,20 +259,43 @@ inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
total_mem += memory_states.size;
free_mem += memory_states.free;
}
free_mem /= n_streams;
total_mem /= n_streams;
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
size_t& free_mem, size_t& total_mem) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
free_mem, total_mem, 1);
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLHostUSMSpace>(
size_t& free_mem, size_t& total_mem, int n_streams) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
free_mem, total_mem, n_streams);
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLHostUSMSpace>(
size_t& free_mem, size_t& total_mem) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(free_mem,
total_mem);
kk_get_free_total_memory<Kokkos::Experimental::SYCLHostUSMSpace>(
free_mem, total_mem, 1);
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLSharedUSMSpace>(
size_t& free_mem, size_t& total_mem, int n_streams) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(
free_mem, total_mem, n_streams);
}

template <>
inline void kk_get_free_total_memory<Kokkos::Experimental::SYCLSharedUSMSpace>(
size_t& free_mem, size_t& total_mem) {
kk_get_free_total_memory<Kokkos::Experimental::SYCLDeviceUSMSpace>(free_mem,
total_mem);
kk_get_free_total_memory<Kokkos::Experimental::SYCLSharedUSMSpace>(
free_mem, total_mem, 1);
}
#endif

Expand Down
32 changes: 24 additions & 8 deletions common/src/KokkosKernels_SimpleUtils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,22 +142,38 @@ inline void kk_exclusive_parallel_prefix_sum(
kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum);
}

/***
* \brief Function performs the inclusive parallel prefix sum. That is each
* entry holds the sum until itself including itself. \param num_elements: size
* of the array \param arr: the array for which the prefix sum will be
* performed.
*/
///
/// \brief Function performs the inclusive parallel prefix sum. That is each
/// entry holds the sum until itself including itself.
/// \param my_exec_space: The execution space instance
/// \param num_elements: size of the array
/// \param arr: the array for which the prefix sum will be performed.
///
template <typename forward_array_type, typename MyExecSpace>
void kk_inclusive_parallel_prefix_sum(
MyExecSpace my_exec_space,
typename forward_array_type::value_type num_elements,
forward_array_type arr) {
typedef Kokkos::RangePolicy<MyExecSpace> my_exec_space;
typedef Kokkos::RangePolicy<MyExecSpace> range_policy_t;
Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum",
my_exec_space(0, num_elements),
range_policy_t(my_exec_space, 0, num_elements),
InclusiveParallelPrefixSum<forward_array_type>(arr));
}

///
/// \brief Function performs the inclusive parallel prefix sum. That is each
/// entry holds the sum until itself including itself.
/// \param num_elements: size of the array
/// \param arr: the array for which the prefix sum will be performed.
///
template <typename forward_array_type, typename MyExecSpace>
void kk_inclusive_parallel_prefix_sum(
typename forward_array_type::value_type num_elements,
forward_array_type arr) {
MyExecSpace my_exec_space;
return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr);
}

template <typename view_t>
struct ReductionFunctor {
view_t array_sum;
Expand Down
Loading