diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 769957b953..8d9f7123f8 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -111,4 +111,4 @@ jobs: - name: test working-directory: kokkos-kernels/build - run: ctest -j2 --output-on-failure --timeout 3600 + run: ctest -j2 --output-on-failure --timeout 7200 diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index a0f6e39f4d..eb629f9e0c 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -157,29 +157,68 @@ inline void kk_get_free_total_memory(size_t& /* free_mem */, throw std::runtime_error(oss.str()); } +// Host function to determine free and total device memory. +// Will throw if execution space doesn't support this. +template +inline void kk_get_free_total_memory(size_t& /* free_mem */, + size_t& /* total_mem */, + int /* n_streams */) { + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() + << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + #ifdef KOKKOS_ENABLE_CUDA template <> inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { + size_t& total_mem, + int n_streams) { cudaMemGetInfo(&free_mem, &total_mem); + free_mem /= n_streams; + total_mem /= n_streams; +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem, + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem) { - cudaMemGetInfo(&free_mem, &total_mem); + kk_get_free_total_memory(free_mem, total_mem, 1); +} +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem, int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - cudaMemGetInfo(&free_mem, &total_mem); + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif #ifdef KOKKOS_ENABLE_HIP template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { + size_t& free_mem, size_t& total_mem, int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); + free_mem /= n_streams; + total_mem /= n_streams; +} +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, + 1); } #endif @@ -188,7 +227,7 @@ inline void kk_get_free_total_memory( #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { + size_t& free_mem, size_t& total_mem, int n_streams) { sycl::queue queue; sycl::device device = queue.get_device(); auto level_zero_handle = @@ -220,20 +259,43 @@ inline void kk_get_free_total_memory( total_mem += memory_states.size; free_mem += memory_states.free; } + free_mem /= n_streams; + total_mem /= n_streams; +} + +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory( + free_mem, total_mem, 1); +} + +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem, int n_streams) { + kk_get_free_total_memory( + free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, - total_mem); + kk_get_free_total_memory( + free_mem, total_mem, 1); +} + +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem, int n_streams) { + kk_get_free_total_memory( + free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, - total_mem); + kk_get_free_total_memory( + free_mem, total_mem, 1); } #endif diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index a271695246..e25ec54eb0 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -142,22 +142,38 @@ inline void kk_exclusive_parallel_prefix_sum( kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); } -/*** - * \brief Function performs the inclusive parallel prefix sum. That is each - * entry holds the sum until itself including itself. \param num_elements: size - * of the array \param arr: the array for which the prefix sum will be - * performed. - */ +/// +/// \brief Function performs the inclusive parallel prefix sum. That is each +/// entry holds the sum until itself including itself. +/// \param my_exec_space: The execution space instance +/// \param num_elements: size of the array +/// \param arr: the array for which the prefix sum will be performed. +/// template void kk_inclusive_parallel_prefix_sum( + MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, forward_array_type arr) { - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(0, num_elements), + range_policy_t(my_exec_space, 0, num_elements), InclusiveParallelPrefixSum(arr)); } +/// +/// \brief Function performs the inclusive parallel prefix sum. That is each +/// entry holds the sum until itself including itself. +/// \param num_elements: size of the array +/// \param arr: the array for which the prefix sum will be performed. +/// +template +void kk_inclusive_parallel_prefix_sum( + typename forward_array_type::value_type num_elements, + forward_array_type arr) { + MyExecSpace my_exec_space; + return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); +} + template struct ReductionFunctor { view_t array_sum; diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 2a4b749f92..c6780185a4 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -456,10 +456,19 @@ struct Fill_Reverse_Map { template void inclusive_parallel_prefix_sum( + MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, forward_array_type arr) { - kk_inclusive_parallel_prefix_sum( - num_elements, arr); + return kk_inclusive_parallel_prefix_sum( + my_exec_space, num_elements, arr); +} + +template +void inclusive_parallel_prefix_sum( + typename forward_array_type::value_type num_elements, + forward_array_type arr) { + MyExecSpace my_exec_space; + return inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template @@ -661,6 +670,7 @@ struct StridedCopy { template void create_reverse_map( + MyExecSpace my_exec_space, const typename reverse_array_type::value_type &num_forward_elements, // num_vertices const typename forward_array_type::value_type @@ -675,11 +685,13 @@ void create_reverse_map( const lno_t MINIMUM_TO_ATOMIC = 64; - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; reverse_map_xadj = - reverse_array_type("Reverse Map Xadj", num_reverse_elements + 1); + reverse_array_type(Kokkos::view_alloc(my_exec_space, "Reverse Map Xadj"), + num_reverse_elements + 1); reverse_map_adj = reverse_array_type( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "REVERSE_ADJ"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "REVERSE_ADJ"), num_forward_elements); if (num_reverse_elements < MINIMUM_TO_ATOMIC) { @@ -693,58 +705,82 @@ void create_reverse_map( const reverse_lno_t tmp_reverse_size = (num_reverse_elements + 1) << multiply_shift_for_scale; - reverse_array_type tmp_color_xadj("TMP_REVERSE_XADJ", tmp_reverse_size + 1); + reverse_array_type tmp_color_xadj( + Kokkos::view_alloc(my_exec_space, "TMP_REVERSE_XADJ"), + tmp_reverse_size + 1); Reverse_Map_Scale_Init rmi( forward_map, tmp_color_xadj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapScaleInit", - my_exec_space(0, num_forward_elements), rmi); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + rmi); + my_exec_space.fence(); inclusive_parallel_prefix_sum( - tmp_reverse_size + 1, tmp_color_xadj); - MyExecSpace().fence(); + my_exec_space, tmp_reverse_size + 1, tmp_color_xadj); + my_exec_space.fence(); - Kokkos::parallel_for("KokkosKernels::Common::StridedCopy", - my_exec_space(0, num_reverse_elements + 1), - StridedCopy( - tmp_color_xadj, reverse_map_xadj, scale_size)); - MyExecSpace().fence(); + Kokkos::parallel_for( + "KokkosKernels::Common::StridedCopy", + range_policy_t(my_exec_space, 0, num_reverse_elements + 1), + StridedCopy( + tmp_color_xadj, reverse_map_xadj, scale_size)); + my_exec_space.fence(); Fill_Reverse_Scale_Map frm( forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - my_exec_space(0, num_forward_elements), frm); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + frm); + my_exec_space.fence(); } else // atomic implementation. { reverse_array_type tmp_color_xadj( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "TMP_REVERSE_XADJ"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "TMP_REVERSE_XADJ"), num_reverse_elements + 1); Reverse_Map_Init rmi( forward_map, reverse_map_xadj); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapInit", - my_exec_space(0, num_forward_elements), rmi); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + rmi); + my_exec_space.fence(); // print_1Dview(reverse_map_xadj); inclusive_parallel_prefix_sum( - num_reverse_elements + 1, reverse_map_xadj); - MyExecSpace().fence(); - Kokkos::deep_copy(tmp_color_xadj, reverse_map_xadj); - MyExecSpace().fence(); + my_exec_space, num_reverse_elements + 1, reverse_map_xadj); + Kokkos::deep_copy(my_exec_space, tmp_color_xadj, reverse_map_xadj); + my_exec_space.fence(); Fill_Reverse_Map frm( forward_map, tmp_color_xadj, reverse_map_adj); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - my_exec_space(0, num_forward_elements), frm); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + frm); + my_exec_space.fence(); } } +template +void create_reverse_map( + const typename reverse_array_type::value_type + &num_forward_elements, // num_vertices + const typename forward_array_type::value_type + &num_reverse_elements, // num_colors + + const forward_array_type &forward_map, // vertex to colors + reverse_array_type &reverse_map_xadj, // colors to vertex xadj + reverse_array_type &reverse_map_adj) { + MyExecSpace my_exec_space; + return create_reverse_map(my_exec_space, num_forward_elements, + num_reverse_elements, forward_map, reverse_map_xadj, + reverse_map_adj); +} + template struct PermuteVector { @@ -772,18 +808,30 @@ struct PermuteVector { template -void permute_vector(typename idx_array_type::value_type num_elements, +void permute_vector(MyExecSpace my_exec_space, + typename idx_array_type::value_type num_elements, idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - typedef Kokkos::RangePolicy my_exec_space; + using range_policy_t = Kokkos::RangePolicy; Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", my_exec_space(0, num_elements), + "KokkosKernels::Common::PermuteVector", + range_policy_t(my_exec_space, 0, num_elements), PermuteVector( old_vector, new_vector, old_to_new_index_map)); } +template +void permute_vector(typename idx_array_type::value_type num_elements, + idx_array_type &old_to_new_index_map, + value_array_type &old_vector, + out_value_array_type &new_vector) { + permute_vector(MyExecSpace(), num_elements, old_to_new_index_map, old_vector, + new_vector); +} + template struct PermuteBlockVector { @@ -817,27 +865,51 @@ struct PermuteBlockVector { template -void permute_block_vector(typename idx_array_type::value_type num_elements, +void permute_block_vector(MyExecSpace my_exec_space, + typename idx_array_type::value_type num_elements, int block_size, idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - typedef Kokkos::RangePolicy my_exec_space; - + using range_policy_t = Kokkos::RangePolicy; Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", my_exec_space(0, num_elements), + "KokkosKernels::Common::PermuteVector", + range_policy_t(my_exec_space, 0, num_elements), PermuteBlockVector(block_size, old_vector, new_vector, old_to_new_index_map)); } +template +void permute_block_vector(typename idx_array_type::value_type num_elements, + int block_size, idx_array_type &old_to_new_index_map, + value_array_type &old_vector, + out_value_array_type &new_vector) { + permute_block_vector(MyExecSpace(), num_elements, block_size, + old_to_new_index_map, old_vector, new_vector); +} + // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. +template +void zero_vector(ExecSpaceIn &exec_space_in, + typename value_array_type::value_type /* num_elements */, + value_array_type &vector) { + typedef typename value_array_type::non_const_value_type val_type; + Kokkos::deep_copy(exec_space_in, vector, + Kokkos::ArithTraits::zero()); + exec_space_in.fence(); +} + template void zero_vector(typename value_array_type::value_type /* num_elements */, value_array_type &vector) { - typedef typename value_array_type::non_const_value_type val_type; - Kokkos::deep_copy(vector, Kokkos::ArithTraits::zero()); + using ne_tmp_t = typename value_array_type::value_type; + ne_tmp_t ne_tmp = ne_tmp_t(0); + MyExecSpace my_exec_space; + zero_vector(my_exec_space, ne_tmp, + vector); } template @@ -1250,17 +1322,30 @@ struct ReduceRowSizeFunctor { // view has num_rows+1 elements. template -void kk_view_reduce_max_row_size(const size_t num_rows, +void kk_view_reduce_max_row_size(MyExecSpace my_exec_space, + const size_t num_rows, const size_type *rowmap_view_begins, const size_type *rowmap_view_ends, size_type &max_row_size) { - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceMaxRowSize", my_exec_space(0, num_rows), + "KokkosKernels::Common::ViewReduceMaxRowSize", + range_policy_t(my_exec_space, 0, num_rows), ReduceRowSizeFunctor(rowmap_view_begins, rowmap_view_ends), max_row_size); } +// view has num_rows+1 elements. +template +void kk_view_reduce_max_row_size(const size_t num_rows, + const size_type *rowmap_view_begins, + const size_type *rowmap_view_ends, + size_type &max_row_size) { + return kk_view_reduce_max_row_size(MyExecSpace(), num_rows, + rowmap_view_begins, rowmap_view_ends, + max_row_size); +} + template struct ReduceMaxRowFunctor { view_type rowmap_view; diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 5129514198..415f72eec8 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -60,11 +60,20 @@ block_spgemm gauss_seidel ------------ +.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) +.. doxygenfunction:: gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: symmetric_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 119941cebc..2a8b164219 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -220,21 +220,17 @@ void runGS(const GS_Parameters& params) { &kh, nrows, nrows, A.graph.row_map, A.graph.entries, params.graph_symmetric); double symbolicLaunchTime = timer.seconds(); - std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double symbolicComputeTime = timer.seconds(); - std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; timer.reset(); KokkosSparse::Experimental::gauss_seidel_numeric( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, params.graph_symmetric); double numericLaunchTime = timer.seconds(); - std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double numericComputeTime = timer.seconds(); - std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; timer.reset(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { @@ -256,11 +252,9 @@ void runGS(const GS_Parameters& params) { } double applyLaunchTime = timer.seconds(); - std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double applyComputeTime = timer.seconds(); - std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; timer.reset(); kh.destroy_gs_handle(); // Now, compute the 2-norm of residual @@ -271,6 +265,12 @@ void runGS(const GS_Parameters& params) { KokkosSparse::spmv("N", alpha, A, x, beta, res); double resnorm = KokkosBlas::nrm2(res); + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; + std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; + std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; // note: this still works if the solution diverges std::cout << "Relative res norm: " << resnorm / bnorm << '\n'; } diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 0f03eb04b3..7391e00e3d 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -84,7 +84,7 @@ class PointGaussSeidel { typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; - typedef Kokkos::RangePolicy range_pol; + typedef Kokkos::RangePolicy range_policy_t; typedef typename HandleType::GraphColoringHandleType::color_view_t color_view_t; typedef typename HandleType::GraphColoringHandleType::color_t color_t; @@ -825,6 +825,8 @@ class PointGaussSeidel { void initialize_symbolic() { auto gsHandle = get_gs_handle(); const size_type longRowThreshold = gsHandle->get_long_row_threshold(); + const MyExecSpace my_exec_space = gsHandle->get_execution_space(); + const int num_streams = gsHandle->get_num_streams(); // Validate settings if (gsHandle->get_block_size() > 1 && longRowThreshold > 0) @@ -838,6 +840,7 @@ class PointGaussSeidel { #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif + // TODO: Pass my_exec_space into KokkosGraph kernels typename HandleType::GraphColoringHandleType::color_view_t colors; color_t numColors; { @@ -871,6 +874,9 @@ class PointGaussSeidel { colors = gchandle->get_vertex_colors(); numColors = gchandle->get_num_colors(); } + // Wait for coloring to finish on its stream + using ColoringExecSpace = typename HandleType::HandleExecSpace; + ColoringExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "COLORING_TIME:" << timer.seconds() << std::endl; timer.reset(); @@ -886,7 +892,8 @@ class PointGaussSeidel { for (int i = 0; i < num_rows; ++i) { h_colors(i) = i + 1; } - Kokkos::deep_copy(colors, h_colors); + Kokkos::deep_copy(my_exec_space, colors, h_colors); + my_exec_space.fence(); #endif nnz_lno_persistent_work_view_t color_xadj; nnz_lno_persistent_work_view_t color_adj; @@ -896,10 +903,10 @@ class PointGaussSeidel { KokkosKernels::Impl::create_reverse_map< typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, numColors, colors, color_xadj, color_adj); + my_exec_space, num_rows, numColors, colors, color_xadj, color_adj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -909,7 +916,7 @@ class PointGaussSeidel { Kokkos::deep_copy(h_color_xadj, color_xadj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "DEEP_COPY:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -917,11 +924,11 @@ class PointGaussSeidel { // Count long rows per color set, and sort color sets so that long rows // come after regular rows nnz_lno_persistent_work_view_t long_rows_per_color( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "long_rows_per_color"), numColors); nnz_lno_persistent_work_view_t max_row_length_per_color( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "max_row_length_per_color"), numColors); nnz_lno_t mostLongRowsInColor = 0; @@ -930,33 +937,38 @@ class PointGaussSeidel { max_row_length_per_color); int sortLongRowsTeamSize = 1; { - team_policy_t temp(1, 1); + team_policy_t temp(my_exec_space, 1, 1); sortLongRowsTeamSize = temp.team_size_recommended( sortIntoLongRowsFunctor, Kokkos::ParallelReduceTag()); } - Kokkos::parallel_reduce(team_policy_t(numColors, sortLongRowsTeamSize), - sortIntoLongRowsFunctor, - Kokkos::Max(mostLongRowsInColor)); + Kokkos::parallel_reduce( + team_policy_t(my_exec_space, numColors, sortLongRowsTeamSize), + sortIntoLongRowsFunctor, Kokkos::Max(mostLongRowsInColor)); auto host_long_rows_per_color = Kokkos::create_mirror_view(long_rows_per_color); - Kokkos::deep_copy(host_long_rows_per_color, long_rows_per_color); + Kokkos::deep_copy(my_exec_space, host_long_rows_per_color, + long_rows_per_color); + my_exec_space.fence(); gsHandle->set_long_rows_per_color(host_long_rows_per_color); auto host_max_row_length_per_color = Kokkos::create_mirror_view(max_row_length_per_color); - Kokkos::deep_copy(host_max_row_length_per_color, + Kokkos::deep_copy(my_exec_space, host_max_row_length_per_color, max_row_length_per_color); + my_exec_space.fence(); gsHandle->set_max_row_length_per_color(host_max_row_length_per_color); scalar_persistent_work_view_t long_row_x( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "long_row_x"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "long_row_x"), mostLongRowsInColor); gsHandle->set_long_row_x(long_row_x); } else { // Just sort rows by ID. KokkosSparse::sort_crs_graph(color_xadj, color_adj); + decltype(color_adj)>(my_exec_space, + color_xadj, color_adj); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "SORT_TIME:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -968,29 +980,29 @@ class PointGaussSeidel { Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), create_permuted_xadj(color_adj, xadj, permuted_xadj, old_to_new_map)); // std::cout << "create_permuted_xadj" << std::endl; #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "CREATE_PERMUTED_XADJ:" << timer.seconds() << std::endl; timer.reset(); #endif KokkosKernels::Impl::inclusive_parallel_prefix_sum< - row_lno_persistent_work_view_t, MyExecSpace>(num_rows + 1, - permuted_xadj); + row_lno_persistent_work_view_t, MyExecSpace>( + my_exec_space, num_rows + 1, permuted_xadj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "INCLUSIVE_PPS:" << timer.seconds() << std::endl; timer.reset(); #endif Kokkos::parallel_for("KokkosSparse::PointGaussSeidel::fill_matrix_symbolic", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), fill_matrix_symbolic(num_rows, color_adj, xadj, adj, // adj_vals, permuted_xadj, permuted_adj, @@ -998,7 +1010,7 @@ class PointGaussSeidel { old_to_new_map)); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "SYMBOLIC_FILL:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -1012,8 +1024,8 @@ class PointGaussSeidel { // first calculate max row size. size_type max_row_size = 0; KokkosKernels::Impl::kk_view_reduce_max_row_size( - num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, - max_row_size); + my_exec_space, num_rows, permuted_xadj.data(), + permuted_xadj.data() + 1, max_row_size); nnz_lno_t brows = permuted_xadj.extent(0) - 1; size_type bnnz = permuted_adj.extent(0) * block_size * block_size; @@ -1079,10 +1091,11 @@ class PointGaussSeidel { size_type num_large_rows = 0; KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold< row_lno_persistent_work_view_t, MyExecSpace>( - brows, permuted_xadj, num_values_in_l1, num_large_rows); + my_exec_space, brows, permuted_xadj, num_values_in_l1, + num_large_rows); num_big_rows = KOKKOSKERNELS_MACRO_MIN( num_large_rows, - (size_type)(MyExecSpace().concurrency() / suggested_vector_size)); + (size_type)(my_exec_space.concurrency() / suggested_vector_size)); // std::cout << "num_big_rows:" << num_big_rows << std::endl; if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { @@ -1091,8 +1104,8 @@ class PointGaussSeidel { size_t free_byte; size_t total_byte; KokkosKernels::Impl::kk_get_free_total_memory< - typename pool_memory_space::memory_space>(free_byte, - total_byte); + typename pool_memory_space::memory_space>(free_byte, total_byte, + num_streams); size_t required_size = size_t(num_big_rows) * level_2_mem; if (required_size + num_big_rows * sizeof(int) > free_byte) { num_big_rows = @@ -1125,6 +1138,7 @@ class PointGaussSeidel { gsHandle->set_new_adj(permuted_adj); gsHandle->set_old_to_new_map(old_to_new_map); gsHandle->set_call_symbolic(true); + my_exec_space.fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "ALLOC:" << timer.seconds() << std::endl; #endif @@ -1335,7 +1349,14 @@ class PointGaussSeidel { if (gsHandle->is_symbolic_called() == false) { this->initialize_symbolic(); } - // else + + // Check settings + if (gsHandle->get_block_size() > 1 && + format != KokkosSparse::SparseMatrixFormat::BSR) + throw std::runtime_error( + "PointGaussSeidel block size > 1 but format is not " + "KokkosSparse::SparseMatrixFormat::BSR.\n"); + // else #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif @@ -1343,6 +1364,7 @@ class PointGaussSeidel { const_lno_row_view_t xadj = this->row_map; const_lno_nnz_view_t adj = this->entries; const_scalar_nnz_view_t adj_vals = this->values; + MyExecSpace my_exec_space = gsHandle->get_execution_space(); size_type nnz = adj_vals.extent(0); @@ -1353,14 +1375,16 @@ class PointGaussSeidel { nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); scalar_persistent_work_view_t permuted_adj_vals( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "newvals_"), nnz); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "newvals_"), + nnz); int suggested_vector_size = this->handle->get_suggested_vector_size(num_rows, nnz); int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t rows_per_team = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), num_rows); + suggested_team_size, my_exec_space.concurrency(), num_rows); nnz_lno_t block_size = gsHandle->get_block_size(); nnz_lno_t block_matrix_size = block_size * block_size; @@ -1384,7 +1408,8 @@ class PointGaussSeidel { if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), fill_matrix_numeric(color_adj, xadj, // adj, @@ -1396,7 +1421,7 @@ class PointGaussSeidel { block_matrix_size)); } else { Kokkos::parallel_for("KokkosSparse::GaussSeidel::fill_matrix_numeric", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), fill_matrix_numeric(color_adj, xadj, // adj, adj_vals, newxadj_, @@ -1409,7 +1434,7 @@ class PointGaussSeidel { gsHandle->set_new_adj_val(permuted_adj_vals); scalar_persistent_work_view_t permuted_inverse_diagonal( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "permuted_inverse_diagonal"), num_rows * block_size); if (!have_diagonal_given) { @@ -1421,13 +1446,14 @@ class PointGaussSeidel { block_size > 1) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::team_get_matrix_diagonals", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd); } else { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::get_matrix_diagonals", - range_pol(0, num_rows), gmd); + range_policy_t(my_exec_space, 0, num_rows), gmd); } } else { @@ -1435,21 +1461,20 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_block_vector< const_scalar_nnz_view_t, scalar_persistent_work_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, block_size, old_to_new_map, given_inverse_diagonal, - permuted_inverse_diagonal); + my_exec_space, num_rows, block_size, old_to_new_map, + given_inverse_diagonal, permuted_inverse_diagonal); else KokkosKernels::Impl::permute_vector< const_scalar_nnz_view_t, scalar_persistent_work_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, old_to_new_map, given_inverse_diagonal, + my_exec_space, num_rows, old_to_new_map, given_inverse_diagonal, permuted_inverse_diagonal); } - gsHandle->set_permuted_inverse_diagonal(permuted_inverse_diagonal); gsHandle->set_call_numeric(true); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "NUMERIC:" << timer.seconds() << std::endl; #endif } @@ -1511,24 +1536,25 @@ class PointGaussSeidel { scalar_persistent_work_view_t permuted_inverse_diagonal = gsHandle->get_permuted_inverse_diagonal(); - color_t numColors = gsHandle->get_num_colors(); + color_t numColors = gsHandle->get_num_colors(); + auto my_exec_space = gsHandle->get_execution_space(); if (update_y_vector) { KokkosKernels::Impl::permute_block_vector< y_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, block_size, old_to_new_map, y_rhs_input_vec, + my_exec_space, num_rows, block_size, old_to_new_map, y_rhs_input_vec, Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector(num_cols * block_size, - Permuted_Xvector); + KokkosKernels::Impl::zero_vector< + MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + my_exec_space, num_cols * block_size, Permuted_Xvector); } else { KokkosKernels::Impl::permute_block_vector< x_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, block_size, old_to_new_map, x_lhs_output_vec, + my_exec_space, num_cols, block_size, old_to_new_map, x_lhs_output_vec, Permuted_Xvector); } @@ -1561,7 +1587,7 @@ class PointGaussSeidel { int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), brows); + suggested_team_size, my_exec_space.concurrency(), brows); // size_t shmem_size_to_use = this->handle->get_shmem_size(); size_t l1_shmem_size = gsHandle->get_level_1_mem(); @@ -1591,13 +1617,11 @@ class PointGaussSeidel { this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, apply_backward); - // Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, - // Permuted_Xvector, color_adj)); - KokkosKernels::Impl::permute_block_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, block_size, color_adj, Permuted_Xvector, x_lhs_output_vec); + my_exec_space, num_cols, block_size, color_adj, Permuted_Xvector, + x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); @@ -1615,7 +1639,8 @@ class PointGaussSeidel { nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) { - auto gsHandle = get_gs_handle(); + auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); auto Permuted_Xvector = gsHandle->get_permuted_x_vector(); auto Permuted_Yvector = gsHandle->get_permuted_y_vector(); @@ -1635,16 +1660,19 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_vector< y_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, old_to_new_map, y_rhs_input_vec, Permuted_Yvector); + my_exec_space, num_rows, old_to_new_map, y_rhs_input_vec, + Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector(num_cols, Permuted_Xvector); + KokkosKernels::Impl::zero_vector< + MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + my_exec_space, num_cols, Permuted_Xvector); } else { KokkosKernels::Impl::permute_vector< x_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, old_to_new_map, x_lhs_output_vec, Permuted_Xvector); + my_exec_space, num_cols, old_to_new_map, x_lhs_output_vec, + Permuted_Xvector); } #if KOKKOSSPARSE_IMPL_PRINTDEBUG @@ -1673,14 +1701,12 @@ class PointGaussSeidel { apply_backward); } - // Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, - // Permuted_Xvector, color_adj)); - KokkosKernels::Impl::permute_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); + my_exec_space, num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG + Kokkos::fence(); std::cout << "--point After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); std::cout << "--point Result X:"; @@ -1699,6 +1725,14 @@ class PointGaussSeidel { if (gsHandle->is_numeric_called() == false) { this->initialize_numeric(); } + + // Check settings + if (gsHandle->get_block_size() > 1 && + format != KokkosSparse::SparseMatrixFormat::BSR) + throw std::runtime_error( + "PointGaussSeidel block size > 1 but format is not " + "KokkosSparse::SparseMatrixFormat::BSR.\n"); + // make sure x and y have been allocated with the correct dimensions nnz_lno_t block_size = gsHandle->get_block_size(); gsHandle->allocate_x_y_vectors(this->num_rows * block_size, @@ -1719,7 +1753,8 @@ class PointGaussSeidel { nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { - auto gsHandle = this->get_gs_handle(); + auto gsHandle = this->get_gs_handle(); + MyExecSpace my_exec_space = gsHandle->get_execution_space(); nnz_lno_persistent_work_host_view_t long_rows_per_color; nnz_lno_persistent_work_host_view_t max_row_length_per_color; scalar_persistent_work_view_t long_row_x; @@ -1730,7 +1765,7 @@ class PointGaussSeidel { max_row_length_per_color = gsHandle->get_max_row_length_per_color(); long_row_x = gsHandle->get_long_row_x(); haveLongRows = true; - longrow_apply_team_policy_t tempPolicy(1, 1); + longrow_apply_team_policy_t tempPolicy(my_exec_space, 1, 1); longRowTeamSize = tempPolicy.team_size_recommended(gs, Kokkos::ParallelForTag()); } @@ -1779,7 +1814,8 @@ class PointGaussSeidel { Kokkos::parallel_for( labelRegular, Kokkos::Experimental::require( - team_policy_t((numRegularRows + team_row_chunk_size - 1) / + team_policy_t(my_exec_space, + (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), @@ -1789,6 +1825,7 @@ class PointGaussSeidel { labelBlock, Kokkos::Experimental::require( block_apply_team_policy_t( + my_exec_space, (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), @@ -1799,6 +1836,7 @@ class PointGaussSeidel { labelBigBlock, Kokkos::Experimental::require( bigblock_apply_team_policy_t( + my_exec_space, (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), @@ -1821,18 +1859,21 @@ class PointGaussSeidel { auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; - Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( - longrow_apply_team_policy_t(numLongRows * teams_per_row, + longrow_apply_team_policy_t(my_exec_space, + numLongRows * teams_per_row, longRowTeamSize), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", Kokkos::Experimental::require( - range_pol(color_index_end - numLongRows, color_index_end), + range_policy_t(my_exec_space, + color_index_end - numLongRows, + color_index_end), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, @@ -1848,7 +1889,8 @@ class PointGaussSeidel { nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { - auto gsHandle = this->get_gs_handle(); + auto gsHandle = this->get_gs_handle(); + MyExecSpace my_exec_space = gsHandle->get_execution_space(); nnz_lno_persistent_work_host_view_t long_rows_per_color; nnz_lno_persistent_work_host_view_t max_row_length_per_color; scalar_persistent_work_view_t long_row_x; @@ -1885,7 +1927,8 @@ class PointGaussSeidel { Kokkos::parallel_for( labelShort, Kokkos::Experimental::require( - range_pol(color_index_begin, color_index_end - numLongRows), + range_policy_t(my_exec_space, color_index_begin, + color_index_end - numLongRows), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } @@ -1901,18 +1944,20 @@ class PointGaussSeidel { auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; - Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( Kokkos::RangePolicy( - 0, numLongRows * par_per_row), + my_exec_space, 0, numLongRows * par_per_row), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", Kokkos::Experimental::require( - range_pol(color_index_end - numLongRows, color_index_end), + range_policy_t(my_exec_space, + color_index_end - numLongRows, + color_index_end), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index f04ae34fc9..840ced73b8 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -120,41 +120,47 @@ struct gauss_seidel_apply_eti_spec_avail { namespace KokkosSparse { namespace Impl { -template ::value, bool eti_spec_avail = gauss_seidel_symbolic_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t>::value> struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, bool is_graph_symmetric); }; template < - class KernelHandle, KokkosSparse::SparseMatrixFormat format, - class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t, + class ExecSpaceIn, class KernelHandle, + KokkosSparse::SparseMatrixFormat format, class a_size_view_t_, + class a_lno_view_t, class a_scalar_view_t, bool tpl_spec_avail = gauss_seidel_numeric_tpl_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value, bool eti_spec_avail = gauss_seidel_numeric_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value> struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric); static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric); }; -template ::value, @@ -163,7 +169,8 @@ template ::value> struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -174,15 +181,19 @@ struct GAUSS_SEIDEL_APPLY { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct GAUSS_SEIDEL_SYMBOLIC +struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t_ entries, bool is_graph_symmetric) { Kokkos::Profiling::pushRegion("KokkosSparse::Impl::gauss_seidel_symbolic"); auto gsHandle = handle->get_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel< KernelHandle, a_size_view_t_, a_lno_view_t_, @@ -206,17 +217,20 @@ struct GAUSS_SEIDEL_SYMBOLIC -struct GAUSS_SEIDEL_NUMERIC +struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric) { Kokkos::Profiling::pushRegion("KokkosSparse::Impl::gauss_seidel_numeric"); auto gsHandle = handle->get_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidelget_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel -struct GAUSS_SEIDEL_APPLY { +template +struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -289,6 +308,7 @@ struct GAUSS_SEIDEL_APPLYget_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel, \ @@ -337,6 +358,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -352,6 +374,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -367,6 +390,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ extern template struct GAUSS_SEIDEL_NUMERIC< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -386,6 +410,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -401,6 +426,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ template struct GAUSS_SEIDEL_NUMERIC< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -420,6 +446,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -441,6 +468,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ extern template struct GAUSS_SEIDEL_APPLY< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -466,6 +494,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -487,6 +516,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ template struct GAUSS_SEIDEL_APPLY< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index dae3f12462..d5a24ac1f1 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -601,7 +601,36 @@ class KokkosKernelsHandle { "GS."); return cgs; } + + // clang-format off + /** + * @brief Create a gauss seidel handle object + * + * @param handle_exec_space The execution space instance to execute kernels on. + * @param num_streams The number of streams to allocate memory for. + * @param gs_algorithm Specifies which algorithm to use: + * + * KokkosSpace::GS_DEFAULT PointGaussSeidel + * KokkosSpace::GS_PERMUTED ?? + * KokkosSpace::GS_TEAM ?? + * KokkosSpace::GS_CLUSTER ?? + * KokkosSpace::GS_TWOSTAGE ?? + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) + */ + // clang-format on void create_gs_handle( + const HandleExecSpace &handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { @@ -610,10 +639,45 @@ class KokkosKernelsHandle { // ---------------------------------------- // // Two-stage Gauss-Seidel if (gs_algorithm == KokkosSparse::GS_TWOSTAGE) - this->gsHandle = new TwoStageGaussSeidelHandleType(); - else this->gsHandle = - new PointGaussSeidelHandleType(gs_algorithm, coloring_algorithm); + new TwoStageGaussSeidelHandleType(handle_exec_space, num_streams); + else + this->gsHandle = new PointGaussSeidelHandleType( + handle_exec_space, num_streams, gs_algorithm, coloring_algorithm); + } + + // clang-format off + /** + * @brief Create a gauss seidel handle object + * + * @param gs_algorithm Specifies which algorithm to use: + * + * KokkosSpace::GS_DEFAULT PointGaussSeidel + * KokkosSpace::GS_PERMUTED ?? + * KokkosSpace::GS_TEAM ?? + * KokkosSpace::GS_CLUSTER ?? + * KokkosSpace::GS_TWOSTAGE ?? + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) + */ + // clang-format on + void create_gs_handle( + KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algorithm = + KokkosGraph::COLORING_DEFAULT) { + HandleExecSpace handle_exec_space; + return create_gs_handle(handle_exec_space, 1, gs_algorithm, + coloring_algorithm); } // ---------------------------------------- // // Two-stage Gauss-Seidel handle @@ -672,6 +736,31 @@ class KokkosKernelsHandle { gs2->setCompactForm(compact_form); } + // clang-format off + /** + * @brief Create a gs handle object + * + * @param clusterAlgo Specifies which clustering algorithm to use: + * + * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? + * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? + * @param hint_verts_per_cluster Hint how many verticies to use per cluster + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) + */ + // clang-format on void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 4039b6f5a7..f61f470814 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -1885,17 +1885,27 @@ struct ReduceLargerRowCount { template void kk_reduce_numrows_larger_than_threshold( - size_t num_elements, view_type view_to_reduce, - typename view_type::const_value_type threshold, + const MyExecSpace &my_exec_space, size_t num_elements, + view_type view_to_reduce, typename view_type::const_value_type threshold, typename view_type::non_const_value_type &sum_reduction) { - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceNumRowsLargerThanThreshold", - my_exec_space(0, num_elements), + range_policy_t(my_exec_space, 0, num_elements), ReduceLargerRowCount(view_to_reduce, threshold), sum_reduction); } +template +void kk_reduce_numrows_larger_than_threshold( + size_t num_elements, view_type view_to_reduce, + typename view_type::const_value_type threshold, + typename view_type::non_const_value_type &sum_reduction) { + MyExecSpace my_exec_space; + kk_reduce_numrows_larger_than_threshold( + my_exec_space, num_elements, view_to_reduce, threshold, sum_reduction); +} + // Note: "block" in member name means it's block internal - otherwise it // addresses sparse rows/columns (whole blocks) within whole matrix. template diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 9f1b9d8cb1..036fe1b119 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,10 +29,12 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param space The execution space instance this kernel will be run on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -42,9 +44,9 @@ namespace Experimental { /// num_rows submatrix of A is structurally symmetric /// @pre handle->create_gs_handle(...) has been called previously /// -template -void gauss_seidel_symbolic(KernelHandle *handle, +template +void gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -95,13 +97,42 @@ void gauss_seidel_symbolic(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_SYMBOLIC< - const_handle_type, Internal_alno_row_view_t_, - Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(&tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, + ExecutionSpace, const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity +/// pattern only) +/// +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @pre handle->create_gs_handle(...) has been called previously +/// +template +void gauss_seidel_symbolic(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_symbolic(my_exec_space, handle, num_rows, num_cols, row_map, + entries, is_graph_symmetric); +} + /// /// @brief Block Gauss-Seidel preconditioner setup (first phase, based on /// sparsity pattern only) @@ -142,12 +173,14 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param space The execution space instance this kernel will be run on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -157,11 +190,12 @@ void block_gauss_seidel_symbolic( /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template -void gauss_seidel_numeric(KernelHandle *handle, +void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -225,11 +259,12 @@ void gauss_seidel_numeric(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - const_handle_type, format, Internal_alno_row_view_t_, + ExecutionSpace, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(&tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, const_a_v, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, + const_a_v, is_graph_symmetric); } @@ -251,7 +286,6 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @param row_map The matrix's rowmap /// @param entries The matrix's entries /// @param values The matrix's values -/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// @remark If the inverse diagonal is not already available, it's best to call @@ -263,6 +297,50 @@ template void gauss_seidel_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + is_graph_symmetric); +} + +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values). This version accepts the matrix's inverse diagonal from the +/// user. +/// +/// @tparam ExecutionSpace This kernels execution space type. +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided +/// inverse diagonal must share this type. +/// @param space The execution space instance this kernel will be run on. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. +template +void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -329,15 +407,57 @@ void gauss_seidel_numeric(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - const_handle_type, format, Internal_alno_row_view_t_, + ExecutionSpace, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(&tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, const_a_v, - const_a_d, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, + const_a_v, const_a_d, is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values). This version accepts the matrix's inverse diagonal from the +/// user. +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided +/// inverse diagonal must share this type. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. +template +void gauss_seidel_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, + scalar_nnz_view_t_ given_inverse_diagonal, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + given_inverse_diagonal, is_graph_symmetric); +} + /// /// @brief Block Gauss-Seidel preconditioner setup (second phase, based on /// matrix's numeric values) @@ -384,6 +504,7 @@ void block_gauss_seidel_numeric( /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -394,6 +515,8 @@ void block_gauss_seidel_numeric( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param space The execution space instance this kernel will be run +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -410,13 +533,15 @@ void block_gauss_seidel_numeric( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void symmetric_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecutionSpace &space, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -532,13 +657,63 @@ void symmetric_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, true, true); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + true, true); +} + +/// +/// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to +/// system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle handle A KokkosKernelsHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run (forward and backward counts as 1) +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void symmetric_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + symmetric_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// @@ -621,6 +796,8 @@ void symmetric_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param space The execution space instance this kernel will be run +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -637,13 +814,15 @@ void symmetric_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void forward_sweep_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecutionSpace &space, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -761,13 +940,62 @@ void forward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, true, false); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + true, false); +} + +/// +/// @brief Apply forward Gauss-Seidel preconditioner to system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void forward_sweep_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + forward_sweep_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// @@ -839,6 +1067,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -849,6 +1078,8 @@ void forward_sweep_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param space The execution space instance this kernel will be run +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -865,13 +1096,15 @@ void forward_sweep_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void backward_sweep_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + const ExecutionSpace &space, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -989,13 +1222,62 @@ void backward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, false, true); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + false, true); +} + +/// +/// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void backward_sweep_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + backward_sweep_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 412985df72..649229918d 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -84,6 +84,9 @@ class GaussSeidelHandle { nnz_lno_persistent_work_host_view_t; // Host view type protected: + HandleExecSpace execution_space; + int num_streams; + GSAlgorithm algorithm_type; nnz_lno_persistent_work_host_view_t color_xadj; @@ -101,7 +104,22 @@ class GaussSeidelHandle { * \brief Default constructor. */ GaussSeidelHandle(GSAlgorithm gs) - : algorithm_type(gs), + : execution_space(HandleExecSpace()), + num_streams(1), + algorithm_type(gs), + color_xadj(), + color_adj(), + numColors(0), + called_symbolic(false), + called_numeric(false), + suggested_vector_size(0), + suggested_team_size(0) {} + + GaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + GSAlgorithm gs) + : execution_space(handle_exec_space), + num_streams(n_streams), + algorithm_type(gs), color_xadj(), color_adj(), numColors(0), @@ -113,6 +131,10 @@ class GaussSeidelHandle { virtual ~GaussSeidelHandle() = default; // getters + int get_num_streams() const { return num_streams; } + + HandleExecSpace get_execution_space() const { return this->execution_space; } + GSAlgorithm get_algorithm_type() const { return this->algorithm_type; } nnz_lno_persistent_work_host_view_t get_color_xadj() const { @@ -126,7 +148,24 @@ class GaussSeidelHandle { bool is_symbolic_called() const { return this->called_symbolic; } bool is_numeric_called() const { return this->called_numeric; } - // setters + template + void set_execution_space(const ExecSpaceIn exec_space_in) { + static bool is_set = false; + if (!is_set) { + static_assert(std::is_same::value, + "The type of exec_space_in should be the same as " + "GaussSeidelHandle::HandleExecSpace"); + this->execution_space = exec_space_in; + } else { + if (exec_space_in != this->execution_space) + throw std::runtime_error( + "Gauss Seidel cannot be called on different execution spaces " + "without multiple handles. Please create a new handle via " + "create_gs_handle.\n"); + } + is_set = true; + } + void set_algorithm_type(const GSAlgorithm sgs_algo) { this->algorithm_type = sgs_algo; this->called_symbolic = false; @@ -244,10 +283,10 @@ class PointGaussSeidelHandle /** * \brief Default constructor. */ - PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT, + PointGaussSeidelHandle(GSHandle gs_handle, KokkosGraph::ColoringAlgorithm coloring_algo_ = KokkosGraph::COLORING_DEFAULT) - : GSHandle(gs), + : GSHandle(gs_handle), permuted_xadj(), permuted_adj(), permuted_adj_vals(), @@ -263,9 +302,22 @@ class PointGaussSeidelHandle level_2_mem(0), long_row_threshold(0), coloring_algo(coloring_algo_) { - if (gs == GS_DEFAULT) this->choose_default_algorithm(); + if (gs_handle.get_algorithm_type() == GS_DEFAULT) + this->choose_default_algorithm(); } + PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algo_ = + KokkosGraph::COLORING_DEFAULT) + : PointGaussSeidelHandle(GSHandle(gs), coloring_algo_) {} + + PointGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + GSAlgorithm gs = GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algo_ = + KokkosGraph::COLORING_DEFAULT) + : PointGaussSeidelHandle(GSHandle(handle_exec_space, n_streams, gs), + coloring_algo_) {} + void set_block_size(nnz_lno_t bs) { this->block_size = bs; } nnz_lno_t get_block_size() const { return this->block_size; } @@ -613,8 +665,15 @@ class TwoStageGaussSeidelHandle ExecutionSpace, TemporaryMemorySpace, PersistentMemorySpace>; - TwoStageGaussSeidelHandle() - : GSHandle(GS_TWOSTAGE), + using HandleExecSpace = typename GSHandle::HandleExecSpace; + + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + * @param gs_handle The GaussSeidel handle. + */ + TwoStageGaussSeidelHandle(GSHandle gs_handle) + : GSHandle(gs_handle), nrows(0), nrhs(1), direction(GS_SYMMETRIC), @@ -626,6 +685,23 @@ class TwoStageGaussSeidelHandle inner_omega = one; } + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + */ + TwoStageGaussSeidelHandle() + : TwoStageGaussSeidelHandle(GSHandle(GS_TWOSTAGE)) {} + + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + * @param handle_exec_space The execution space instance + * @param n_streams the number of streams + */ + TwoStageGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams) + : TwoStageGaussSeidelHandle( + GSHandle(handle_exec_space, n_streams, GS_TWOSTAGE)) {} + // Sweep direction void setSweepDirection(GSDirection direction_) { this->direction = direction_; diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 358205b713..35fbcb44a4 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -56,7 +56,7 @@ namespace Test { // Run GS on the given vectors, where the handle is already set up. template void run_gauss_seidel( - Handle& kh, crsMat_t input_mat, vec_t x_vector, vec_t y_vector, + Handle &kh, crsMat_t input_mat, vec_t x_vector, vec_t y_vector, bool is_symmetric_graph, typename crsMat_t::value_type omega, int apply_type = 0 // 0 for symmetric, 1 for forward, 2 for backward. ) { @@ -142,6 +142,59 @@ void run_gauss_seidel( kh.destroy_gs_handle(); } +template +void run_gauss_seidel_streams( + std::vector &instances, std::vector &kh, + std::vector &input_mat, std::vector &x_vector, + std::vector &y_vector, bool is_symmetric_graph, + typename crsMat_t::value_type omega, + int apply_type, // 0 for symmetric, 1 for forward, 2 for backward. + int nstreams = 1) { + for (int i = 0; i < nstreams; i++) { + gauss_seidel_symbolic(instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, is_symmetric_graph); + gauss_seidel_numeric(instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, + is_symmetric_graph); + } + + const int apply_count = 2; + for (int i = 0; i < nstreams; i++) { + switch (apply_type) { + case 0: + symmetric_gauss_seidel_apply( + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); + break; + case 1: + forward_sweep_gauss_seidel_apply( + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); + break; + case 2: + backward_sweep_gauss_seidel_apply( + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); + break; + default: + symmetric_gauss_seidel_apply( + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); + break; + } + } +} } // namespace Test template crsMat_t; - typedef Kokkos::View scalar_view2d_t; - typedef Kokkos::View + typedef Kokkos::View scalar_view2d_t; + typedef Kokkos::View host_scalar_view2d_t; typedef typename Kokkos::ArithTraits::mag_type mag_t; @@ -396,7 +449,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, // initial solution is zero Kokkos::deep_copy(x_host, zero); // get the inverse diagonal (only needed on host) - Kokkos::View invDiag("diag^-1", numRows); + Kokkos::View invDiag("diag^-1", numRows); for (lno_t i = 0; i < numRows; i++) { for (size_type j = rowmap(i); j < rowmap(i + 1); j++) { if (entries(j) == i) invDiag(i) = one / values(j); @@ -574,11 +627,11 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries"), totalEntries); rowmap_view_t rowmapView( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Rowmap"), numRows + 1); - Kokkos::deep_copy(valuesView, Kokkos::View( + Kokkos::deep_copy(valuesView, Kokkos::View( values.data(), totalEntries)); - Kokkos::deep_copy(entriesView, Kokkos::View( + Kokkos::deep_copy(entriesView, Kokkos::View( entries.data(), totalEntries)); - Kokkos::deep_copy(rowmapView, Kokkos::View( + Kokkos::deep_copy(rowmapView, Kokkos::View( rowmap.data(), numRows + 1)); crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); @@ -662,58 +715,201 @@ void test_gauss_seidel_custom_coloring(lno_t numRows, lno_t nnzPerRow) { EXPECT_LT(result_norm_res, 0.25 * initial_norm_res); } -#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank1(2000, 2000 * 20, \ - 200, 10, false); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_asymmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank2( \ - 2000, 2000 * 20, 200, 10, 3, false); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_symmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank1(2000, 2000 * 20, \ - 200, 10, true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_symmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank2( \ - 2000, 2000 * 20, 200, 10, 3, true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_empty##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_empty(); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##balloon_clustering##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_balloon_clustering(5000, 100, 2000); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##sequential_sor##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_sequential_sor(1000, 1000 * 15, 50, \ - 10); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_long_rows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_long_rows(500, 10, 20, \ - true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_custom_coloring##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_custom_coloring(500, \ - 10); \ +template +void test_gauss_seidel_streams_rank1( + lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, + bool symmetric, double omega, + KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT, + int nstreams = 1) { + using namespace Test; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using mag_t = typename Kokkos::ArithTraits::mag_type; + using execution_space = typename device::execution_space; + + using const_size_type = const size_type; + using const_lno_t = const lno_t; + using const_scalar_t = const scalar_t; + using KernelHandle = + KokkosKernelsHandle; + srand(245); + lno_t numCols = numRows; + typename crsMat_t::value_type m_omega = omega; + +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same_v) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + std::cerr << "TEST SKIPPED: Not enough concurrency to partition " + "execution space. exec_concurrency: " + << exec_concurrency << std::endl; + return; + } + } +#endif // KOKKOS_ENABLE_OPENMP + + std::vector instances; + if (nstreams == 1) + instances = Kokkos::Experimental::partition_space(execution_space(), 1); + else if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector input_mat_v(nstreams); + std::vector solution_x_v(nstreams); + std::vector x_vector_v(nstreams); + std::vector y_vector_v(nstreams); + std::vector initial_norm_res_v(nstreams); + + const scalar_t one = Kokkos::ArithTraits::one(); + const scalar_t zero = Kokkos::ArithTraits::zero(); + + for (int i = 0; i < nstreams; i++) { + input_mat_v[i] = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); + + if (symmetric) { + // Symmetrize on host, rather than relying on the parallel versions (those + // can be tested for symmetric=false) + input_mat_v[i] = + Test::symmetrize( + input_mat_v[i]); + } + lno_t nv = input_mat_v[i].numRows(); + scalar_view_t solution_x_tmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), nv); + solution_x_v[i] = solution_x_tmp; + create_random_x_vector(solution_x_v[i]); + initial_norm_res_v[i] = KokkosBlas::nrm2(solution_x_v[i]); + y_vector_v[i] = create_random_y_vector(input_mat_v[i], solution_x_v[i]); + // GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the + // behavior of each algorithm _should be_ the same on every execution space, + // which is why we just test GS_DEFAULT. + + scalar_view_t x_vector_tmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); + x_vector_v[i] = x_vector_tmp; + + kh_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. + kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); + } + + int apply_count = 3; // test symmetric, forward, backward + //*** Point-coloring version **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + + run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, + y_vector_v, symmetric, m_omega, apply_type, + nstreams); + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; + } + } + + for (int i = 0; i < nstreams; i++) kh_v[i].destroy_gs_handle(); +} + +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank1(2000, 2000 * 20, \ + 200, 10, false); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_streams_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 1); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 2); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 3); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 4); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank2( \ + 2000, 2000 * 20, 200, 10, 3, false); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank1(2000, 2000 * 20, \ + 200, 10, true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_streams_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 1); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 2); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 3); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 4); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank2( \ + 2000, 2000 * 20, 200, 10, 3, true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_empty##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_empty(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##balloon_clustering##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_balloon_clustering(5000, 100, 2000); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##sequential_sor##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_sequential_sor(1000, 1000 * 15, 50, \ + 10); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_long_rows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_long_rows(500, 10, 20, \ + true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_custom_coloring##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_custom_coloring(500, \ + 10); \ } #include