From 825fff31d78f13fd0650c85dc33699320df30e3e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 18 Jul 2023 14:19:28 -0600 Subject: [PATCH 01/29] sparse/src: Add execution space inst member to GS handle --- sparse/src/KokkosSparse_gauss_seidel_handle.hpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 412985df72..3cc85b5bbc 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -84,6 +84,8 @@ class GaussSeidelHandle { nnz_lno_persistent_work_host_view_t; // Host view type protected: + HandleExecSpace execution_space; + GSAlgorithm algorithm_type; nnz_lno_persistent_work_host_view_t color_xadj; @@ -101,7 +103,8 @@ class GaussSeidelHandle { * \brief Default constructor. */ GaussSeidelHandle(GSAlgorithm gs) - : algorithm_type(gs), + : execution_space(HandleExecSpace()), + algorithm_type(gs), color_xadj(), color_adj(), numColors(0), @@ -127,6 +130,10 @@ class GaussSeidelHandle { bool is_numeric_called() const { return this->called_numeric; } // setters + void set_execution_space(const HandleExecSpace exec_space) { + this->execution_space = exec_space; + } + void set_algorithm_type(const GSAlgorithm sgs_algo) { this->algorithm_type = sgs_algo; this->called_symbolic = false; From ec975834ef63a2299214b01026fe2f435775a80e Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 18 Jul 2023 15:32:56 -0600 Subject: [PATCH 02/29] Update GS point symbolic and friends for streams --- common/src/KokkosKernels_SimpleUtils.hpp | 6 +- common/src/KokkosKernels_Utils.hpp | 54 +++++++------ .../impl/KokkosSparse_gauss_seidel_impl.hpp | 81 +++++++++++-------- sparse/src/KokkosSparse_Utils.hpp | 7 +- .../src/KokkosSparse_gauss_seidel_handle.hpp | 8 +- 5 files changed, 91 insertions(+), 65 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index a271695246..0c9e82773a 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -151,10 +151,10 @@ inline void kk_exclusive_parallel_prefix_sum( template void kk_inclusive_parallel_prefix_sum( typename forward_array_type::value_type num_elements, - forward_array_type arr) { - typedef Kokkos::RangePolicy my_exec_space; + forward_array_type arr, MyExecSpace my_exec_space = MyExecSpace()) { + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", - my_exec_space(0, num_elements), + range_policy_t(my_exec_space, 0, num_elements), InclusiveParallelPrefixSum(arr)); } diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 2a4b749f92..c8cc284b73 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -457,9 +457,9 @@ struct Fill_Reverse_Map { template void inclusive_parallel_prefix_sum( typename forward_array_type::value_type num_elements, - forward_array_type arr) { + forward_array_type arr, MyExecSpace my_exec_space = MyExecSpace()) { kk_inclusive_parallel_prefix_sum( - num_elements, arr); + num_elements, arr, my_exec_space); } template @@ -668,14 +668,15 @@ void create_reverse_map( const forward_array_type &forward_map, // vertex to colors reverse_array_type &reverse_map_xadj, // colors to vertex xadj - reverse_array_type &reverse_map_adj) { // colros to vertex adj + reverse_array_type &reverse_map_adj, + MyExecSpace my_exec_space = MyExecSpace()) { // colros to vertex adj typedef typename reverse_array_type::value_type lno_t; typedef typename forward_array_type::value_type reverse_lno_t; const lno_t MINIMUM_TO_ATOMIC = 64; - typedef Kokkos::RangePolicy my_exec_space; + typedef Kokkos::RangePolicy range_policy_t; reverse_map_xadj = reverse_array_type("Reverse Map Xadj", num_reverse_elements + 1); reverse_map_adj = reverse_array_type( @@ -699,24 +700,27 @@ void create_reverse_map( forward_map, tmp_color_xadj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapScaleInit", - my_exec_space(0, num_forward_elements), rmi); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + rmi); + my_exec_space.fence(); inclusive_parallel_prefix_sum( tmp_reverse_size + 1, tmp_color_xadj); - MyExecSpace().fence(); + my_exec_space.fence(); - Kokkos::parallel_for("KokkosKernels::Common::StridedCopy", - my_exec_space(0, num_reverse_elements + 1), - StridedCopy( - tmp_color_xadj, reverse_map_xadj, scale_size)); - MyExecSpace().fence(); + Kokkos::parallel_for( + "KokkosKernels::Common::StridedCopy", + range_policy_t(my_exec_space, 0, num_reverse_elements + 1), + StridedCopy( + tmp_color_xadj, reverse_map_xadj, scale_size)); + my_exec_space.fence(); Fill_Reverse_Scale_Map frm( forward_map, tmp_color_xadj, reverse_map_adj, multiply_shift_for_scale, division_shift_for_bucket); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - my_exec_space(0, num_forward_elements), frm); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + frm); + my_exec_space.fence(); } else // atomic implementation. { @@ -728,20 +732,22 @@ void create_reverse_map( forward_map, reverse_map_xadj); Kokkos::parallel_for("KokkosKernels::Common::ReverseMapInit", - my_exec_space(0, num_forward_elements), rmi); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + rmi); + my_exec_space.fence(); // print_1Dview(reverse_map_xadj); inclusive_parallel_prefix_sum( num_reverse_elements + 1, reverse_map_xadj); - MyExecSpace().fence(); + my_exec_space.fence(); Kokkos::deep_copy(tmp_color_xadj, reverse_map_xadj); - MyExecSpace().fence(); + my_exec_space.fence(); Fill_Reverse_Map frm( forward_map, tmp_color_xadj, reverse_map_adj); Kokkos::parallel_for("KokkosKernels::Common::FillReverseMap", - my_exec_space(0, num_forward_elements), frm); - MyExecSpace().fence(); + range_policy_t(my_exec_space, 0, num_forward_elements), + frm); + my_exec_space.fence(); } } @@ -1253,10 +1259,12 @@ template void kk_view_reduce_max_row_size(const size_t num_rows, const size_type *rowmap_view_begins, const size_type *rowmap_view_ends, - size_type &max_row_size) { - typedef Kokkos::RangePolicy my_exec_space; + size_type &max_row_size, + MyExecSpace my_exec_space = MyExecSpace()) { + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( - "KokkosKernels::Common::ViewReduceMaxRowSize", my_exec_space(0, num_rows), + "KokkosKernels::Common::ViewReduceMaxRowSize", + range_policy_t(my_exec_space, 0, num_rows), ReduceRowSizeFunctor(rowmap_view_begins, rowmap_view_ends), max_row_size); } diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index e4cfb4b047..4830011dfc 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -84,7 +84,7 @@ class PointGaussSeidel { typedef typename HandleType::scalar_persistent_work_view_t scalar_persistent_work_view_t; - typedef Kokkos::RangePolicy range_pol; + typedef Kokkos::RangePolicy range_policy_t; typedef typename HandleType::GraphColoringHandleType::color_view_t color_view_t; typedef typename HandleType::GraphColoringHandleType::color_t color_t; @@ -825,6 +825,7 @@ class PointGaussSeidel { void initialize_symbolic() { auto gsHandle = get_gs_handle(); const size_type longRowThreshold = gsHandle->get_long_row_threshold(); + const MyExecSpace my_exec_space = gsHandle->get_execution_space(); // Validate settings if (gsHandle->get_block_size() > 1 && longRowThreshold > 0) @@ -838,6 +839,7 @@ class PointGaussSeidel { #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif + // TODO: Pass my_exec_space into KokkosGraph kernels typename HandleType::GraphColoringHandleType::color_view_t colors; color_t numColors; { @@ -886,7 +888,8 @@ class PointGaussSeidel { for (int i = 0; i < num_rows; ++i) { h_colors(i) = i + 1; } - Kokkos::deep_copy(colors, h_colors); + Kokkos::deep_copy(my_exec_space, colors, h_colors); + my_exec_space.fence(); #endif nnz_lno_persistent_work_view_t color_xadj; nnz_lno_persistent_work_view_t color_adj; @@ -896,10 +899,10 @@ class PointGaussSeidel { KokkosKernels::Impl::create_reverse_map< typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, numColors, colors, color_xadj, color_adj); + num_rows, numColors, colors, color_xadj, color_adj, my_exec_space); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "CREATE_REVERSE_MAP:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -909,7 +912,7 @@ class PointGaussSeidel { Kokkos::deep_copy(h_color_xadj, color_xadj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "DEEP_COPY:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -930,21 +933,24 @@ class PointGaussSeidel { max_row_length_per_color); int sortLongRowsTeamSize = 1; { - team_policy_t temp(1, 1); + team_policy_t temp(my_exec_space, 1, 1); sortLongRowsTeamSize = temp.team_size_recommended( sortIntoLongRowsFunctor, Kokkos::ParallelReduceTag()); } - Kokkos::parallel_reduce(team_policy_t(numColors, sortLongRowsTeamSize), - sortIntoLongRowsFunctor, - Kokkos::Max(mostLongRowsInColor)); + Kokkos::parallel_reduce( + team_policy_t(my_exec_space, numColors, sortLongRowsTeamSize), + sortIntoLongRowsFunctor, Kokkos::Max(mostLongRowsInColor)); auto host_long_rows_per_color = Kokkos::create_mirror_view(long_rows_per_color); - Kokkos::deep_copy(host_long_rows_per_color, long_rows_per_color); + Kokkos::deep_copy(my_exec_space, host_long_rows_per_color, + long_rows_per_color); + my_exec_space.fence(); gsHandle->set_long_rows_per_color(host_long_rows_per_color); auto host_max_row_length_per_color = Kokkos::create_mirror_view(max_row_length_per_color); - Kokkos::deep_copy(host_max_row_length_per_color, + Kokkos::deep_copy(my_exec_space, host_max_row_length_per_color, max_row_length_per_color); + my_exec_space.fence(); gsHandle->set_max_row_length_per_color(host_max_row_length_per_color); scalar_persistent_work_view_t long_row_x( Kokkos::view_alloc(Kokkos::WithoutInitializing, "long_row_x"), @@ -953,10 +959,11 @@ class PointGaussSeidel { } else { // Just sort rows by ID. KokkosSparse::sort_crs_graph(color_xadj, color_adj); + decltype(color_adj)>(my_exec_space, + color_xadj, color_adj); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "SORT_TIME:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -968,29 +975,29 @@ class PointGaussSeidel { Kokkos::parallel_for( "KokkosSparse::PointGaussSeidel::create_permuted_xadj", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), create_permuted_xadj(color_adj, xadj, permuted_xadj, old_to_new_map)); // std::cout << "create_permuted_xadj" << std::endl; #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "CREATE_PERMUTED_XADJ:" << timer.seconds() << std::endl; timer.reset(); #endif KokkosKernels::Impl::inclusive_parallel_prefix_sum< - row_lno_persistent_work_view_t, MyExecSpace>(num_rows + 1, - permuted_xadj); + row_lno_persistent_work_view_t, MyExecSpace>( + num_rows + 1, permuted_xadj, my_exec_space); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "INCLUSIVE_PPS:" << timer.seconds() << std::endl; timer.reset(); #endif Kokkos::parallel_for("KokkosSparse::PointGaussSeidel::fill_matrix_symbolic", - range_pol(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), fill_matrix_symbolic(num_rows, color_adj, xadj, adj, // adj_vals, permuted_xadj, permuted_adj, @@ -998,7 +1005,7 @@ class PointGaussSeidel { old_to_new_map)); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "SYMBOLIC_FILL:" << timer.seconds() << std::endl; timer.reset(); #endif @@ -1013,7 +1020,7 @@ class PointGaussSeidel { size_type max_row_size = 0; KokkosKernels::Impl::kk_view_reduce_max_row_size( num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, - max_row_size); + max_row_size, my_exec_space); nnz_lno_t brows = permuted_xadj.extent(0) - 1; size_type bnnz = permuted_adj.extent(0) * block_size * block_size; @@ -1079,15 +1086,17 @@ class PointGaussSeidel { size_type num_large_rows = 0; KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold< row_lno_persistent_work_view_t, MyExecSpace>( - brows, permuted_xadj, num_values_in_l1, num_large_rows); + brows, permuted_xadj, num_values_in_l1, num_large_rows, + my_exec_space); num_big_rows = KOKKOSKERNELS_MACRO_MIN( num_large_rows, - (size_type)(MyExecSpace().concurrency() / suggested_vector_size)); + (size_type)(my_exec_space.concurrency() / suggested_vector_size)); // std::cout << "num_big_rows:" << num_big_rows << std::endl; if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { // check if we have enough memory for this. lower the concurrency if // we do not have enugh memory. + // TODO: Need to account for number of streams here? size_t free_byte; size_t total_byte; KokkosKernels::Impl::kk_get_free_total_memory< @@ -1396,7 +1405,7 @@ class PointGaussSeidel { block_matrix_size)); } else { Kokkos::parallel_for("KokkosSparse::GaussSeidel::fill_matrix_numeric", - range_pol(0, num_rows), + range_policy_t(0, num_rows), fill_matrix_numeric(color_adj, xadj, // adj, adj_vals, newxadj_, @@ -1427,7 +1436,7 @@ class PointGaussSeidel { } else { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::get_matrix_diagonals", - range_pol(0, num_rows), gmd); + range_policy_t(0, num_rows), gmd); } } else { @@ -1591,8 +1600,8 @@ class PointGaussSeidel { this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, apply_backward); - // Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, - // Permuted_Xvector, color_adj)); + // Kokkos::parallel_for( range_policy_t(0,nr), + // PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); KokkosKernels::Impl::permute_block_vector< scalar_persistent_work_view2d_t, x_value_array_type, @@ -1673,8 +1682,8 @@ class PointGaussSeidel { apply_backward); } - // Kokkos::parallel_for( range_pol(0,nr), PermuteVector(x_lhs_output_vec, - // Permuted_Xvector, color_adj)); + // Kokkos::parallel_for( range_policy_t(0,nr), + // PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); KokkosKernels::Impl::permute_vector< scalar_persistent_work_view2d_t, x_value_array_type, @@ -1823,7 +1832,8 @@ class PointGaussSeidel { gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + range_policy_t(color_index_end - numLongRows, + color_index_end), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); @@ -1872,10 +1882,10 @@ class PointGaussSeidel { nnz_lno_t numRegularRows = color_index_end - color_index_begin - numLongRows; if (numRegularRows) { - Kokkos::parallel_for( - labelShort, - range_pol(color_index_begin, color_index_end - numLongRows), - gs); + Kokkos::parallel_for(labelShort, + range_policy_t(color_index_begin, + color_index_end - numLongRows), + gs); } if (numLongRows) { gs._color_set_begin = color_index_end - numLongRows; @@ -1896,7 +1906,8 @@ class PointGaussSeidel { gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_pol(color_index_end - numLongRows, color_index_end), + range_policy_t(color_index_end - numLongRows, + color_index_end), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 4039b6f5a7..88258356ef 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -1887,11 +1887,12 @@ template void kk_reduce_numrows_larger_than_threshold( size_t num_elements, view_type view_to_reduce, typename view_type::const_value_type threshold, - typename view_type::non_const_value_type &sum_reduction) { - typedef Kokkos::RangePolicy my_exec_space; + typename view_type::non_const_value_type &sum_reduction, + MyExecSpace my_exec_space = MyExecSpace()) { + typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceNumRowsLargerThanThreshold", - my_exec_space(0, num_elements), + range_policy_t(my_exec_space, 0, num_elements), ReduceLargerRowCount(view_to_reduce, threshold), sum_reduction); } diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 3cc85b5bbc..6e57a23ee2 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -116,6 +116,8 @@ class GaussSeidelHandle { virtual ~GaussSeidelHandle() = default; // getters + HandleExecSpace get_execution_space() const { return this->execution_space; } + GSAlgorithm get_algorithm_type() const { return this->algorithm_type; } nnz_lno_persistent_work_host_view_t get_color_xadj() const { @@ -131,7 +133,11 @@ class GaussSeidelHandle { // setters void set_execution_space(const HandleExecSpace exec_space) { - this->execution_space = exec_space; + static bool is_exec_space_set = false; + if (!is_exec_space_set) { + this->execution_space = exec_space; + is_exec_space_set = true; + } } void set_algorithm_type(const GSAlgorithm sgs_algo) { From 98d5a24e2c53cd936ed84ac268ed6b861017baf8 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 12:11:08 -0600 Subject: [PATCH 03/29] sparse/impl: Make PSGS non-blocking - This change also slightly improves performance perf_test/sparse: Add launch and compute timers --- perf_test/sparse/KokkosSparse_gs.cpp | 26 +++++-- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 68 ++++++++++++------- 2 files changed, 63 insertions(+), 31 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index c11c6bdc02..119941cebc 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -219,14 +219,22 @@ void runGS(const GS_Parameters& params) { KokkosSparse::Experimental::gauss_seidel_symbolic( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, params.graph_symmetric); - double symbolicTime = timer.seconds(); - std::cout << "\n*** Symbolic time: " << symbolicTime << '\n'; + double symbolicLaunchTime = timer.seconds(); + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double symbolicComputeTime = timer.seconds(); + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; timer.reset(); KokkosSparse::Experimental::gauss_seidel_numeric( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, params.graph_symmetric); - double numericTime = timer.seconds(); - std::cout << "\n*** Numeric time: " << numericTime << '\n'; + double numericLaunchTime = timer.seconds(); + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double numericComputeTime = timer.seconds(); + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; timer.reset(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { @@ -246,8 +254,14 @@ void runGS(const GS_Parameters& params) { true, true, 1.0, params.sweeps); break; } - double applyTime = timer.seconds(); - std::cout << "\n*** Apply time: " << applyTime << '\n'; + + double applyLaunchTime = timer.seconds(); + std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; + timer.reset(); + Kokkos::fence(); + double applyComputeTime = timer.seconds(); + std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; + timer.reset(); kh.destroy_gs_handle(); // Now, compute the 2-norm of residual scalar_view_t res("Ax-b", nrows); diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 4830011dfc..abaa01effe 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1787,25 +1787,31 @@ class PointGaussSeidel { if (block_size == 1) { Kokkos::parallel_for( labelRegular, - team_policy_t((numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + team_policy_t((numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else if (gs.num_max_vals_in_l2 == 0) { Kokkos::parallel_for( labelBlock, - block_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + block_apply_team_policy_t( + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } else { Kokkos::parallel_for( labelBigBlock, - bigblock_apply_team_policy_t( - (numRegularRows + team_row_chunk_size - 1) / - team_row_chunk_size, - suggested_team_size, vector_size), + Kokkos::Experimental::require( + bigblock_apply_team_policy_t( + (numRegularRows + team_row_chunk_size - 1) / + team_row_chunk_size, + suggested_team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); } } @@ -1827,13 +1833,17 @@ class PointGaussSeidel { Kokkos::deep_copy(long_row_x, nnz_scalar_t()); Kokkos::parallel_for( labelLong, - longrow_apply_team_policy_t(numLongRows * teams_per_row, - longRowTeamSize), + Kokkos::Experimental::require( + longrow_apply_team_policy_t(numLongRows * teams_per_row, + longRowTeamSize), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_policy_t(color_index_end - numLongRows, - color_index_end), + Kokkos::Experimental::require( + range_policy_t(color_index_end - numLongRows, + color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); @@ -1882,10 +1892,13 @@ class PointGaussSeidel { nnz_lno_t numRegularRows = color_index_end - color_index_begin - numLongRows; if (numRegularRows) { - Kokkos::parallel_for(labelShort, - range_policy_t(color_index_begin, - color_index_end - numLongRows), - gs); + Kokkos::parallel_for( + labelShort, + Kokkos::Experimental::require( + range_policy_t(color_index_begin, + color_index_end - numLongRows), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); } if (numLongRows) { gs._color_set_begin = color_index_end - numLongRows; @@ -1900,14 +1913,19 @@ class PointGaussSeidel { Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; Kokkos::deep_copy(long_row_x, nnz_scalar_t()); - Kokkos::parallel_for(labelLong, - Kokkos::RangePolicy( - 0, numLongRows * par_per_row), - gs); + Kokkos::parallel_for( + labelLong, + Kokkos::Experimental::require( + Kokkos::RangePolicy( + 0, numLongRows * par_per_row), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", - range_policy_t(color_index_end - numLongRows, - color_index_end), + Kokkos::Experimental::require( + range_policy_t(color_index_end - numLongRows, + color_index_end), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( Xcol, Ycol, long_row_x, gs._permuted_inverse_diagonal, gs.omega, color_index_end - numLongRows)); From 0aa320fc0eaab595faf64812b48c578c234b1338 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 13:32:44 -0600 Subject: [PATCH 04/29] Cleanup and use overload pattern --- common/src/KokkosKernels_SimpleUtils.hpp | 34 +++++++++--- common/src/KokkosKernels_Utils.hpp | 54 ++++++++++++++++--- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 12 ++--- 3 files changed, 76 insertions(+), 24 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 0c9e82773a..64735874c6 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -142,22 +142,42 @@ inline void kk_exclusive_parallel_prefix_sum( kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); } -/*** - * \brief Function performs the inclusive parallel prefix sum. That is each - * entry holds the sum until itself including itself. \param num_elements: size - * of the array \param arr: the array for which the prefix sum will be - * performed. - */ +template +void kk_inclusive_parallel_prefix_sum(MyExecSpace my_exec_space, + forward_array_type arr) {} + +/// +/// \brief Function performs the inclusive parallel prefix sum. That is each +/// entry holds the sum until itself including itself. +/// \param my_exec_space: The execution space instance +/// \param num_elements: size of the array +/// \param arr: the array for which the prefix sum will be performed. +/// template void kk_inclusive_parallel_prefix_sum( + MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, - forward_array_type arr, MyExecSpace my_exec_space = MyExecSpace()) { + forward_array_type arr) { typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_scan("KokkosKernels::Common::PrefixSum", range_policy_t(my_exec_space, 0, num_elements), InclusiveParallelPrefixSum(arr)); } +/// +/// \brief Function performs the inclusive parallel prefix sum. That is each +/// entry holds the sum until itself including itself. +/// \param num_elements: size of the array +/// \param arr: the array for which the prefix sum will be performed. +/// +template +void kk_inclusive_parallel_prefix_sum( + typename forward_array_type::value_type num_elements, + forward_array_type arr) { + MyExecSpace my_exec_space; + return kk_inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); +} + template struct ReductionFunctor { view_t array_sum; diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index c8cc284b73..11b4100f31 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -456,10 +456,19 @@ struct Fill_Reverse_Map { template void inclusive_parallel_prefix_sum( + MyExecSpace my_exec_space, typename forward_array_type::value_type num_elements, - forward_array_type arr, MyExecSpace my_exec_space = MyExecSpace()) { - kk_inclusive_parallel_prefix_sum( - num_elements, arr, my_exec_space); + forward_array_type arr) { + return kk_inclusive_parallel_prefix_sum( + my_exec_space, num_elements, arr); +} + +template +void inclusive_parallel_prefix_sum( + typename forward_array_type::value_type num_elements, + forward_array_type arr) { + MyExecSpace my_exec_space; + return inclusive_parallel_prefix_sum(my_exec_space, num_elements, arr); } template @@ -661,6 +670,7 @@ struct StridedCopy { template void create_reverse_map( + MyExecSpace my_exec_space, const typename reverse_array_type::value_type &num_forward_elements, // num_vertices const typename forward_array_type::value_type @@ -668,8 +678,7 @@ void create_reverse_map( const forward_array_type &forward_map, // vertex to colors reverse_array_type &reverse_map_xadj, // colors to vertex xadj - reverse_array_type &reverse_map_adj, - MyExecSpace my_exec_space = MyExecSpace()) { // colros to vertex adj + reverse_array_type &reverse_map_adj) { // colros to vertex adj typedef typename reverse_array_type::value_type lno_t; typedef typename forward_array_type::value_type reverse_lno_t; @@ -751,6 +760,23 @@ void create_reverse_map( } } +template +void create_reverse_map( + const typename reverse_array_type::value_type + &num_forward_elements, // num_vertices + const typename forward_array_type::value_type + &num_reverse_elements, // num_colors + + const forward_array_type &forward_map, // vertex to colors + reverse_array_type &reverse_map_xadj, // colors to vertex xadj + reverse_array_type &reverse_map_adj) { + MyExecSpace my_exec_space; + return create_reverse_map(my_exec_space, num_forward_elements, + num_reverse_elements, forward_map, reverse_map_xadj, + reverse_map_adj); +} + template struct PermuteVector { @@ -1256,11 +1282,11 @@ struct ReduceRowSizeFunctor { // view has num_rows+1 elements. template -void kk_view_reduce_max_row_size(const size_t num_rows, +void kk_view_reduce_max_row_size(MyExecSpace my_exec_space, + const size_t num_rows, const size_type *rowmap_view_begins, const size_type *rowmap_view_ends, - size_type &max_row_size, - MyExecSpace my_exec_space = MyExecSpace()) { + size_type &max_row_size) { typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( "KokkosKernels::Common::ViewReduceMaxRowSize", @@ -1269,6 +1295,18 @@ void kk_view_reduce_max_row_size(const size_t num_rows, max_row_size); } +// view has num_rows+1 elements. +template +void kk_view_reduce_max_row_size(const size_t num_rows, + const size_type *rowmap_view_begins, + const size_type *rowmap_view_ends, + size_type &max_row_size) { + MyExecSpace my_exec_space; + return kk_view_reduce_max_row_size(my_exec_space, num_rows, + rowmap_view_begins, rowmap_view_ends, + max_row_size); +} + template struct ReduceMaxRowFunctor { view_type rowmap_view; diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index abaa01effe..196a0de4b4 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -899,7 +899,7 @@ class PointGaussSeidel { KokkosKernels::Impl::create_reverse_map< typename HandleType::GraphColoringHandleType::color_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, numColors, colors, color_xadj, color_adj, my_exec_space); + my_exec_space, num_rows, numColors, colors, color_xadj, color_adj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE my_exec_space.fence(); @@ -988,7 +988,7 @@ class PointGaussSeidel { KokkosKernels::Impl::inclusive_parallel_prefix_sum< row_lno_persistent_work_view_t, MyExecSpace>( - num_rows + 1, permuted_xadj, my_exec_space); + my_exec_space, num_rows + 1, permuted_xadj); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE my_exec_space.fence(); @@ -1096,7 +1096,7 @@ class PointGaussSeidel { if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { // check if we have enough memory for this. lower the concurrency if // we do not have enugh memory. - // TODO: Need to account for number of streams here? + // TODO: account for number of streams via handle.nstreams size_t free_byte; size_t total_byte; KokkosKernels::Impl::kk_get_free_total_memory< @@ -1600,9 +1600,6 @@ class PointGaussSeidel { this->IterativePSGS(gs, numColors, h_color_xadj, numIter, apply_forward, apply_backward); - // Kokkos::parallel_for( range_policy_t(0,nr), - // PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); - KokkosKernels::Impl::permute_block_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( @@ -1682,9 +1679,6 @@ class PointGaussSeidel { apply_backward); } - // Kokkos::parallel_for( range_policy_t(0,nr), - // PermuteVector(x_lhs_output_vec, Permuted_Xvector, color_adj)); - KokkosKernels::Impl::permute_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( From b6e7eb37531afa6f238ffd0695c381a2c48bd76f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 15:51:59 -0600 Subject: [PATCH 05/29] Cleanup and use overload pattern --- sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 196a0de4b4..a6c5e94184 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1019,8 +1019,8 @@ class PointGaussSeidel { // first calculate max row size. size_type max_row_size = 0; KokkosKernels::Impl::kk_view_reduce_max_row_size( - num_rows, permuted_xadj.data(), permuted_xadj.data() + 1, - max_row_size, my_exec_space); + my_exec_space, num_rows, permuted_xadj.data(), + permuted_xadj.data() + 1, max_row_size); nnz_lno_t brows = permuted_xadj.extent(0) - 1; size_type bnnz = permuted_adj.extent(0) * block_size * block_size; From 5340cebef9d12bcf586b40a4cfe4faa7b68eea38 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 15:54:25 -0600 Subject: [PATCH 06/29] Add GS handle overloads --- sparse/src/KokkosKernels_Handle.hpp | 17 ++++- .../src/KokkosSparse_gauss_seidel_handle.hpp | 71 +++++++++++++++---- 2 files changed, 71 insertions(+), 17 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index dae3f12462..307ff7b91c 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -602,6 +602,7 @@ class KokkosKernelsHandle { return cgs; } void create_gs_handle( + HandleExecSpace handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { @@ -610,10 +611,20 @@ class KokkosKernelsHandle { // ---------------------------------------- // // Two-stage Gauss-Seidel if (gs_algorithm == KokkosSparse::GS_TWOSTAGE) - this->gsHandle = new TwoStageGaussSeidelHandleType(); - else this->gsHandle = - new PointGaussSeidelHandleType(gs_algorithm, coloring_algorithm); + new TwoStageGaussSeidelHandleType(handle_exec_space, num_streams); + else + this->gsHandle = new PointGaussSeidelHandleType( + handle_exec_space, num_streams, gs_algorithm, coloring_algorithm); + } + + void create_gs_handle( + KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algorithm = + KokkosGraph::COLORING_DEFAULT) { + HandleExecSpace handle_exec_space; + return create_gs_handle(handle_exec_space, 1, gs_algorithm, + coloring_algorithm); } // ---------------------------------------- // // Two-stage Gauss-Seidel handle diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 6e57a23ee2..51ad48b580 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -85,6 +85,7 @@ class GaussSeidelHandle { protected: HandleExecSpace execution_space; + int num_streams; GSAlgorithm algorithm_type; @@ -104,6 +105,20 @@ class GaussSeidelHandle { */ GaussSeidelHandle(GSAlgorithm gs) : execution_space(HandleExecSpace()), + num_streams(1), + algorithm_type(gs), + color_xadj(), + color_adj(), + numColors(0), + called_symbolic(false), + called_numeric(false), + suggested_vector_size(0), + suggested_team_size(0) {} + + GaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + GSAlgorithm gs) + : execution_space(handle_exec_space), + num_streams(n_streams), algorithm_type(gs), color_xadj(), color_adj(), @@ -131,15 +146,6 @@ class GaussSeidelHandle { bool is_symbolic_called() const { return this->called_symbolic; } bool is_numeric_called() const { return this->called_numeric; } - // setters - void set_execution_space(const HandleExecSpace exec_space) { - static bool is_exec_space_set = false; - if (!is_exec_space_set) { - this->execution_space = exec_space; - is_exec_space_set = true; - } - } - void set_algorithm_type(const GSAlgorithm sgs_algo) { this->algorithm_type = sgs_algo; this->called_symbolic = false; @@ -257,10 +263,10 @@ class PointGaussSeidelHandle /** * \brief Default constructor. */ - PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT, + PointGaussSeidelHandle(GSHandle gs_handle, KokkosGraph::ColoringAlgorithm coloring_algo_ = KokkosGraph::COLORING_DEFAULT) - : GSHandle(gs), + : GSHandle(gs_handle), permuted_xadj(), permuted_adj(), permuted_adj_vals(), @@ -276,9 +282,22 @@ class PointGaussSeidelHandle level_2_mem(0), long_row_threshold(0), coloring_algo(coloring_algo_) { - if (gs == GS_DEFAULT) this->choose_default_algorithm(); + if (gs_handle.get_algorithm_type() == GS_DEFAULT) + this->choose_default_algorithm(); } + PointGaussSeidelHandle(GSAlgorithm gs = GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algo_ = + KokkosGraph::COLORING_DEFAULT) + : PointGaussSeidelHandle(GSHandle(gs), coloring_algo_) {} + + PointGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams, + GSAlgorithm gs = GS_DEFAULT, + KokkosGraph::ColoringAlgorithm coloring_algo_ = + KokkosGraph::COLORING_DEFAULT) + : PointGaussSeidelHandle(GSHandle(handle_exec_space, n_streams, gs), + coloring_algo_) {} + void set_block_size(nnz_lno_t bs) { this->block_size = bs; } nnz_lno_t get_block_size() const { return this->block_size; } @@ -626,8 +645,15 @@ class TwoStageGaussSeidelHandle ExecutionSpace, TemporaryMemorySpace, PersistentMemorySpace>; - TwoStageGaussSeidelHandle() - : GSHandle(GS_TWOSTAGE), + using HandleExecSpace = typename GSHandle::HandleExecSpace; + + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + * @param gsh The GaussSeidel handle. + */ + TwoStageGaussSeidelHandle(GSHandle gs_handle) + : GSHandle(gs_handle), nrows(0), nrhs(1), direction(GS_SYMMETRIC), @@ -639,6 +665,23 @@ class TwoStageGaussSeidelHandle inner_omega = one; } + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + */ + TwoStageGaussSeidelHandle() + : TwoStageGaussSeidelHandle(GSHandle(GS_TWOSTAGE)) {} + + /** + * @brief Construct a new Two Stage Gauss Seidel Handle object + * + * @param handle_exec_space The execution space instance + * @param n_streams the number of streams + */ + TwoStageGaussSeidelHandle(HandleExecSpace handle_exec_space, int n_streams) + : TwoStageGaussSeidelHandle( + GSHandle(handle_exec_space, n_streams, GS_TWOSTAGE)) {} + // Sweep direction void setSweepDirection(GSDirection direction_) { this->direction = direction_; From c0fb396b75cacd00f5614efd8b0f6e409ba9215c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 26 Jul 2023 16:13:55 -0600 Subject: [PATCH 07/29] Account for streams in memory allocs --- common/src/KokkosKernels_ExecSpaceUtils.hpp | 37 ++++++++++++------- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 6 +-- .../src/KokkosSparse_gauss_seidel_handle.hpp | 2 + 3 files changed, 28 insertions(+), 17 deletions(-) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index a0f6e39f4d..a30b2e777d 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -150,7 +150,8 @@ kk_is_a64fx_mem_space() { // Will throw if execution space doesn't support this. template inline void kk_get_free_total_memory(size_t& /* free_mem */, - size_t& /* total_mem */) { + size_t& /* total_mem */, + int /* n_streams */) { std::ostringstream oss; oss << "Error: memory space " << MemorySpace::name() << " does not support querying free/total memory."; @@ -160,26 +161,32 @@ inline void kk_get_free_total_memory(size_t& /* free_mem */, #ifdef KOKKOS_ENABLE_CUDA template <> inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { + size_t& total_mem, + int n_streams = 1) { cudaMemGetInfo(&free_mem, &total_mem); + free_mem /= n_streams; + total_mem /= n_streams; } template <> inline void kk_get_free_total_memory(size_t& free_mem, - size_t& total_mem) { - cudaMemGetInfo(&free_mem, &total_mem); + size_t& total_mem, + int n_streams = 1) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - cudaMemGetInfo(&free_mem, &total_mem); + size_t& free_mem, size_t& total_mem, int n_streams = 1) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); } #endif #ifdef KOKKOS_ENABLE_HIP template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { + size_t& free_mem, size_t& total_mem, int n_streams = 1) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); + free_mem /= n_streams; + total_mem /= n_streams; } #endif @@ -188,7 +195,7 @@ inline void kk_get_free_total_memory( #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { + size_t& free_mem, size_t& total_mem, int n_streams = 1) { sycl::queue queue; sycl::device device = queue.get_device(); auto level_zero_handle = @@ -220,20 +227,22 @@ inline void kk_get_free_total_memory( total_mem += memory_states.size; free_mem += memory_states.free; } + free_mem /= n_streams; + total_mem /= n_streams; } template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, - total_mem); + size_t& free_mem, size_t& total_mem, int n_streams = 1) { + kk_get_free_total_memory( + free_mem, total_mem, n_streams); } template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem) { - kk_get_free_total_memory(free_mem, - total_mem); + size_t& free_mem, size_t& total_mem, int n_streams = 1) { + kk_get_free_total_memory( + free_mem, total_mem, n_streams); } #endif diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index a6c5e94184..41809203e2 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -826,6 +826,7 @@ class PointGaussSeidel { auto gsHandle = get_gs_handle(); const size_type longRowThreshold = gsHandle->get_long_row_threshold(); const MyExecSpace my_exec_space = gsHandle->get_execution_space(); + const int num_streams = gsHandle->get_num_streams(); // Validate settings if (gsHandle->get_block_size() > 1 && longRowThreshold > 0) @@ -1096,12 +1097,11 @@ class PointGaussSeidel { if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { // check if we have enough memory for this. lower the concurrency if // we do not have enugh memory. - // TODO: account for number of streams via handle.nstreams size_t free_byte; size_t total_byte; KokkosKernels::Impl::kk_get_free_total_memory< - typename pool_memory_space::memory_space>(free_byte, - total_byte); + typename pool_memory_space::memory_space>(free_byte, total_byte, + num_streams); size_t required_size = size_t(num_big_rows) * level_2_mem; if (required_size + num_big_rows * sizeof(int) > free_byte) { num_big_rows = diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 51ad48b580..447d96d2a1 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -131,6 +131,8 @@ class GaussSeidelHandle { virtual ~GaussSeidelHandle() = default; // getters + int get_num_streams() const { return num_streams; } + HandleExecSpace get_execution_space() const { return this->execution_space; } GSAlgorithm get_algorithm_type() const { return this->algorithm_type; } From 0a8b20f815ac8de26117c4c9efa1fb1798e38ecd Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 8 Aug 2023 12:44:59 -0600 Subject: [PATCH 08/29] Fix build. Add fall-back overload --- common/src/KokkosKernels_ExecSpaceUtils.hpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index a30b2e777d..2a1cd5e6e0 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -146,6 +146,17 @@ kk_is_a64fx_mem_space() { } #endif // a64fx architectures +// Host function to determine free and total device memory. +// Will throw if execution space doesn't support this. +template +inline void kk_get_free_total_memory(size_t& /* free_mem */, + size_t& /* total_mem */) { + std::ostringstream oss; + oss << "Error: memory space " << MemorySpace::name() + << " does not support querying free/total memory."; + throw std::runtime_error(oss.str()); +} + // Host function to determine free and total device memory. // Will throw if execution space doesn't support this. template From ad6161c96d16560ae640cf7cedc0f840474bef13 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 9 Aug 2023 11:46:02 -0600 Subject: [PATCH 09/29] sparse/unit_test: Add PSGS stream tests --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 295 +++++++++++++++--- 1 file changed, 243 insertions(+), 52 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 358205b713..260bac8a83 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -142,6 +142,56 @@ void run_gauss_seidel( kh.destroy_gs_handle(); } +template +void run_gauss_seidel_streams( + std::vector kh, std::vector input_mat, + std::vector x_vector, std::vector y_vector, + bool is_symmetric_graph, typename crsMat_t::value_type omega, + int apply_type, // 0 for symmetric, 1 for forward, 2 for backward. + int nstreams = 1) { + for (int i = 0; i < nstreams; i++) { + gauss_seidel_symbolic(&kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, is_symmetric_graph); + gauss_seidel_numeric(&kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, is_symmetric_graph); + } + + const int apply_count = 2; + for (int i = 0; i < nstreams; i++) { + switch (apply_type) { + case 0: + symmetric_gauss_seidel_apply( + &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, + apply_count); + break; + case 1: + forward_sweep_gauss_seidel_apply( + &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, + apply_count); + break; + case 2: + backward_sweep_gauss_seidel_apply( + &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, + apply_count); + break; + default: + symmetric_gauss_seidel_apply( + &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), + input_mat[i].graph.row_map, input_mat[i].graph.entries, + input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, + apply_count); + break; + } + } +} } // namespace Test template (2000, 2000 * 20, \ - 200, 10, false); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_asymmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank2( \ - 2000, 2000 * 20, 200, 10, 3, false); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_symmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank1(2000, 2000 * 20, \ - 200, 10, true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_symmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_rank2( \ - 2000, 2000 * 20, 200, 10, 3, true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_empty##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_empty(); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##balloon_clustering##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_balloon_clustering(5000, 100, 2000); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##sequential_sor##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_sequential_sor(1000, 1000 * 15, 50, \ - 10); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_long_rows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_long_rows(500, 10, 20, \ - true); \ - } \ - TEST_F( \ - TestCategory, \ - sparse##_##gauss_seidel_custom_coloring##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ - test_gauss_seidel_custom_coloring(500, \ - 10); \ +template +void test_gauss_seidel_streams_rank1( + lno_t numRows, size_type nnz, lno_t bandwidth, lno_t row_size_variance, + bool symmetric, double omega, + KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT, + int nstreams = 1) { + using namespace Test; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using scalar_view_t = typename crsMat_t::values_type::non_const_type; + using mag_t = typename Kokkos::ArithTraits::mag_type; + using execution_space = typename device::execution_space; + + using const_size_type = const size_type; + using const_lno_t = const lno_t; + using const_scalar_t = const scalar_t; + using KernelHandle = + KokkosKernelsHandle; + srand(245); + lno_t numCols = numRows; + typename crsMat_t::value_type m_omega = omega; + + std::vector instances; + if (nstreams == 1) + instances = Kokkos::Experimental::partition_space(execution_space(), 1); + else if (nstreams == 2) + instances = Kokkos::Experimental::partition_space(execution_space(), 1, 1); + else if (nstreams == 3) + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1); + else + instances = + Kokkos::Experimental::partition_space(execution_space(), 1, 1, 1, 1); + + std::vector kh_v(nstreams); + std::vector input_mat_v(nstreams); + std::vector solution_x_v(nstreams); + std::vector x_vector_v(nstreams); + std::vector y_vector_v(nstreams); + std::vector initial_norm_res_v(nstreams); + + const scalar_t one = Kokkos::ArithTraits::one(); + const scalar_t zero = Kokkos::ArithTraits::zero(); + + for (int i = 0; i < nstreams; i++) { + input_mat_v[i] = + KokkosSparse::Impl::kk_generate_diagonally_dominant_sparse_matrix< + crsMat_t>(numRows, numCols, nnz, row_size_variance, bandwidth); + + if (symmetric) { + // Symmetrize on host, rather than relying on the parallel versions (those + // can be tested for symmetric=false) + input_mat_v[i] = + Test::symmetrize( + input_mat_v[i]); + } + lno_t nv = input_mat_v[i].numRows(); + scalar_view_t solution_x_tmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), nv); + solution_x_v[i] = solution_x_tmp; + create_random_x_vector(solution_x_v[i]); + initial_norm_res_v[i] = KokkosBlas::nrm2(solution_x_v[i]); + y_vector_v[i] = create_random_y_vector(input_mat_v[i], solution_x_v[i]); + // GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the + // behavior of each algorithm _should be_ the same on every execution space, + // which is why we just test GS_DEFAULT. + + scalar_view_t x_vector_tmp( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); + x_vector_v[i] = x_vector_tmp; + + kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); + } + + int apply_count = 3; // test symmetric, forward, backward + //*** Point-coloring version **** + for (int apply_type = 0; apply_type < apply_count; ++apply_type) { + Kokkos::Timer timer1; + + for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(x_vector_v[i], zero); + + run_gauss_seidel_streams(kh_v, input_mat_v, x_vector_v, y_vector_v, + symmetric, m_omega, apply_type, nstreams); + // double gs = timer1.seconds(); + // KokkosKernels::Impl::print_1Dview(x_vector); + } + + // Check result + for (int i = 0; i < nstreams; i++) { + KokkosBlas::axpby(one, solution_x_v[i], -one, x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(x_vector_v[i]); + std::string info = "on stream_idx: " + std::to_string(i); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; + } +} + +#if 0 + lno_t numRows, + size_type nnz, + lno_t bandwidth, + lno_t row_size_variance, + bool symmetric, + double omega, + KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT, + int nstreams = 1 +#endif +#define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank1(2000, 2000 * 20, \ + 200, 10, false); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_streams_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 1); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 2); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 3); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, false, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 4); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_asymmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank2( \ + 2000, 2000 * 20, 200, 10, 3, false); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank1(2000, 2000 * 20, \ + 200, 10, true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_streams_rank1##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 1); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 2); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 3); \ + test_gauss_seidel_streams_rank1( \ + 2000, 2000 * 20, 200, 10, true, 0.9, KokkosGraph::COLORING_DEFAULT, \ + 4); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_symmetric_rank2##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_rank2( \ + 2000, 2000 * 20, 200, 10, 3, true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_empty##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_empty(); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##balloon_clustering##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_balloon_clustering(5000, 100, 2000); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##sequential_sor##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_sequential_sor(1000, 1000 * 15, 50, \ + 10); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_long_rows##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_long_rows(500, 10, 20, \ + true); \ + } \ + TEST_F( \ + TestCategory, \ + sparse##_##gauss_seidel_custom_coloring##_##SCALAR##_##ORDINAL##_##OFFSET##_##DEVICE) { \ + test_gauss_seidel_custom_coloring(500, \ + 10); \ } #include From d9c5ec6f1e1818efe7e6990a217cb592cca02c15 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 9 Aug 2023 12:12:42 -0600 Subject: [PATCH 10/29] Fix docs build --- sparse/src/KokkosSparse_gauss_seidel_handle.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 447d96d2a1..134a100cc7 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -652,7 +652,7 @@ class TwoStageGaussSeidelHandle /** * @brief Construct a new Two Stage Gauss Seidel Handle object * - * @param gsh The GaussSeidel handle. + * @param gs_handle The GaussSeidel handle. */ TwoStageGaussSeidelHandle(GSHandle gs_handle) : GSHandle(gs_handle), From 9a7775a2998cfeff07fc930a6481faca1d0eb19b Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 9 Aug 2023 12:15:03 -0600 Subject: [PATCH 11/29] Move print statements to avoid timing them --- perf_test/sparse/KokkosSparse_gs.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_gs.cpp b/perf_test/sparse/KokkosSparse_gs.cpp index 119941cebc..2a8b164219 100644 --- a/perf_test/sparse/KokkosSparse_gs.cpp +++ b/perf_test/sparse/KokkosSparse_gs.cpp @@ -220,21 +220,17 @@ void runGS(const GS_Parameters& params) { &kh, nrows, nrows, A.graph.row_map, A.graph.entries, params.graph_symmetric); double symbolicLaunchTime = timer.seconds(); - std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double symbolicComputeTime = timer.seconds(); - std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; timer.reset(); KokkosSparse::Experimental::gauss_seidel_numeric( &kh, nrows, nrows, A.graph.row_map, A.graph.entries, A.values, params.graph_symmetric); double numericLaunchTime = timer.seconds(); - std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double numericComputeTime = timer.seconds(); - std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; timer.reset(); // Last two parameters are damping factor (should be 1) and sweeps switch (params.direction) { @@ -256,11 +252,9 @@ void runGS(const GS_Parameters& params) { } double applyLaunchTime = timer.seconds(); - std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; timer.reset(); Kokkos::fence(); double applyComputeTime = timer.seconds(); - std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; timer.reset(); kh.destroy_gs_handle(); // Now, compute the 2-norm of residual @@ -271,6 +265,12 @@ void runGS(const GS_Parameters& params) { KokkosSparse::spmv("N", alpha, A, x, beta, res); double resnorm = KokkosBlas::nrm2(res); + std::cout << "\n*** Symbolic launch time: " << symbolicLaunchTime << '\n'; + std::cout << "\n*** Symbolic compute time: " << symbolicComputeTime << '\n'; + std::cout << "\n*** Numeric launch time: " << numericLaunchTime << '\n'; + std::cout << "\n*** Numeric compute time: " << numericComputeTime << '\n'; + std::cout << "\n*** Apply launch time: " << applyLaunchTime << '\n'; + std::cout << "\n*** Apply compute time: " << applyComputeTime << '\n'; // note: this still works if the solution diverges std::cout << "Relative res norm: " << resnorm / bnorm << '\n'; } From 423da45e0b3d707a00f3e233a28cff9897b8a7e2 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 9 Aug 2023 12:53:17 -0600 Subject: [PATCH 12/29] common/src: Correct kk_get_free_total_memory --- common/src/KokkosKernels_ExecSpaceUtils.hpp | 60 +++++++++++++++++---- 1 file changed, 51 insertions(+), 9 deletions(-) diff --git a/common/src/KokkosKernels_ExecSpaceUtils.hpp b/common/src/KokkosKernels_ExecSpaceUtils.hpp index 2a1cd5e6e0..eb629f9e0c 100644 --- a/common/src/KokkosKernels_ExecSpaceUtils.hpp +++ b/common/src/KokkosKernels_ExecSpaceUtils.hpp @@ -173,32 +173,53 @@ inline void kk_get_free_total_memory(size_t& /* free_mem */, template <> inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, - int n_streams = 1) { + int n_streams) { cudaMemGetInfo(&free_mem, &total_mem); free_mem /= n_streams; total_mem /= n_streams; } template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); +} +template <> inline void kk_get_free_total_memory(size_t& free_mem, size_t& total_mem, - int n_streams = 1) { - kk_get_free_total_memory(free_mem, total_mem, n_streams); + int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); +} +template <> +inline void kk_get_free_total_memory(size_t& free_mem, + size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { - kk_get_free_total_memory(free_mem, total_mem, n_streams); + size_t& free_mem, size_t& total_mem, int n_streams) { + kk_get_free_total_memory(free_mem, total_mem, n_streams); +} +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, 1); } #endif #ifdef KOKKOS_ENABLE_HIP template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { + size_t& free_mem, size_t& total_mem, int n_streams) { KOKKOSKERNELS_IMPL_HIP_SAFE_CALL(hipMemGetInfo(&free_mem, &total_mem)); free_mem /= n_streams; total_mem /= n_streams; } +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory(free_mem, total_mem, + 1); +} #endif // FIXME_SYCL Use compiler extension instead of low level interface when @@ -206,7 +227,7 @@ inline void kk_get_free_total_memory( #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { + size_t& free_mem, size_t& total_mem, int n_streams) { sycl::queue queue; sycl::device device = queue.get_device(); auto level_zero_handle = @@ -242,19 +263,40 @@ inline void kk_get_free_total_memory( total_mem /= n_streams; } +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory( + free_mem, total_mem, 1); +} + template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { + size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory( free_mem, total_mem, n_streams); } +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory( + free_mem, total_mem, 1); +} + template <> inline void kk_get_free_total_memory( - size_t& free_mem, size_t& total_mem, int n_streams = 1) { + size_t& free_mem, size_t& total_mem, int n_streams) { kk_get_free_total_memory( free_mem, total_mem, n_streams); } + +template <> +inline void kk_get_free_total_memory( + size_t& free_mem, size_t& total_mem) { + kk_get_free_total_memory( + free_mem, total_mem, 1); +} #endif template From 9b90ffd82029b0322c7edcca112042c3a3eece0c Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 09:39:17 -0600 Subject: [PATCH 13/29] sparse/unit_test: Initialize KernelHandle defaults --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 260bac8a83..662d81fd0b 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -56,7 +56,7 @@ namespace Test { // Run GS on the given vectors, where the handle is already set up. template void run_gauss_seidel( - Handle& kh, crsMat_t input_mat, vec_t x_vector, vec_t y_vector, + Handle &kh, crsMat_t input_mat, vec_t x_vector, vec_t y_vector, bool is_symmetric_graph, typename crsMat_t::value_type omega, int apply_type = 0 // 0 for symmetric, 1 for forward, 2 for backward. ) { @@ -144,8 +144,8 @@ void run_gauss_seidel( template void run_gauss_seidel_streams( - std::vector kh, std::vector input_mat, - std::vector x_vector, std::vector y_vector, + std::vector &kh, std::vector &input_mat, + std::vector &x_vector, std::vector &y_vector, bool is_symmetric_graph, typename crsMat_t::value_type omega, int apply_type, // 0 for symmetric, 1 for forward, 2 for backward. int nstreams = 1) { @@ -289,8 +289,8 @@ void test_gauss_seidel_rank2(lno_t numRows, size_type nnz, lno_t bandwidth, typedef typename KokkosSparse::CrsMatrix crsMat_t; - typedef Kokkos::View scalar_view2d_t; - typedef Kokkos::View + typedef Kokkos::View scalar_view2d_t; + typedef Kokkos::View host_scalar_view2d_t; typedef typename Kokkos::ArithTraits::mag_type mag_t; @@ -446,7 +446,7 @@ void test_sequential_sor(lno_t numRows, size_type nnz, lno_t bandwidth, // initial solution is zero Kokkos::deep_copy(x_host, zero); // get the inverse diagonal (only needed on host) - Kokkos::View invDiag("diag^-1", numRows); + Kokkos::View invDiag("diag^-1", numRows); for (lno_t i = 0; i < numRows; i++) { for (size_type j = rowmap(i); j < rowmap(i + 1); j++) { if (entries(j) == i) invDiag(i) = one / values(j); @@ -624,11 +624,11 @@ void test_gauss_seidel_long_rows(lno_t numRows, lno_t numLongRows, Kokkos::view_alloc(Kokkos::WithoutInitializing, "Entries"), totalEntries); rowmap_view_t rowmapView( Kokkos::view_alloc(Kokkos::WithoutInitializing, "Rowmap"), numRows + 1); - Kokkos::deep_copy(valuesView, Kokkos::View( + Kokkos::deep_copy(valuesView, Kokkos::View( values.data(), totalEntries)); - Kokkos::deep_copy(entriesView, Kokkos::View( + Kokkos::deep_copy(entriesView, Kokkos::View( entries.data(), totalEntries)); - Kokkos::deep_copy(rowmapView, Kokkos::View( + Kokkos::deep_copy(rowmapView, Kokkos::View( rowmap.data(), numRows + 1)); crsMat_t input_mat("A", numRows, numRows, totalEntries, valuesView, rowmapView, entriesView); @@ -786,6 +786,7 @@ void test_gauss_seidel_streams_rank1( Kokkos::view_alloc(Kokkos::WithoutInitializing, "x vector"), nv); x_vector_v[i] = x_vector_tmp; + kh_v[i] = KernelHandle(); // Initialize KokkosKernelsHandle defaults. kh_v[i].create_gs_handle(instances[i], nstreams, GS_DEFAULT, coloringAlgo); } @@ -808,6 +809,7 @@ void test_gauss_seidel_streams_rank1( mag_t result_norm_res = KokkosBlas::nrm2(x_vector_v[i]); std::string info = "on stream_idx: " + std::to_string(i); EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; + kh_v[i].destroy_gs_handle(); } } From 14cf5529f2bdc8cf90bc83f5c416ecdcb37140a6 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 11:50:20 -0600 Subject: [PATCH 14/29] sparse/src: Fix PSGS stream cuda regressions --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 662d81fd0b..47ee0eb6f5 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -805,24 +805,14 @@ void test_gauss_seidel_streams_rank1( // Check result for (int i = 0; i < nstreams; i++) { - KokkosBlas::axpby(one, solution_x_v[i], -one, x_vector_v[i]); - mag_t result_norm_res = KokkosBlas::nrm2(x_vector_v[i]); + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); std::string info = "on stream_idx: " + std::to_string(i); EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; kh_v[i].destroy_gs_handle(); } } -#if 0 - lno_t numRows, - size_type nnz, - lno_t bandwidth, - lno_t row_size_variance, - bool symmetric, - double omega, - KokkosGraph::ColoringAlgorithm coloringAlgo = KokkosGraph::COLORING_DEFAULT, - int nstreams = 1 -#endif #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ TEST_F( \ TestCategory, \ From 609e23cbb1b5854074ab421c415a4769de1e9194 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 14:18:01 -0600 Subject: [PATCH 15/29] .github/workflows/osx.yml: Double timeout --- .github/workflows/osx.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/osx.yml b/.github/workflows/osx.yml index 769957b953..8d9f7123f8 100644 --- a/.github/workflows/osx.yml +++ b/.github/workflows/osx.yml @@ -111,4 +111,4 @@ jobs: - name: test working-directory: kokkos-kernels/build - run: ctest -j2 --output-on-failure --timeout 3600 + run: ctest -j2 --output-on-failure --timeout 7200 From f05cbf117528e3e9e538870ab0628d4268ce643a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 15:22:09 -0600 Subject: [PATCH 16/29] sparse/src: Add gauss_seidel_symbolic overload --- docs/developer/apidocs/sparse.rst | 1 + .../impl/KokkosSparse_gauss_seidel_spec.hpp | 18 +++++--- sparse/src/KokkosSparse_gauss_seidel.hpp | 41 +++++++++++++++++-- .../src/KokkosSparse_gauss_seidel_handle.hpp | 12 ++++++ sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 17 ++++---- 5 files changed, 73 insertions(+), 16 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index f73b507439..091c0d02d7 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -60,6 +60,7 @@ block_spgemm gauss_seidel ------------ +.. doxygenfunction:: gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index f04ae34fc9..026c6c932d 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -120,14 +120,16 @@ struct gauss_seidel_apply_eti_spec_avail { namespace KokkosSparse { namespace Impl { -template ::value, bool eti_spec_avail = gauss_seidel_symbolic_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t>::value> struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, bool is_graph_symmetric); }; @@ -174,15 +176,19 @@ struct GAUSS_SEIDEL_APPLY { #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY -template -struct GAUSS_SEIDEL_SYMBOLIC +struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t_ entries, bool is_graph_symmetric) { Kokkos::Profiling::pushRegion("KokkosSparse::Impl::gauss_seidel_symbolic"); auto gsHandle = handle->get_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel< KernelHandle, a_size_view_t_, a_lno_view_t_, @@ -322,6 +328,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -337,6 +344,7 @@ struct GAUSS_SEIDEL_APPLY, \ diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 9f1b9d8cb1..93e88f5875 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,10 +29,13 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// +/// @tparam ExecSpaceIn This kernels execution space type. /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -42,9 +45,9 @@ namespace Experimental { /// num_rows submatrix of A is structurally symmetric /// @pre handle->create_gs_handle(...) has been called previously /// -template -void gauss_seidel_symbolic(KernelHandle *handle, +void gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -95,13 +98,43 @@ void gauss_seidel_symbolic(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_SYMBOLIC< - const_handle_type, Internal_alno_row_view_t_, - Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(&tmp_handle, num_rows, + ExecSpaceIn, const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(exec_space_in, + &tmp_handle, num_rows, num_cols, const_a_r, const_a_l, is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity +/// pattern only) +/// +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @pre handle->create_gs_handle(...) has been called previously +/// +template +void gauss_seidel_symbolic(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_symbolic(my_exec_space, handle, num_rows, num_cols, row_map, + entries, is_graph_symmetric); +} + /// /// @brief Block Gauss-Seidel preconditioner setup (first phase, based on /// sparsity pattern only) diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 134a100cc7..98624a4137 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -148,6 +148,18 @@ class GaussSeidelHandle { bool is_symbolic_called() const { return this->called_symbolic; } bool is_numeric_called() const { return this->called_numeric; } + template + void set_execution_space(const ExecSpaceIn exec_space_in) { + static bool is_set = false; + if (!is_set) { + static_assert(std::is_same::value, + "The type of exec_space_in should be the same as " + "GaussSeidelHandle::HandleExecSpace"); + this->execution_space = exec_space_in; + } + is_set = true; + } + void set_algorithm_type(const GSAlgorithm sgs_algo) { this->algorithm_type = sgs_algo; this->called_symbolic = false; diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 47ee0eb6f5..73b363a0b0 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -142,15 +142,17 @@ void run_gauss_seidel( kh.destroy_gs_handle(); } -template +template void run_gauss_seidel_streams( - std::vector &kh, std::vector &input_mat, - std::vector &x_vector, std::vector &y_vector, - bool is_symmetric_graph, typename crsMat_t::value_type omega, + std::vector &instances, std::vector &kh, + std::vector &input_mat, std::vector &x_vector, + std::vector &y_vector, bool is_symmetric_graph, + typename crsMat_t::value_type omega, int apply_type, // 0 for symmetric, 1 for forward, 2 for backward. int nstreams = 1) { for (int i = 0; i < nstreams; i++) { - gauss_seidel_symbolic(&kh[i], input_mat[i].numRows(), + gauss_seidel_symbolic(instances[i], &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), input_mat[i].graph.row_map, input_mat[i].graph.entries, is_symmetric_graph); gauss_seidel_numeric(&kh[i], input_mat[i].numRows(), input_mat[i].numCols(), @@ -797,8 +799,9 @@ void test_gauss_seidel_streams_rank1( for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(x_vector_v[i], zero); - run_gauss_seidel_streams(kh_v, input_mat_v, x_vector_v, y_vector_v, - symmetric, m_omega, apply_type, nstreams); + run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, + y_vector_v, symmetric, m_omega, apply_type, + nstreams); // double gs = timer1.seconds(); // KokkosKernels::Impl::print_1Dview(x_vector); } From 92ad57faac33ef7db7db60180931805468e7396a Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 15:43:53 -0600 Subject: [PATCH 17/29] docs: Add create_gs_handle docs --- docs/developer/apidocs/sparse.rst | 3 ++ sparse/src/KokkosKernels_Handle.hpp | 64 +++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 091c0d02d7..7dcf65ea2c 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -60,6 +60,9 @@ block_spgemm gauss_seidel ------------ +.. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(HandleExecSpace, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) .. doxygenfunction:: gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 307ff7b91c..7776d746af 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -601,6 +601,30 @@ class KokkosKernelsHandle { "GS."); return cgs; } + + /** + * @brief Create a gauss seidel handle object + * + * @param handle_exec_space The execution space instance to execute kernels + * on. + * @param num_streams The number of streams to allocate memory for. + * @param gs_algorithm Specifies which algorithm to use: + * KokkosSpace::GS_DEFAULT PointGaussSeidel + * KokkosSpace::GS_PERMUTED ?? + * KokkosSpace::GS_TEAM ?? + * KokkosSpace::GS_CLUSTER ?? + * KokkosSpace::GS_TWOSTAGE ?? + * @param coloring_algorithm Specifies which coloring algorithm to color the + * graph with: KokkosGraph::COLORING_DEFAULT ?? KokkosGraph::COLORING_SERIAL + * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with + * bit array KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept + * here for backwards compatibility for SPGEMM and other use cases) + */ void create_gs_handle( HandleExecSpace handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, @@ -618,6 +642,26 @@ class KokkosKernelsHandle { handle_exec_space, num_streams, gs_algorithm, coloring_algorithm); } + /** + * @brief Create a gauss seidel handle object + * + * @param gs_algorithm Specifies which algorithm to use: + * KokkosSpace::GS_DEFAULT PointGaussSeidel + * KokkosSpace::GS_PERMUTED ?? + * KokkosSpace::GS_TEAM ?? + * KokkosSpace::GS_CLUSTER ?? + * KokkosSpace::GS_TWOSTAGE ?? + * @param coloring_algorithm Specifies which coloring algorithm to color the + * graph with: KokkosGraph::COLORING_DEFAULT ?? KokkosGraph::COLORING_SERIAL + * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with + * bit array KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept + * here for backwards compatibility for SPGEMM and other use cases) + */ void create_gs_handle( KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = @@ -683,6 +727,26 @@ class KokkosKernelsHandle { gs2->setCompactForm(compact_form); } + /** + * @brief Create a gs handle object + * + * @param clusterAlgo Specifies which clustering algorithm to use: + * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? + * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? + * @param hint_verts_per_cluster Hint how many verticies to use per cluster + * @param coloring_algorithm Specifies which coloring algorithm to color the + * graph with: KokkosGraph::COLORING_DEFAULT ?? KokkosGraph::COLORING_SERIAL + * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with + * bit array KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept + * here for backwards compatibility for SPGEMM and other use cases) + */ void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = From 21f8aae598f43f6ad668747d87b05264c873e877 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 10 Aug 2023 15:56:50 -0600 Subject: [PATCH 18/29] docs: Improve docs formatting --- sparse/src/KokkosKernels_Handle.hpp | 92 +++++++++++++++++------------ 1 file changed, 53 insertions(+), 39 deletions(-) diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index 7776d746af..a23826f864 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -602,29 +602,33 @@ class KokkosKernelsHandle { return cgs; } + // clang-format off /** * @brief Create a gauss seidel handle object - * - * @param handle_exec_space The execution space instance to execute kernels - * on. + * + * @param handle_exec_space The execution space instance to execute kernels on. * @param num_streams The number of streams to allocate memory for. * @param gs_algorithm Specifies which algorithm to use: + * * KokkosSpace::GS_DEFAULT PointGaussSeidel * KokkosSpace::GS_PERMUTED ?? * KokkosSpace::GS_TEAM ?? * KokkosSpace::GS_CLUSTER ?? * KokkosSpace::GS_TWOSTAGE ?? - * @param coloring_algorithm Specifies which coloring algorithm to color the - * graph with: KokkosGraph::COLORING_DEFAULT ?? KokkosGraph::COLORING_SERIAL - * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring - * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array - * KokkosGraph::COLORING_VBCS Vertex Based Color Set - * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring - * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with - * bit array KokkosGraph::COLORING_EB Edge Based Coloring - * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept - * here for backwards compatibility for SPGEMM and other use cases) + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) */ + // clang-format on void create_gs_handle( HandleExecSpace handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, @@ -642,26 +646,31 @@ class KokkosKernelsHandle { handle_exec_space, num_streams, gs_algorithm, coloring_algorithm); } + // clang-format off /** * @brief Create a gauss seidel handle object - * + * * @param gs_algorithm Specifies which algorithm to use: + * * KokkosSpace::GS_DEFAULT PointGaussSeidel * KokkosSpace::GS_PERMUTED ?? * KokkosSpace::GS_TEAM ?? * KokkosSpace::GS_CLUSTER ?? * KokkosSpace::GS_TWOSTAGE ?? - * @param coloring_algorithm Specifies which coloring algorithm to color the - * graph with: KokkosGraph::COLORING_DEFAULT ?? KokkosGraph::COLORING_SERIAL - * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring - * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array - * KokkosGraph::COLORING_VBCS Vertex Based Color Set - * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring - * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with - * bit array KokkosGraph::COLORING_EB Edge Based Coloring - * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept - * here for backwards compatibility for SPGEMM and other use cases) + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) */ + // clang-format on void create_gs_handle( KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = @@ -727,26 +736,31 @@ class KokkosKernelsHandle { gs2->setCompactForm(compact_form); } + // clang-format off /** * @brief Create a gs handle object - * + * * @param clusterAlgo Specifies which clustering algorithm to use: - * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? - * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? - * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? - * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? + * + * KokkosSparse::ClusteringAlgorithm::CLUSTER_DEFAULT ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_MIS2 ?? + * KokkosSparse::ClusteringAlgorithm::CLUSTER_BALLOON ?? + * KokkosSparse::ClusteringAlgorithm::NUM_CLUSTERING_ALGORITHMS ?? * @param hint_verts_per_cluster Hint how many verticies to use per cluster - * @param coloring_algorithm Specifies which coloring algorithm to color the - * graph with: KokkosGraph::COLORING_DEFAULT ?? KokkosGraph::COLORING_SERIAL - * Serial Greedy Coloring KokkosGraph::COLORING_VB Vertex Based Coloring - * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array - * KokkosGraph::COLORING_VBCS Vertex Based Color Set - * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring - * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with - * bit array KokkosGraph::COLORING_EB Edge Based Coloring - * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept - * here for backwards compatibility for SPGEMM and other use cases) + * @param coloring_algorithm Specifies which coloring algorithm to color the graph with: + * + * KokkosGraph::COLORING_DEFAULT ?? + * KokkosGraph::COLORING_SERIAL Serial Greedy Coloring + * KokkosGraph::COLORING_VB Vertex Based Coloring + * KokkosGraph::COLORING_VBBIT Vertex Based Coloring with bit array + * KokkosGraph::COLORING_VBCS Vertex Based Color Set + * KokkosGraph::COLORING_VBD Vertex Based Deterministic Coloring + * KokkosGraph::COLORING_VBDBIT Vertex Based Deterministic Coloring with bit array + * KokkosGraph::COLORING_EB Edge Based Coloring + * KokkosGraph::COLORING_SERIAL2 Serial Distance-2 Graph Coloring (kept here for + * backwards compatibility for SPGEMM and other use cases) */ + // clang-format on void create_gs_handle(KokkosSparse::ClusteringAlgorithm clusterAlgo, nnz_lno_t hint_verts_per_cluster, KokkosGraph::ColoringAlgorithm coloring_algorithm = From 1adc5e418ad6f3e6e2db53ce9a8b1dfee3a7315f Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 14 Aug 2023 14:48:56 -0600 Subject: [PATCH 19/29] sparse/unit_test: Launch nrm2 on stream --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 73b363a0b0..90875e18bd 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -778,7 +778,7 @@ void test_gauss_seidel_streams_rank1( Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), nv); solution_x_v[i] = solution_x_tmp; create_random_x_vector(solution_x_v[i]); - initial_norm_res_v[i] = KokkosBlas::nrm2(solution_x_v[i]); + initial_norm_res_v[i] = KokkosBlas::nrm2(instances[i], solution_x_v[i]); y_vector_v[i] = create_random_y_vector(input_mat_v[i], solution_x_v[i]); // GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the // behavior of each algorithm _should be_ the same on every execution space, From 2082b66c2b5a7f0c10e993823e499824230b7f31 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 14 Aug 2023 18:14:13 -0600 Subject: [PATCH 20/29] sparse/src: Add GS numeric overload --- .../impl/KokkosSparse_gauss_seidel_spec.hpp | 32 ++-- sparse/src/KokkosSparse_gauss_seidel.hpp | 153 ++++++++++++++++-- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 7 +- 3 files changed, 168 insertions(+), 24 deletions(-) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 026c6c932d..b9a89d2579 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -135,20 +135,23 @@ struct GAUSS_SEIDEL_SYMBOLIC { }; template < - class KernelHandle, KokkosSparse::SparseMatrixFormat format, - class a_size_view_t_, class a_lno_view_t, class a_scalar_view_t, + class ExecSpaceIn, class KernelHandle, + KokkosSparse::SparseMatrixFormat format, class a_size_view_t_, + class a_lno_view_t, class a_scalar_view_t, bool tpl_spec_avail = gauss_seidel_numeric_tpl_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value, bool eti_spec_avail = gauss_seidel_numeric_eti_spec_avail< KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value> struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric); static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric); @@ -212,17 +215,20 @@ struct GAUSS_SEIDEL_SYMBOLIC -struct GAUSS_SEIDEL_NUMERIC +struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric) { Kokkos::Profiling::pushRegion("KokkosSparse::Impl::gauss_seidel_numeric"); auto gsHandle = handle->get_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidelget_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel, \ @@ -375,6 +384,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ extern template struct GAUSS_SEIDEL_NUMERIC< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -394,6 +404,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -409,6 +420,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ template struct GAUSS_SEIDEL_NUMERIC< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 93e88f5875..b7485e68dc 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -175,12 +175,15 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// +/// @tparam ExecSpaceIn This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -190,11 +193,12 @@ void block_gauss_seidel_symbolic( /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template -void gauss_seidel_numeric(KernelHandle *handle, +void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -258,9 +262,10 @@ void gauss_seidel_numeric(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - const_handle_type, format, Internal_alno_row_view_t_, + ExecSpaceIn, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(&tmp_handle, num_rows, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(exec_space_in, + &tmp_handle, num_rows, num_cols, const_a_r, const_a_l, const_a_v, is_graph_symmetric); @@ -284,7 +289,6 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @param row_map The matrix's rowmap /// @param entries The matrix's entries /// @param values The matrix's values -/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// @remark If the inverse diagonal is not already available, it's best to call @@ -296,6 +300,50 @@ template void gauss_seidel_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, + entries, values, is_graph_symmetric); +} + +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values). This version accepts the matrix's inverse diagonal from the +/// user. +/// +/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided +/// inverse diagonal must share this type. +/// @param exec_space_in The execution space instance this kernel will be run +/// on. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. +template +void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -362,25 +410,71 @@ void gauss_seidel_numeric(KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - const_handle_type, format, Internal_alno_row_view_t_, + ExecSpaceIn, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(&tmp_handle, num_rows, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(exec_space_in, + &tmp_handle, num_rows, num_cols, const_a_r, const_a_l, const_a_v, const_a_d, is_graph_symmetric); } +/// +/// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's +/// numeric values). This version accepts the matrix's inverse diagonal from the +/// user. +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided +/// inverse diagonal must share this type. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param given_inverse_diagonal The inverse (reciprocal) of diagonal +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// @remark If the inverse diagonal is not already available, it's best to call +/// the version of gauss_seidel_numeric that +/// doesn't take it as an argument. The inverse diagonal will be +/// computed internally. +template +void gauss_seidel_numeric(KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + lno_row_view_t_ row_map, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, + scalar_nnz_view_t_ given_inverse_diagonal, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, + entries, values, given_inverse_diagonal, + is_graph_symmetric); +} + /// /// @brief Block Gauss-Seidel preconditioner setup (second phase, based on /// matrix's numeric values) /// +/// @tparam ExecSpaceIn This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -391,12 +485,14 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template void block_gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -409,8 +505,43 @@ void block_gauss_seidel_numeric( } gsHandle->set_block_size(block_size); - gauss_seidel_numeric(handle, num_rows, num_cols, row_map, entries, - values, is_graph_symmetric); + gauss_seidel_numeric(exec_space_in, handle, num_rows, num_cols, + row_map, entries, values, is_graph_symmetric); +} + +/// +/// @brief Block Gauss-Seidel preconditioner setup (second phase, based on +/// matrix's numeric values) +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @param handle handle A KokkosKernelsHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param block_size The number of degrees of freedom per block +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param is_graph_symmetric Whether the upper-left num_rows x +/// num_rows submatrix of A is structurally symmetric +/// +template +void block_gauss_seidel_numeric( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, + typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + bool is_graph_symmetric = true) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, + entries, values, is_graph_symmetric); } /// diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 90875e18bd..0a5849cbb0 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -155,9 +155,10 @@ void run_gauss_seidel_streams( gauss_seidel_symbolic(instances[i], &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), input_mat[i].graph.row_map, input_mat[i].graph.entries, is_symmetric_graph); - gauss_seidel_numeric(&kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, is_symmetric_graph); + gauss_seidel_numeric(instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, + is_symmetric_graph); } const int apply_count = 2; From dd3fbb5375680ccf4ea299a64e95eeac6b617dfa Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 14 Aug 2023 18:15:19 -0600 Subject: [PATCH 21/29] sparse/src: Add GS apply overload. - Use execution space instance throughout. --- common/src/KokkosKernels_SimpleUtils.hpp | 4 - common/src/KokkosKernels_Utils.hpp | 60 +++-- docs/developer/apidocs/sparse.rst | 5 + .../impl/KokkosSparse_gauss_seidel_impl.hpp | 68 ++++-- .../impl/KokkosSparse_gauss_seidel_spec.hpp | 32 ++- sparse/src/KokkosSparse_gauss_seidel.hpp | 224 +++++++++++++++--- .../src/KokkosSparse_gauss_seidel_handle.hpp | 3 +- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 55 +++-- 8 files changed, 339 insertions(+), 112 deletions(-) diff --git a/common/src/KokkosKernels_SimpleUtils.hpp b/common/src/KokkosKernels_SimpleUtils.hpp index 64735874c6..e25ec54eb0 100644 --- a/common/src/KokkosKernels_SimpleUtils.hpp +++ b/common/src/KokkosKernels_SimpleUtils.hpp @@ -142,10 +142,6 @@ inline void kk_exclusive_parallel_prefix_sum( kk_exclusive_parallel_prefix_sum(MyExecSpace(), num_elements, arr, finalSum); } -template -void kk_inclusive_parallel_prefix_sum(MyExecSpace my_exec_space, - forward_array_type arr) {} - /// /// \brief Function performs the inclusive parallel prefix sum. That is each /// entry holds the sum until itself including itself. diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 11b4100f31..552e994892 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -687,9 +687,11 @@ void create_reverse_map( typedef Kokkos::RangePolicy range_policy_t; reverse_map_xadj = - reverse_array_type("Reverse Map Xadj", num_reverse_elements + 1); + reverse_array_type(Kokkos::view_alloc(my_exec_space, "Reverse Map Xadj"), + num_reverse_elements + 1); reverse_map_adj = reverse_array_type( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "REVERSE_ADJ"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "REVERSE_ADJ"), num_forward_elements); if (num_reverse_elements < MINIMUM_TO_ATOMIC) { @@ -703,7 +705,9 @@ void create_reverse_map( const reverse_lno_t tmp_reverse_size = (num_reverse_elements + 1) << multiply_shift_for_scale; - reverse_array_type tmp_color_xadj("TMP_REVERSE_XADJ", tmp_reverse_size + 1); + reverse_array_type tmp_color_xadj( + Kokkos::view_alloc(my_exec_space, "TMP_REVERSE_XADJ"), + tmp_reverse_size + 1); Reverse_Map_Scale_Init rmi( forward_map, tmp_color_xadj, multiply_shift_for_scale, @@ -714,7 +718,7 @@ void create_reverse_map( my_exec_space.fence(); inclusive_parallel_prefix_sum( - tmp_reverse_size + 1, tmp_color_xadj); + my_exec_space, tmp_reverse_size + 1, tmp_color_xadj); my_exec_space.fence(); Kokkos::parallel_for( @@ -734,7 +738,8 @@ void create_reverse_map( // atomic implementation. { reverse_array_type tmp_color_xadj( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "TMP_REVERSE_XADJ"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "TMP_REVERSE_XADJ"), num_reverse_elements + 1); Reverse_Map_Init rmi( @@ -747,9 +752,8 @@ void create_reverse_map( // print_1Dview(reverse_map_xadj); inclusive_parallel_prefix_sum( - num_reverse_elements + 1, reverse_map_xadj); - my_exec_space.fence(); - Kokkos::deep_copy(tmp_color_xadj, reverse_map_xadj); + my_exec_space, num_reverse_elements + 1, reverse_map_xadj); + Kokkos::deep_copy(my_exec_space, tmp_color_xadj, reverse_map_xadj); my_exec_space.fence(); Fill_Reverse_Map frm( forward_map, tmp_color_xadj, reverse_map_adj); @@ -804,18 +808,30 @@ struct PermuteVector { template -void permute_vector(typename idx_array_type::value_type num_elements, +void permute_vector(MyExecSpace my_exec_space, + typename idx_array_type::value_type num_elements, idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - typedef Kokkos::RangePolicy my_exec_space; + using range_policy_t = Kokkos::RangePolicy; Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", my_exec_space(0, num_elements), + "KokkosKernels::Common::PermuteVector", + range_policy_t(my_exec_space, 0, num_elements), PermuteVector( old_vector, new_vector, old_to_new_index_map)); } +template +void permute_vector(typename idx_array_type::value_type num_elements, + idx_array_type &old_to_new_index_map, + value_array_type &old_vector, + out_value_array_type &new_vector) { + permute_vector(MyExecSpace(), num_elements, old_to_new_index_map, old_vector, + new_vector); +} + template struct PermuteBlockVector { @@ -849,19 +865,30 @@ struct PermuteBlockVector { template -void permute_block_vector(typename idx_array_type::value_type num_elements, +void permute_block_vector(MyExecSpace my_exec_space, + typename idx_array_type::value_type num_elements, int block_size, idx_array_type &old_to_new_index_map, value_array_type &old_vector, out_value_array_type &new_vector) { - typedef Kokkos::RangePolicy my_exec_space; - + using range_policy_t = Kokkos::RangePolicy; Kokkos::parallel_for( - "KokkosKernels::Common::PermuteVector", my_exec_space(0, num_elements), + "KokkosKernels::Common::PermuteVector", + range_policy_t(my_exec_space, 0, num_elements), PermuteBlockVector(block_size, old_vector, new_vector, old_to_new_index_map)); } +template +void permute_block_vector(typename idx_array_type::value_type num_elements, + int block_size, idx_array_type &old_to_new_index_map, + value_array_type &old_vector, + out_value_array_type &new_vector) { + permute_block_vector(MyExecSpace(), num_elements, block_size, + old_to_new_index_map, old_vector, new_vector); +} + // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. @@ -1301,8 +1328,7 @@ void kk_view_reduce_max_row_size(const size_t num_rows, const size_type *rowmap_view_begins, const size_type *rowmap_view_ends, size_type &max_row_size) { - MyExecSpace my_exec_space; - return kk_view_reduce_max_row_size(my_exec_space, num_rows, + return kk_view_reduce_max_row_size(MyExecSpace(), num_rows, rowmap_view_begins, rowmap_view_ends, max_row_size); } diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 7dcf65ea2c..1f7e702fb4 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -65,10 +65,15 @@ gauss_seidel .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) .. doxygenfunction:: gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: symmetric_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 41809203e2..d9d45dbb85 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -874,6 +874,9 @@ class PointGaussSeidel { colors = gchandle->get_vertex_colors(); numColors = gchandle->get_num_colors(); } + // Wait for coloring to finish on its stream + using ColoringExecSpace = typename HandleType::HandleExecSpace; + ColoringExecSpace().fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "COLORING_TIME:" << timer.seconds() << std::endl; timer.reset(); @@ -921,11 +924,11 @@ class PointGaussSeidel { // Count long rows per color set, and sort color sets so that long rows // come after regular rows nnz_lno_persistent_work_view_t long_rows_per_color( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "long_rows_per_color"), numColors); nnz_lno_persistent_work_view_t max_row_length_per_color( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "max_row_length_per_color"), numColors); nnz_lno_t mostLongRowsInColor = 0; @@ -954,7 +957,8 @@ class PointGaussSeidel { my_exec_space.fence(); gsHandle->set_max_row_length_per_color(host_max_row_length_per_color); scalar_persistent_work_view_t long_row_x( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "long_row_x"), + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "long_row_x"), mostLongRowsInColor); gsHandle->set_long_row_x(long_row_x); } else { @@ -1134,6 +1138,7 @@ class PointGaussSeidel { gsHandle->set_new_adj(permuted_adj); gsHandle->set_old_to_new_map(old_to_new_map); gsHandle->set_call_symbolic(true); + my_exec_space.fence(); #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE std::cout << "ALLOC:" << timer.seconds() << std::endl; #endif @@ -1352,6 +1357,7 @@ class PointGaussSeidel { const_lno_row_view_t xadj = this->row_map; const_lno_nnz_view_t adj = this->entries; const_scalar_nnz_view_t adj_vals = this->values; + MyExecSpace my_exec_space = gsHandle->get_execution_space(); size_type nnz = adj_vals.extent(0); @@ -1362,14 +1368,16 @@ class PointGaussSeidel { nnz_lno_persistent_work_view_t color_adj = gsHandle->get_color_adj(); scalar_persistent_work_view_t permuted_adj_vals( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "newvals_"), nnz); + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, + "newvals_"), + nnz); int suggested_vector_size = this->handle->get_suggested_vector_size(num_rows, nnz); int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t rows_per_team = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), num_rows); + suggested_team_size, my_exec_space.concurrency(), num_rows); nnz_lno_t block_size = gsHandle->get_block_size(); nnz_lno_t block_matrix_size = block_size * block_size; @@ -1393,7 +1401,8 @@ class PointGaussSeidel { if (KokkosKernels::Impl::kk_is_gpu_exec_space()) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::Team_fill_matrix_numeric", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), fill_matrix_numeric(color_adj, xadj, // adj, @@ -1405,7 +1414,7 @@ class PointGaussSeidel { block_matrix_size)); } else { Kokkos::parallel_for("KokkosSparse::GaussSeidel::fill_matrix_numeric", - range_policy_t(0, num_rows), + range_policy_t(my_exec_space, 0, num_rows), fill_matrix_numeric(color_adj, xadj, // adj, adj_vals, newxadj_, @@ -1418,7 +1427,7 @@ class PointGaussSeidel { gsHandle->set_new_adj_val(permuted_adj_vals); scalar_persistent_work_view_t permuted_inverse_diagonal( - Kokkos::view_alloc(Kokkos::WithoutInitializing, + Kokkos::view_alloc(my_exec_space, Kokkos::WithoutInitializing, "permuted_inverse_diagonal"), num_rows * block_size); if (!have_diagonal_given) { @@ -1430,13 +1439,14 @@ class PointGaussSeidel { block_size > 1) { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::team_get_matrix_diagonals", - team_policy_t((num_rows + rows_per_team - 1) / rows_per_team, + team_policy_t(my_exec_space, + (num_rows + rows_per_team - 1) / rows_per_team, suggested_team_size, suggested_vector_size), gmd); } else { Kokkos::parallel_for( "KokkosSparse::GaussSeidel::get_matrix_diagonals", - range_policy_t(0, num_rows), gmd); + range_policy_t(my_exec_space, 0, num_rows), gmd); } } else { @@ -1444,13 +1454,13 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_block_vector< const_scalar_nnz_view_t, scalar_persistent_work_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, block_size, old_to_new_map, given_inverse_diagonal, - permuted_inverse_diagonal); + my_exec_space, num_rows, block_size, old_to_new_map, + given_inverse_diagonal, permuted_inverse_diagonal); else KokkosKernels::Impl::permute_vector< const_scalar_nnz_view_t, scalar_persistent_work_view_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, old_to_new_map, given_inverse_diagonal, + my_exec_space, num_rows, old_to_new_map, given_inverse_diagonal, permuted_inverse_diagonal); } @@ -1458,7 +1468,7 @@ class PointGaussSeidel { gsHandle->set_call_numeric(true); } #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE - MyExecSpace().fence(); + my_exec_space.fence(); std::cout << "NUMERIC:" << timer.seconds() << std::endl; #endif } @@ -1684,6 +1694,7 @@ class PointGaussSeidel { nnz_lno_persistent_work_view_t, MyExecSpace>( num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG + Kokkos::fence(); std::cout << "--point After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); std::cout << "--point Result X:"; @@ -1722,7 +1733,8 @@ class PointGaussSeidel { nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { - auto gsHandle = this->get_gs_handle(); + auto gsHandle = this->get_gs_handle(); + MyExecSpace my_exec_space = gsHandle->get_execution_space(); nnz_lno_persistent_work_host_view_t long_rows_per_color; nnz_lno_persistent_work_host_view_t max_row_length_per_color; scalar_persistent_work_view_t long_row_x; @@ -1733,7 +1745,7 @@ class PointGaussSeidel { max_row_length_per_color = gsHandle->get_max_row_length_per_color(); long_row_x = gsHandle->get_long_row_x(); haveLongRows = true; - longrow_apply_team_policy_t tempPolicy(1, 1); + longrow_apply_team_policy_t tempPolicy(my_exec_space, 1, 1); longRowTeamSize = tempPolicy.team_size_recommended(gs, Kokkos::ParallelForTag()); } @@ -1782,7 +1794,8 @@ class PointGaussSeidel { Kokkos::parallel_for( labelRegular, Kokkos::Experimental::require( - team_policy_t((numRegularRows + team_row_chunk_size - 1) / + team_policy_t(my_exec_space, + (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), Kokkos::Experimental::WorkItemProperty::HintLightWeight), @@ -1792,6 +1805,7 @@ class PointGaussSeidel { labelBlock, Kokkos::Experimental::require( block_apply_team_policy_t( + my_exec_space, (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), @@ -1802,6 +1816,7 @@ class PointGaussSeidel { labelBigBlock, Kokkos::Experimental::require( bigblock_apply_team_policy_t( + my_exec_space, (numRegularRows + team_row_chunk_size - 1) / team_row_chunk_size, suggested_team_size, vector_size), @@ -1828,14 +1843,16 @@ class PointGaussSeidel { Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( - longrow_apply_team_policy_t(numLongRows * teams_per_row, + longrow_apply_team_policy_t(my_exec_space, + numLongRows * teams_per_row, longRowTeamSize), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", Kokkos::Experimental::require( - range_policy_t(color_index_end - numLongRows, + range_policy_t(my_exec_space, + color_index_end - numLongRows, color_index_end), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( @@ -1852,7 +1869,8 @@ class PointGaussSeidel { nnz_lno_persistent_work_host_view_t h_color_xadj, int num_iteration, bool apply_forward, bool apply_backward) { - auto gsHandle = this->get_gs_handle(); + auto gsHandle = this->get_gs_handle(); + MyExecSpace my_exec_space = gsHandle->get_execution_space(); nnz_lno_persistent_work_host_view_t long_rows_per_color; nnz_lno_persistent_work_host_view_t max_row_length_per_color; scalar_persistent_work_view_t long_row_x; @@ -1889,7 +1907,7 @@ class PointGaussSeidel { Kokkos::parallel_for( labelShort, Kokkos::Experimental::require( - range_policy_t(color_index_begin, + range_policy_t(my_exec_space, color_index_begin, color_index_end - numLongRows), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); @@ -1906,18 +1924,20 @@ class PointGaussSeidel { auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; - Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); + my_exec_space.fence(); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( Kokkos::RangePolicy( - 0, numLongRows * par_per_row), + my_exec_space, 0, numLongRows * par_per_row), Kokkos::Experimental::WorkItemProperty::HintLightWeight), gs); Kokkos::parallel_for( "KokkosSparse::GaussSeidel::LongRows::x_update", Kokkos::Experimental::require( - range_policy_t(color_index_end - numLongRows, + range_policy_t(my_exec_space, + color_index_end - numLongRows, color_index_end), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LongRowUpdateFunctor( diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index b9a89d2579..84c9dccf5c 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -157,9 +157,10 @@ struct GAUSS_SEIDEL_NUMERIC { a_scalar_view_t given_inverse_diagonal, bool is_graph_symmetric); }; -template ::value, @@ -168,7 +169,8 @@ template ::value> struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -287,14 +289,17 @@ struct GAUSS_SEIDEL_NUMERIC -struct GAUSS_SEIDEL_APPLY { +template +struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -303,6 +308,7 @@ struct GAUSS_SEIDEL_APPLYget_gs_handle(); + gsHandle->set_execution_space(exec_space_in); if (gsHandle->get_algorithm_type() == GS_CLUSTER) { using SGS = typename Impl::ClusterGaussSeidel, \ @@ -461,6 +468,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ extern template struct GAUSS_SEIDEL_APPLY< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ @@ -486,6 +494,7 @@ struct GAUSS_SEIDEL_APPLY, \ @@ -507,6 +516,7 @@ struct GAUSS_SEIDEL_APPLY >, \ false, true>; \ template struct GAUSS_SEIDEL_APPLY< \ + EXEC_SPACE_TYPE, \ KokkosKernels::Experimental::KokkosKernelsHandle< \ const OFFSET_TYPE, const ORDINAL_TYPE, const SCALAR_TYPE, \ EXEC_SPACE_TYPE, MEM_SPACE_TYPE, MEM_SPACE_TYPE>, \ diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index b7485e68dc..505b8c55a8 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -505,8 +505,13 @@ void block_gauss_seidel_numeric( } gsHandle->set_block_size(block_size); - gauss_seidel_numeric(exec_space_in, handle, num_rows, num_cols, - row_map, entries, values, is_graph_symmetric); + gauss_seidel_numeric(exec_space_in, handle, num_rows, + num_cols, row_map, entries, values, + is_graph_symmetric); + + /* gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, + entries, values, given_inverse_diagonal, + is_graph_symmetric); */ } /// @@ -540,14 +545,16 @@ void block_gauss_seidel_numeric( lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric = true) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, - entries, values, is_graph_symmetric); + block_gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, + block_size, row_map, entries, values, + is_graph_symmetric); } /// /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// +/// @tparam ExecSpaceIn This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -558,6 +565,8 @@ void block_gauss_seidel_numeric( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -574,13 +583,15 @@ void block_gauss_seidel_numeric( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void symmetric_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -696,13 +707,63 @@ void symmetric_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, true, true); + gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, + const_a_r, const_a_l, const_a_v, nonconst_x_v, + const_y_v, init_zero_x_vector, update_y_vector, omega, + numIter, true, true); +} + +/// +/// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to +/// system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle handle A KokkosKernelsHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run (forward and backward counts as 1) +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void symmetric_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + symmetric_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, + row_map, entries, values, x_lhs_output_vec, + y_rhs_input_vec, init_zero_x_vector, + update_y_vector, omega, numIter); } /// @@ -785,6 +846,8 @@ void symmetric_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -801,13 +864,15 @@ void symmetric_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void forward_sweep_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -925,13 +990,62 @@ void forward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, true, false); + gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, + const_a_r, const_a_l, const_a_v, nonconst_x_v, + const_y_v, init_zero_x_vector, update_y_vector, omega, + numIter, true, false); +} + +/// +/// @brief Apply forward Gauss-Seidel preconditioner to system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void forward_sweep_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + forward_sweep_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, + row_map, entries, values, x_lhs_output_vec, + y_rhs_input_vec, init_zero_x_vector, + update_y_vector, omega, numIter); } /// @@ -1003,6 +1117,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// +/// @tparam ExecSpaceIn This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -1013,6 +1128,8 @@ void forward_sweep_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. +/// @param exec_space_in The execution space instance this kernel will be run +/// on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -1029,13 +1146,15 @@ void forward_sweep_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void backward_sweep_gauss_seidel_apply( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + ExecSpaceIn &exec_space_in, KernelHandle *handle, + typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, @@ -1153,13 +1272,62 @@ void backward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(&tmp_handle, num_rows, num_cols, const_a_r, const_a_l, - const_a_v, nonconst_x_v, const_y_v, init_zero_x_vector, - update_y_vector, omega, numIter, false, true); + gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, + const_a_r, const_a_l, const_a_v, nonconst_x_v, + const_y_v, init_zero_x_vector, update_y_vector, omega, + numIter, false, true); +} + +/// +/// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y +/// +/// @tparam format The matrix storage format, CRS or BSR +/// @tparam KernelHandle A specialization of +/// KokkosKernels::Experimental::KokkosKernelsHandle +/// @tparam lno_row_view_t_ The matrix's rowmap type +/// @tparam lno_nnz_view_t_ The matrix's entries type +/// @tparam scalar_nnz_view_t_ The matrix's values type +/// @tparam x_scalar_view_t The type of the X (left-hand side, unknown) vector. +/// May be rank-1 or rank-2 View. +/// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be +/// rank-1 or rank-2 View. +/// @param handle KernelHandle instance +/// @param num_rows Number of rows in the matrix +/// @param num_cols Number of columns in the matrix +/// @param row_map The matrix's rowmap +/// @param entries The matrix's entries +/// @param values The matrix's values +/// @param x_lhs_output_vec The X (left-hand side, unknown) vector +/// @param y_rhs_input_vec The Y (right-hand side) vector +/// @param init_zero_x_vector Whether to zero out X before applying +/// @param update_y_vector Whether Y has changed since the last call to apply +/// @param omega The damping factor for successive over-relaxation +/// @param numIter How many iterations to run +/// @pre x_lhs_output_vec.extent(0) == num_cols +/// @pre y_rhs_input_vec.extent(0) == num_rows +/// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) +/// +template +void backward_sweep_gauss_seidel_apply( + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, + typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, + x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, + bool init_zero_x_vector, bool update_y_vector, + typename KernelHandle::nnz_scalar_t omega, int numIter) { + auto my_exec_space = handle->get_gs_handle()->get_execution_space(); + backward_sweep_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, + row_map, entries, values, x_lhs_output_vec, + y_rhs_input_vec, init_zero_x_vector, + update_y_vector, omega, numIter); } /// diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 98624a4137..80ba6e5153 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -104,8 +104,7 @@ class GaussSeidelHandle { * \brief Default constructor. */ GaussSeidelHandle(GSAlgorithm gs) - : execution_space(HandleExecSpace()), - num_streams(1), + : num_streams(1), algorithm_type(gs), color_xadj(), color_adj(), diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 0a5849cbb0..820d9ae447 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -166,31 +166,31 @@ void run_gauss_seidel_streams( switch (apply_type) { case 0: symmetric_gauss_seidel_apply( - &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, - apply_count); + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); break; case 1: forward_sweep_gauss_seidel_apply( - &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, - apply_count); + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); break; case 2: backward_sweep_gauss_seidel_apply( - &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, - apply_count); + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); break; default: symmetric_gauss_seidel_apply( - &kh[i], input_mat[i].numRows(), input_mat[i].numCols(), - input_mat[i].graph.row_map, input_mat[i].graph.entries, - input_mat[i].values, x_vector[i], y_vector[i], false, true, omega, - apply_count); + instances[i], &kh[i], input_mat[i].numRows(), + input_mat[i].numCols(), input_mat[i].graph.row_map, + input_mat[i].graph.entries, input_mat[i].values, x_vector[i], + y_vector[i], false, true, omega, apply_count); break; } } @@ -779,7 +779,7 @@ void test_gauss_seidel_streams_rank1( Kokkos::view_alloc(Kokkos::WithoutInitializing, "X (correct)"), nv); solution_x_v[i] = solution_x_tmp; create_random_x_vector(solution_x_v[i]); - initial_norm_res_v[i] = KokkosBlas::nrm2(instances[i], solution_x_v[i]); + initial_norm_res_v[i] = KokkosBlas::nrm2(solution_x_v[i]); y_vector_v[i] = create_random_y_vector(input_mat_v[i], solution_x_v[i]); // GS_DEFAULT is GS_TEAM on CUDA and GS_PERMUTED on other spaces, and the // behavior of each algorithm _should be_ the same on every execution space, @@ -798,23 +798,26 @@ void test_gauss_seidel_streams_rank1( for (int apply_type = 0; apply_type < apply_count; ++apply_type) { Kokkos::Timer timer1; - for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(x_vector_v[i], zero); + for (int i = 0; i < nstreams; i++) + Kokkos::deep_copy(instances[i], x_vector_v[i], zero); + for (int i = 0; i < nstreams; i++) instances[i].fence(); run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, y_vector_v, symmetric, m_omega, apply_type, nstreams); // double gs = timer1.seconds(); // KokkosKernels::Impl::print_1Dview(x_vector); + for (int i = 0; i < nstreams; i++) { + instances[i].fence(); // Wait for apply to finish updating x_vector + KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, + x_vector_v[i]); + mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); + std::string info = "on stream_idx: " + std::to_string(i); + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; + } } - // Check result - for (int i = 0; i < nstreams; i++) { - KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, x_vector_v[i]); - mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); - std::string info = "on stream_idx: " + std::to_string(i); - EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; - kh_v[i].destroy_gs_handle(); - } + for (int i = 0; i < nstreams; i++) kh_v[i].destroy_gs_handle(); } #define KOKKOSKERNELS_EXECUTE_TEST(SCALAR, ORDINAL, OFFSET, DEVICE) \ From 5c41061189d9ca4d218368e97405ffdebf195874 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Mon, 21 Aug 2023 16:26:36 -0600 Subject: [PATCH 22/29] Various fixes --- common/src/KokkosKernels_Utils.hpp | 17 +++++- .../impl/KokkosSparse_gauss_seidel_impl.hpp | 42 ++++++++------- sparse/src/KokkosSparse_Utils.hpp | 17 ++++-- sparse/src/KokkosSparse_gauss_seidel.hpp | 54 ++----------------- .../src/KokkosSparse_gauss_seidel_handle.hpp | 3 +- 5 files changed, 58 insertions(+), 75 deletions(-) diff --git a/common/src/KokkosKernels_Utils.hpp b/common/src/KokkosKernels_Utils.hpp index 552e994892..c6780185a4 100644 --- a/common/src/KokkosKernels_Utils.hpp +++ b/common/src/KokkosKernels_Utils.hpp @@ -892,11 +892,24 @@ void permute_block_vector(typename idx_array_type::value_type num_elements, // TODO BMK: clean this up by removing 1st argument. It is unused but // its name gives the impression that only num_elements of the vector are // zeroed, when really it's always the whole thing. +template +void zero_vector(ExecSpaceIn &exec_space_in, + typename value_array_type::value_type /* num_elements */, + value_array_type &vector) { + typedef typename value_array_type::non_const_value_type val_type; + Kokkos::deep_copy(exec_space_in, vector, + Kokkos::ArithTraits::zero()); + exec_space_in.fence(); +} + template void zero_vector(typename value_array_type::value_type /* num_elements */, value_array_type &vector) { - typedef typename value_array_type::non_const_value_type val_type; - Kokkos::deep_copy(vector, Kokkos::ArithTraits::zero()); + using ne_tmp_t = typename value_array_type::value_type; + ne_tmp_t ne_tmp = ne_tmp_t(0); + MyExecSpace my_exec_space; + zero_vector(my_exec_space, ne_tmp, + vector); } template diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index d9d45dbb85..1f386a28a4 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1091,8 +1091,8 @@ class PointGaussSeidel { size_type num_large_rows = 0; KokkosSparse::Impl::kk_reduce_numrows_larger_than_threshold< row_lno_persistent_work_view_t, MyExecSpace>( - brows, permuted_xadj, num_values_in_l1, num_large_rows, - my_exec_space); + my_exec_space, brows, permuted_xadj, num_values_in_l1, + num_large_rows); num_big_rows = KOKKOSKERNELS_MACRO_MIN( num_large_rows, (size_type)(my_exec_space.concurrency() / suggested_vector_size)); @@ -1463,7 +1463,6 @@ class PointGaussSeidel { my_exec_space, num_rows, old_to_new_map, given_inverse_diagonal, permuted_inverse_diagonal); } - gsHandle->set_permuted_inverse_diagonal(permuted_inverse_diagonal); gsHandle->set_call_numeric(true); } @@ -1530,24 +1529,25 @@ class PointGaussSeidel { scalar_persistent_work_view_t permuted_inverse_diagonal = gsHandle->get_permuted_inverse_diagonal(); - color_t numColors = gsHandle->get_num_colors(); + color_t numColors = gsHandle->get_num_colors(); + auto my_exec_space = gsHandle->get_execution_space(); if (update_y_vector) { KokkosKernels::Impl::permute_block_vector< y_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, block_size, old_to_new_map, y_rhs_input_vec, + my_exec_space, num_rows, block_size, old_to_new_map, y_rhs_input_vec, Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector(num_cols * block_size, - Permuted_Xvector); + KokkosKernels::Impl::zero_vector< + MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + my_exec_space, num_cols * block_size, Permuted_Xvector); } else { KokkosKernels::Impl::permute_block_vector< x_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, block_size, old_to_new_map, x_lhs_output_vec, + my_exec_space, num_cols, block_size, old_to_new_map, x_lhs_output_vec, Permuted_Xvector); } @@ -1580,7 +1580,7 @@ class PointGaussSeidel { int suggested_team_size = this->handle->get_suggested_team_size(suggested_vector_size); nnz_lno_t team_row_chunk_size = this->handle->get_team_work_size( - suggested_team_size, MyExecSpace().concurrency(), brows); + suggested_team_size, my_exec_space.concurrency(), brows); // size_t shmem_size_to_use = this->handle->get_shmem_size(); size_t l1_shmem_size = gsHandle->get_level_1_mem(); @@ -1613,7 +1613,8 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_block_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, block_size, color_adj, Permuted_Xvector, x_lhs_output_vec); + my_exec_space, num_cols, block_size, color_adj, Permuted_Xvector, + x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG std::cout << "After X:"; KokkosKernels::Impl::print_1Dview(Permuted_Xvector); @@ -1631,7 +1632,8 @@ class PointGaussSeidel { nnz_scalar_t omega = Kokkos::ArithTraits::one(), bool apply_forward = true, bool apply_backward = true, bool update_y_vector = true) { - auto gsHandle = get_gs_handle(); + auto gsHandle = get_gs_handle(); + auto my_exec_space = gsHandle->get_execution_space(); auto Permuted_Xvector = gsHandle->get_permuted_x_vector(); auto Permuted_Yvector = gsHandle->get_permuted_y_vector(); @@ -1651,16 +1653,19 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_vector< y_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_rows, old_to_new_map, y_rhs_input_vec, Permuted_Yvector); + my_exec_space, num_rows, old_to_new_map, y_rhs_input_vec, + Permuted_Yvector); } if (init_zero_x_vector) { - KokkosKernels::Impl::zero_vector(num_cols, Permuted_Xvector); + KokkosKernels::Impl::zero_vector< + MyExecSpace, scalar_persistent_work_view2d_t, MyExecSpace>( + my_exec_space, num_cols, Permuted_Xvector); } else { KokkosKernels::Impl::permute_vector< x_value_array_type, scalar_persistent_work_view2d_t, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, old_to_new_map, x_lhs_output_vec, Permuted_Xvector); + my_exec_space, num_cols, old_to_new_map, x_lhs_output_vec, + Permuted_Xvector); } #if KOKKOSSPARSE_IMPL_PRINTDEBUG @@ -1692,7 +1697,7 @@ class PointGaussSeidel { KokkosKernels::Impl::permute_vector< scalar_persistent_work_view2d_t, x_value_array_type, nnz_lno_persistent_work_view_t, MyExecSpace>( - num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); + my_exec_space, num_cols, color_adj, Permuted_Xvector, x_lhs_output_vec); #if KOKKOSSPARSE_IMPL_PRINTDEBUG Kokkos::fence(); std::cout << "--point After X:"; @@ -1839,7 +1844,8 @@ class PointGaussSeidel { auto Ycol = Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; - Kokkos::deep_copy(long_row_x, nnz_scalar_t()); + Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); + my_exec_space.fence(); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( diff --git a/sparse/src/KokkosSparse_Utils.hpp b/sparse/src/KokkosSparse_Utils.hpp index 88258356ef..f61f470814 100644 --- a/sparse/src/KokkosSparse_Utils.hpp +++ b/sparse/src/KokkosSparse_Utils.hpp @@ -1885,10 +1885,9 @@ struct ReduceLargerRowCount { template void kk_reduce_numrows_larger_than_threshold( - size_t num_elements, view_type view_to_reduce, - typename view_type::const_value_type threshold, - typename view_type::non_const_value_type &sum_reduction, - MyExecSpace my_exec_space = MyExecSpace()) { + const MyExecSpace &my_exec_space, size_t num_elements, + view_type view_to_reduce, typename view_type::const_value_type threshold, + typename view_type::non_const_value_type &sum_reduction) { typedef Kokkos::RangePolicy range_policy_t; Kokkos::parallel_reduce( "KokkosKernels::Common::ReduceNumRowsLargerThanThreshold", @@ -1897,6 +1896,16 @@ void kk_reduce_numrows_larger_than_threshold( sum_reduction); } +template +void kk_reduce_numrows_larger_than_threshold( + size_t num_elements, view_type view_to_reduce, + typename view_type::const_value_type threshold, + typename view_type::non_const_value_type &sum_reduction) { + MyExecSpace my_exec_space; + kk_reduce_numrows_larger_than_threshold( + my_exec_space, num_elements, view_to_reduce, threshold, sum_reduction); +} + // Note: "block" in member name means it's block internal - otherwise it // addresses sparse rows/columns (whole blocks) within whole matrix. template diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 505b8c55a8..9db3a1b2d3 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -466,15 +466,12 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @brief Block Gauss-Seidel preconditioner setup (second phase, based on /// matrix's numeric values) /// -/// @tparam ExecSpaceIn This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param exec_space_in The execution space instance this kernel will be run -/// on. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -485,14 +482,12 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template void block_gauss_seidel_numeric( - ExecSpaceIn &exec_space_in, KernelHandle *handle, - typename KernelHandle::const_nnz_lno_t num_rows, + KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -505,49 +500,8 @@ void block_gauss_seidel_numeric( } gsHandle->set_block_size(block_size); - gauss_seidel_numeric(exec_space_in, handle, num_rows, - num_cols, row_map, entries, values, - is_graph_symmetric); - - /* gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, - entries, values, given_inverse_diagonal, - is_graph_symmetric); */ -} - -/// -/// @brief Block Gauss-Seidel preconditioner setup (second phase, based on -/// matrix's numeric values) -/// -/// @tparam format The matrix storage format, CRS or BSR -/// @tparam KernelHandle A specialization of -/// KokkosKernels::Experimental::KokkosKernelsHandle -/// @tparam lno_row_view_t_ The matrix's rowmap type -/// @tparam lno_nnz_view_t_ The matrix's entries type -/// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param handle handle A KokkosKernelsHandle instance -/// @param num_rows Number of rows in the matrix -/// @param num_cols Number of columns in the matrix -/// @param block_size The number of degrees of freedom per block -/// @param row_map The matrix's rowmap -/// @param entries The matrix's entries -/// @param values The matrix's values -/// @param is_graph_symmetric Whether the upper-left num_rows x -/// num_rows submatrix of A is structurally symmetric -/// -template -void block_gauss_seidel_numeric( - KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, - typename KernelHandle::const_nnz_lno_t num_cols, - typename KernelHandle::const_nnz_lno_t block_size, lno_row_view_t_ row_map, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, - bool is_graph_symmetric = true) { - auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - block_gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, - block_size, row_map, entries, values, - is_graph_symmetric); + gauss_seidel_numeric(handle, num_rows, num_cols, row_map, entries, + values, is_graph_symmetric); } /// diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 80ba6e5153..98624a4137 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -104,7 +104,8 @@ class GaussSeidelHandle { * \brief Default constructor. */ GaussSeidelHandle(GSAlgorithm gs) - : num_streams(1), + : execution_space(HandleExecSpace()), + num_streams(1), algorithm_type(gs), color_xadj(), color_adj(), From f5618f874750d9f3d518a655368e2bb8dc8ed445 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 07:28:55 -0600 Subject: [PATCH 23/29] sparse/src: Update GS apply docs --- sparse/src/KokkosSparse_gauss_seidel.hpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 9db3a1b2d3..b7ce643cf9 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -520,7 +520,7 @@ void block_gauss_seidel_numeric( /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. /// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -801,7 +801,7 @@ void symmetric_block_gauss_seidel_apply( /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. /// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -1083,7 +1083,7 @@ void forward_sweep_block_gauss_seidel_apply( /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. /// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix From e8d809ffc3968906148ef0aaffc4892eb3aaacf8 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 14:05:50 -0600 Subject: [PATCH 24/29] Pass format through --- sparse/src/KokkosSparse_gauss_seidel.hpp | 35 ++++++++++++------------ 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index b7ce643cf9..02faca0729 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -306,8 +306,9 @@ void gauss_seidel_numeric(KernelHandle *handle, scalar_nnz_view_t_ values, bool is_graph_symmetric = true) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, - entries, values, is_graph_symmetric); + gauss_seidel_numeric( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + is_graph_symmetric); } /// @@ -457,9 +458,9 @@ void gauss_seidel_numeric(KernelHandle *handle, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric = true) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - gauss_seidel_numeric(my_exec_space, handle, num_rows, num_cols, row_map, - entries, values, given_inverse_diagonal, - is_graph_symmetric); + gauss_seidel_numeric( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + given_inverse_diagonal, is_graph_symmetric); } /// @@ -714,10 +715,10 @@ void symmetric_gauss_seidel_apply( bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - symmetric_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, - row_map, entries, values, x_lhs_output_vec, - y_rhs_input_vec, init_zero_x_vector, - update_y_vector, omega, numIter); + symmetric_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// @@ -996,10 +997,10 @@ void forward_sweep_gauss_seidel_apply( bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - forward_sweep_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, - row_map, entries, values, x_lhs_output_vec, - y_rhs_input_vec, init_zero_x_vector, - update_y_vector, omega, numIter); + forward_sweep_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// @@ -1278,10 +1279,10 @@ void backward_sweep_gauss_seidel_apply( bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) { auto my_exec_space = handle->get_gs_handle()->get_execution_space(); - backward_sweep_gauss_seidel_apply(my_exec_space, handle, num_rows, num_cols, - row_map, entries, values, x_lhs_output_vec, - y_rhs_input_vec, init_zero_x_vector, - update_y_vector, omega, numIter); + backward_sweep_gauss_seidel_apply( + my_exec_space, handle, num_rows, num_cols, row_map, entries, values, + x_lhs_output_vec, y_rhs_input_vec, init_zero_x_vector, update_y_vector, + omega, numIter); } /// From 956e4c80ffaa23785ece6ef579f46090a2f69262 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 14:54:48 -0600 Subject: [PATCH 25/29] Use ExecutionSpace for user-facing APIs --- docs/developer/apidocs/sparse.rst | 12 +-- sparse/src/KokkosSparse_gauss_seidel.hpp | 111 +++++++++++------------ 2 files changed, 59 insertions(+), 64 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index 1f7e702fb4..d35a4eb851 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -63,17 +63,17 @@ gauss_seidel .. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(HandleExecSpace, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) -.. doxygenfunction:: gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_symbolic(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) -.. doxygenfunction:: gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) -.. doxygenfunction:: gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) -.. doxygenfunction:: symmetric_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: symmetric_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) -.. doxygenfunction:: forward_sweep_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) -.. doxygenfunction:: backward_sweep_gauss_seidel_apply(ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index 02faca0729..f67d3bd17b 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -29,13 +29,12 @@ namespace Experimental { /// @brief Gauss-Seidel preconditioner setup (first phase, based on sparsity /// pattern only) /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type -/// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// @param space The execution space instance this kernel will be run on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -45,9 +44,9 @@ namespace Experimental { /// num_rows submatrix of A is structurally symmetric /// @pre handle->create_gs_handle(...) has been called previously /// -template -void gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, +template +void gauss_seidel_symbolic(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -98,11 +97,10 @@ void gauss_seidel_symbolic(ExecSpaceIn &exec_space_in, KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_SYMBOLIC< - ExecSpaceIn, const_handle_type, Internal_alno_row_view_t_, - Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(exec_space_in, - &tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, + ExecutionSpace, const_handle_type, Internal_alno_row_view_t_, + Internal_alno_nnz_view_t_>::gauss_seidel_symbolic(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, is_graph_symmetric); } @@ -175,15 +173,14 @@ void block_gauss_seidel_symbolic( /// @brief Gauss-Seidel preconditioner setup (second phase, based on matrix's /// numeric values) /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle /// @tparam lno_row_view_t_ The matrix's rowmap type /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type -/// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// @param space The execution space instance this kernel will be run on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -193,12 +190,12 @@ void block_gauss_seidel_symbolic( /// @param is_graph_symmetric Whether the upper-left num_rows x /// num_rows submatrix of A is structurally symmetric /// -template -void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, +void gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -262,12 +259,12 @@ void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - ExecSpaceIn, const_handle_type, format, Internal_alno_row_view_t_, + ExecutionSpace, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(exec_space_in, - &tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, const_a_v, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, + const_a_v, is_graph_symmetric); } @@ -316,7 +313,7 @@ void gauss_seidel_numeric(KernelHandle *handle, /// numeric values). This version accepts the matrix's inverse diagonal from the /// user. /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -324,8 +321,7 @@ void gauss_seidel_numeric(KernelHandle *handle, /// @tparam lno_nnz_view_t_ The matrix's entries type /// @tparam scalar_nnz_view_t_ The matrix's values type. The user-provided /// inverse diagonal must share this type. -/// @param exec_space_in The execution space instance this kernel will be run -/// on. +/// @param space The execution space instance this kernel will be run on. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix /// @param num_cols Number of columns in the matrix @@ -339,12 +335,12 @@ void gauss_seidel_numeric(KernelHandle *handle, /// the version of gauss_seidel_numeric that /// doesn't take it as an argument. The inverse diagonal will be /// computed internally. -template -void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, +void gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -411,13 +407,12 @@ void gauss_seidel_numeric(ExecSpaceIn &exec_space_in, KernelHandle *handle, using namespace KokkosSparse::Impl; GAUSS_SEIDEL_NUMERIC< - ExecSpaceIn, const_handle_type, format, Internal_alno_row_view_t_, + ExecutionSpace, const_handle_type, format, Internal_alno_row_view_t_, Internal_alno_nnz_view_t_, - Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(exec_space_in, - &tmp_handle, num_rows, - num_cols, const_a_r, - const_a_l, const_a_v, - const_a_d, + Internal_ascalar_nnz_view_t_>::gauss_seidel_numeric(space, &tmp_handle, + num_rows, num_cols, + const_a_r, const_a_l, + const_a_v, const_a_d, is_graph_symmetric); } @@ -509,7 +504,7 @@ void block_gauss_seidel_numeric( /// @brief Apply symmetric (forward + backward) Gauss-Seidel preconditioner to /// system AX=Y /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -520,7 +515,7 @@ void block_gauss_seidel_numeric( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. -/// @param exec_space_in The execution space instance this kernel will be run +/// @param space The execution space instance this kernel will be run /// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle handle A KokkosKernelsHandle instance /// @param num_rows Number of rows in the matrix @@ -538,14 +533,14 @@ void block_gauss_seidel_numeric( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void symmetric_gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -662,14 +657,14 @@ void symmetric_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, - const_a_r, const_a_l, const_a_v, nonconst_x_v, - const_y_v, init_zero_x_vector, update_y_vector, omega, - numIter, true, true); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + true, true); } /// @@ -801,7 +796,7 @@ void symmetric_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. -/// @param exec_space_in The execution space instance this kernel will be run +/// @param space The execution space instance this kernel will be run /// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix @@ -819,14 +814,14 @@ void symmetric_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void forward_sweep_gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -945,14 +940,14 @@ void forward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, - const_a_r, const_a_l, const_a_v, nonconst_x_v, - const_y_v, init_zero_x_vector, update_y_vector, omega, - numIter, true, false); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + true, false); } /// @@ -1072,7 +1067,7 @@ void forward_sweep_block_gauss_seidel_apply( /// /// @brief Apply backward Gauss-Seidel preconditioner to system AX=Y /// -/// @tparam ExecSpaceIn This kernels execution space type. +/// @tparam ExecutionSpace This kernels execution space type. /// @tparam format The matrix storage format, CRS or BSR /// @tparam KernelHandle A specialization of /// KokkosKernels::Experimental::KokkosKernelsHandle @@ -1083,7 +1078,7 @@ void forward_sweep_block_gauss_seidel_apply( /// May be rank-1 or rank-2 View. /// @tparam y_scalar_view_t The type of the Y (right-hand side) vector. May be /// rank-1 or rank-2 View. -/// @param exec_space_in The execution space instance this kernel will be run +/// @param space The execution space instance this kernel will be run /// on. NOTE: Currently only used for GS_DEFAULT. /// @param handle KernelHandle instance /// @param num_rows Number of rows in the matrix @@ -1101,14 +1096,14 @@ void forward_sweep_block_gauss_seidel_apply( /// @pre y_rhs_input_vec.extent(0) == num_rows /// @pre x_lhs_output_vec.extent(1) == y_rhs_input_vec.extent(1) /// -template void backward_sweep_gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -1227,14 +1222,14 @@ void backward_sweep_gauss_seidel_apply( using namespace KokkosSparse::Impl; - GAUSS_SEIDEL_APPLY:: - gauss_seidel_apply(exec_space_in, &tmp_handle, num_rows, num_cols, - const_a_r, const_a_l, const_a_v, nonconst_x_v, - const_y_v, init_zero_x_vector, update_y_vector, omega, - numIter, false, true); + gauss_seidel_apply(space, &tmp_handle, num_rows, num_cols, const_a_r, + const_a_l, const_a_v, nonconst_x_v, const_y_v, + init_zero_x_vector, update_y_vector, omega, numIter, + false, true); } /// From c0f199185e7c172c3841a9bbea77c39caff0bfed Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Tue, 22 Aug 2023 15:08:32 -0600 Subject: [PATCH 26/29] error check to avoid undefined behavior --- sparse/src/KokkosSparse_gauss_seidel_handle.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp index 98624a4137..649229918d 100644 --- a/sparse/src/KokkosSparse_gauss_seidel_handle.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel_handle.hpp @@ -156,6 +156,12 @@ class GaussSeidelHandle { "The type of exec_space_in should be the same as " "GaussSeidelHandle::HandleExecSpace"); this->execution_space = exec_space_in; + } else { + if (exec_space_in != this->execution_space) + throw std::runtime_error( + "Gauss Seidel cannot be called on different execution spaces " + "without multiple handles. Please create a new handle via " + "create_gs_handle.\n"); } is_set = true; } From 492b9aa3a3f534787f7b343e9d382762f9413066 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 23 Aug 2023 07:57:45 -0600 Subject: [PATCH 27/29] Fix intel19 CI failure --- sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index 820d9ae447..a3b4d8ca37 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -740,6 +740,18 @@ void test_gauss_seidel_streams_rank1( lno_t numCols = numRows; typename crsMat_t::value_type m_omega = omega; +#ifdef KOKKOS_ENABLE_OPENMP + if (std::is_same_v) { + int exec_concurrency = execution_space().concurrency(); + if (exec_concurrency < nstreams) { + std::cerr << "TEST SKIPPED: Not enough concurrency to partition " + "execution space. exec_concurrency: " + << exec_concurrency << std::endl; + return; + } + } +#endif // KOKKOS_ENABLE_OPENMP + std::vector instances; if (nstreams == 1) instances = Kokkos::Experimental::partition_space(execution_space(), 1); From 3af7aace1894fe55499e7929749d73468ea3a5df Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Wed, 23 Aug 2023 07:58:19 -0600 Subject: [PATCH 28/29] Add runtime checks to PSGS --- sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 1f386a28a4..45f42083a6 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1349,7 +1349,14 @@ class PointGaussSeidel { if (gsHandle->is_symbolic_called() == false) { this->initialize_symbolic(); } - // else + + // Check settings + if (gsHandle->get_block_size() > 1 && + format != KokkosSparse::SparseMatrixFormat::BSR) + throw std::runtime_error( + "PointGaussSeidel block size > 1 but format is not " + "KokkosSparse::SparseMatrixFormat::BSR.\n"); + // else #ifdef KOKKOSSPARSE_IMPL_TIME_REVERSE Kokkos::Timer timer; #endif @@ -1718,6 +1725,14 @@ class PointGaussSeidel { if (gsHandle->is_numeric_called() == false) { this->initialize_numeric(); } + + // Check settings + if (gsHandle->get_block_size() > 1 && + format != KokkosSparse::SparseMatrixFormat::BSR) + throw std::runtime_error( + "PointGaussSeidel block size > 1 but format is not " + "KokkosSparse::SparseMatrixFormat::BSR.\n"); + // make sure x and y have been allocated with the correct dimensions nnz_lno_t block_size = gsHandle->get_block_size(); gsHandle->allocate_x_y_vectors(this->num_rows * block_size, From d398cd976ec48fc35a3e92a9c7a159c0065d8a92 Mon Sep 17 00:00:00 2001 From: Evan Harvey Date: Thu, 24 Aug 2023 08:04:16 -0600 Subject: [PATCH 29/29] Implement PR feedback Remove a couple fences from the unit tests. --- docs/developer/apidocs/sparse.rst | 14 +++++++------- sparse/impl/KokkosSparse_gauss_seidel_impl.hpp | 2 -- sparse/impl/KokkosSparse_gauss_seidel_spec.hpp | 16 ++++++++-------- sparse/src/KokkosKernels_Handle.hpp | 2 +- sparse/src/KokkosSparse_gauss_seidel.hpp | 12 ++++++------ sparse/unit_test/Test_Sparse_gauss_seidel.hpp | 10 ++-------- 6 files changed, 24 insertions(+), 32 deletions(-) diff --git a/docs/developer/apidocs/sparse.rst b/docs/developer/apidocs/sparse.rst index d35a4eb851..165d8334ae 100644 --- a/docs/developer/apidocs/sparse.rst +++ b/docs/developer/apidocs/sparse.rst @@ -61,19 +61,19 @@ block_spgemm gauss_seidel ------------ .. doxygenfunction:: create_gs_handle(KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) -.. doxygenfunction:: create_gs_handle(HandleExecSpace, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) +.. doxygenfunction:: create_gs_handle(const HandleExecSpace&, int, KokkosSparse::GSAlgorithm gs_algorithm, KokkosGraph::ColoringAlgorithm coloring_algorithm) .. doxygenfunction:: create_gs_handle(KokkosSparse::ClusteringAlgorithm, nnz_lno_t, KokkosGraph::ColoringAlgorithm) -.. doxygenfunction:: gauss_seidel_symbolic(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_symbolic(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, bool is_graph_symmetric) -.. doxygenfunction:: gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, bool is_graph_symmetric) -.. doxygenfunction:: gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) +.. doxygenfunction:: gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) .. doxygenfunction:: gauss_seidel_numeric(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, scalar_nnz_view_t_ given_inverse_diagonal, bool is_graph_symmetric) -.. doxygenfunction:: symmetric_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: symmetric_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: symmetric_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) -.. doxygenfunction:: forward_sweep_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: forward_sweep_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: forward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) -.. doxygenfunction:: backward_sweep_gauss_seidel_apply(ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) +.. doxygenfunction:: backward_sweep_gauss_seidel_apply(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) .. doxygenfunction:: backward_sweep_gauss_seidel_apply(KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, x_scalar_view_t x_lhs_output_vec, y_scalar_view_t y_rhs_input_vec, bool init_zero_x_vector, bool update_y_vector, typename KernelHandle::nnz_scalar_t omega, int numIter) block_gauss_seidel diff --git a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp index 45f42083a6..7391e00e3d 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_impl.hpp @@ -1860,7 +1860,6 @@ class PointGaussSeidel { Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); - my_exec_space.fence(); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( @@ -1946,7 +1945,6 @@ class PointGaussSeidel { Kokkos::subview(gs._Yvector, Kokkos::ALL(), long_row_col); gs._long_row_col = long_row_col; Kokkos::deep_copy(my_exec_space, long_row_x, nnz_scalar_t()); - my_exec_space.fence(); Kokkos::parallel_for( labelLong, Kokkos::Experimental::require( diff --git a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp index 84c9dccf5c..840ced73b8 100644 --- a/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp +++ b/sparse/impl/KokkosSparse_gauss_seidel_spec.hpp @@ -128,7 +128,7 @@ template ::value> struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, bool is_graph_symmetric); @@ -144,13 +144,13 @@ template < KernelHandle, a_size_view_t_, a_lno_view_t, a_scalar_view_t>::value> struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric); static void gauss_seidel_numeric( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, @@ -169,7 +169,7 @@ template ::value> struct GAUSS_SEIDEL_APPLY { static void gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, @@ -187,7 +187,7 @@ struct GAUSS_SEIDEL_SYMBOLIC { static void gauss_seidel_symbolic( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t_ entries, bool is_graph_symmetric) { @@ -224,7 +224,7 @@ struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_numeric( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, bool is_graph_symmetric) { @@ -255,7 +255,7 @@ struct GAUSS_SEIDEL_NUMERIC { static void gauss_seidel_apply( - ExecSpaceIn &exec_space_in, KernelHandle *handle, + const ExecSpaceIn &exec_space_in, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, a_size_view_t_ row_map, a_lno_view_t entries, a_scalar_view_t values, diff --git a/sparse/src/KokkosKernels_Handle.hpp b/sparse/src/KokkosKernels_Handle.hpp index a23826f864..d5a24ac1f1 100644 --- a/sparse/src/KokkosKernels_Handle.hpp +++ b/sparse/src/KokkosKernels_Handle.hpp @@ -630,7 +630,7 @@ class KokkosKernelsHandle { */ // clang-format on void create_gs_handle( - HandleExecSpace handle_exec_space, int num_streams, + const HandleExecSpace &handle_exec_space, int num_streams, KokkosSparse::GSAlgorithm gs_algorithm = KokkosSparse::GS_DEFAULT, KokkosGraph::ColoringAlgorithm coloring_algorithm = KokkosGraph::COLORING_DEFAULT) { diff --git a/sparse/src/KokkosSparse_gauss_seidel.hpp b/sparse/src/KokkosSparse_gauss_seidel.hpp index f67d3bd17b..036fe1b119 100644 --- a/sparse/src/KokkosSparse_gauss_seidel.hpp +++ b/sparse/src/KokkosSparse_gauss_seidel.hpp @@ -46,7 +46,7 @@ namespace Experimental { /// template -void gauss_seidel_symbolic(ExecutionSpace &space, KernelHandle *handle, +void gauss_seidel_symbolic(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -195,7 +195,7 @@ template -void gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, +void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -340,7 +340,7 @@ template -void gauss_seidel_numeric(ExecutionSpace &space, KernelHandle *handle, +void gauss_seidel_numeric(const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, @@ -540,7 +540,7 @@ template void symmetric_gauss_seidel_apply( - ExecutionSpace &space, KernelHandle *handle, + const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -821,7 +821,7 @@ template void forward_sweep_gauss_seidel_apply( - ExecutionSpace &space, KernelHandle *handle, + const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, @@ -1103,7 +1103,7 @@ template void backward_sweep_gauss_seidel_apply( - ExecutionSpace &space, KernelHandle *handle, + const ExecutionSpace &space, KernelHandle *handle, typename KernelHandle::const_nnz_lno_t num_rows, typename KernelHandle::const_nnz_lno_t num_cols, lno_row_view_t_ row_map, lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, diff --git a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp index a3b4d8ca37..35fbcb44a4 100644 --- a/sparse/unit_test/Test_Sparse_gauss_seidel.hpp +++ b/sparse/unit_test/Test_Sparse_gauss_seidel.hpp @@ -808,24 +808,18 @@ void test_gauss_seidel_streams_rank1( int apply_count = 3; // test symmetric, forward, backward //*** Point-coloring version **** for (int apply_type = 0; apply_type < apply_count; ++apply_type) { - Kokkos::Timer timer1; - for (int i = 0; i < nstreams; i++) Kokkos::deep_copy(instances[i], x_vector_v[i], zero); - for (int i = 0; i < nstreams; i++) instances[i].fence(); run_gauss_seidel_streams(instances, kh_v, input_mat_v, x_vector_v, y_vector_v, symmetric, m_omega, apply_type, nstreams); - // double gs = timer1.seconds(); - // KokkosKernels::Impl::print_1Dview(x_vector); for (int i = 0; i < nstreams; i++) { - instances[i].fence(); // Wait for apply to finish updating x_vector KokkosBlas::axpby(instances[i], one, solution_x_v[i], -one, x_vector_v[i]); mag_t result_norm_res = KokkosBlas::nrm2(instances[i], x_vector_v[i]); - std::string info = "on stream_idx: " + std::to_string(i); - EXPECT_LT(result_norm_res, initial_norm_res_v[i]) << info; + EXPECT_LT(result_norm_res, initial_norm_res_v[i]) + << "on stream_idx: " << i; } }