diff --git a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp index 7605f03fa2..0b9afa7796 100644 --- a/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -22,10 +22,11 @@ namespace KokkosSparse { namespace Impl { -template -void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, +void sptrsvcuSPARSE_symbolic(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -61,6 +62,9 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int64_t nnz = static_cast(entries.extent(0)); size_t pBufferSize; void *rm; @@ -98,13 +102,13 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, CUSPARSE_INDEX_BASE_ZERO, cudaValueType)); // Create dummy dense vector B (RHS) - nnz_scalar_view_t b_dummy("b_dummy", nrows); + nnz_scalar_view_t b_dummy(Kokkos::view_alloc(space, "b_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecBDescr_dummy), static_cast(nrows), b_dummy.data(), cudaValueType)); // Create dummy dense vector X (LHS) - nnz_scalar_view_t x_dummy("x_dummy", nrows); + nnz_scalar_view_t x_dummy(Kokkos::view_alloc(space, "x_dummy"), nrows); KOKKOS_CUSPARSE_SAFE_CALL( cusparseCreateDnVec(&(h->vecXDescr_dummy), static_cast(nrows), x_dummy.data(), cudaValueType)); @@ -163,9 +167,12 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, bool is_lower = sptrsv_handle->is_lower_tri(); sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + cusparseStatus_t status; status = cusparseCreateCsrsv2Info(&(h->info)); if (CUSPARSE_STATUS_SUCCESS != status) @@ -178,85 +185,86 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, if (!std::is_same::value) sptrsv_handle->allocate_tmp_int_rowmap(row_map.extent(0)); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr_copy(row_map) - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); if (std::is_same::value) { cusparseDcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (double*)vals, (int*)rm, (int*)ent, h->info, + (double *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseDcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (double*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same::value) { cusparseScsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (float*)vals, (int*)rm, (int*)ent, h->info, + (float *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseScsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (float *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseZcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; - status = cusparseZcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuDoubleComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + status = + cusparseZcsrsv2_analysis(h->handle, h->transpose, nrows, nnz, + h->descr, (cuDoubleComplex *)vals, (int *)rm, + (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; } else if (std::is_same >::value) { cusparseCcsrsv2_bufferSize(h->handle, h->transpose, nrows, nnz, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, - &pBufferSize); + (cuComplex *)vals, (int *)rm, (int *)ent, + h->info, &pBufferSize); // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. cudaError_t my_error; - my_error = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + my_error = cudaMalloc((void **)&(h->pBuffer), pBufferSize); if (cudaSuccess != my_error) std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(my_error) << std::endl; status = cusparseCcsrsv2_analysis( - h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex*)vals, - (int*)rm, (int*)ent, h->info, h->policy, h->pBuffer); + h->handle, h->transpose, nrows, nnz, h->descr, (cuComplex *)vals, + (int *)rm, (int *)ent, h->info, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "analysis status error name " << (status) << std::endl; @@ -281,10 +289,11 @@ void sptrsvcuSPARSE_symbolic(KernelHandle *sptrsv_handle, } template < - typename KernelHandle, typename ain_row_index_view_type, - typename ain_nonzero_index_view_type, typename ain_values_scalar_view_type, - typename b_values_scalar_view_type, typename x_values_scalar_view_type> -void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, + typename ExecutionSpace, typename KernelHandle, + typename ain_row_index_view_type, typename ain_nonzero_index_view_type, + typename ain_values_scalar_view_type, typename b_values_scalar_view_type, + typename x_values_scalar_view_type> +void sptrsvcuSPARSE_solve(ExecutionSpace &space, KernelHandle *sptrsv_handle, typename KernelHandle::nnz_lno_t nrows, ain_row_index_view_type row_map, ain_nonzero_index_view_type entries, @@ -323,6 +332,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + const scalar_type alpha = scalar_type(1.0); cudaDataType cudaValueType = cuda_data_type_from(); @@ -354,18 +366,21 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, if (std::is_same::value) { cusparseStatus_t status; - typename KernelHandle::SPTRSVcuSparseHandleType* h = + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + KOKKOS_CUSPARSE_SAFE_CALL( + cusparseSetStream(h->handle, space.cuda_stream())); + int nnz = entries.extent_int(0); - const int* rm = !std::is_same::value + const int *rm = !std::is_same::value ? sptrsv_handle->get_int_rowmap_ptr() - : (const int*)row_map.data(); - const int* ent = (const int*)entries.data(); - const scalar_type* vals = values.data(); - const scalar_type* bv = rhs.data(); - scalar_type* xv = lhs.data(); + : (const int *)row_map.data(); + const int *ent = (const int *)entries.data(); + const scalar_type *vals = values.data(); + const scalar_type *bv = rhs.data(); + scalar_type *xv = lhs.data(); if (std::is_same::value) { if (h->pBuffer == nullptr) { @@ -373,10 +388,10 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, } const double alpha = double(1); - status = cusparseDcsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (double*)vals, (int*)rm, - (int*)ent, h->info, (double*)bv, - (double*)xv, h->policy, h->pBuffer); + status = cusparseDcsrsv2_solve( + h->handle, h->transpose, nrows, nnz, &alpha, h->descr, (double *)vals, + (int *)rm, (int *)ent, h->info, (double *)bv, (double *)xv, h->policy, + h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -387,9 +402,9 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, const float alpha = float(1); status = cusparseScsrsv2_solve(h->handle, h->transpose, nrows, nnz, - &alpha, h->descr, (float*)vals, (int*)rm, - (int*)ent, h->info, (float*)bv, (float*)xv, - h->policy, h->pBuffer); + &alpha, h->descr, (float *)vals, (int *)rm, + (int *)ent, h->info, (float *)bv, + (float *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -399,8 +414,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseZcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuDoubleComplex*)vals, (int*)rm, (int*)ent, h->info, - (cuDoubleComplex*)bv, (cuDoubleComplex*)xv, h->policy, h->pBuffer); + (cuDoubleComplex *)vals, (int *)rm, (int *)ent, h->info, + (cuDoubleComplex *)bv, (cuDoubleComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -410,8 +425,8 @@ void sptrsvcuSPARSE_solve(KernelHandle *sptrsv_handle, cualpha.y = 0.0; status = cusparseCcsrsv2_solve( h->handle, h->transpose, nrows, nnz, &cualpha, h->descr, - (cuComplex*)vals, (int*)rm, (int*)ent, h->info, (cuComplex*)bv, - (cuComplex*)xv, h->policy, h->pBuffer); + (cuComplex *)vals, (int *)rm, (int *)ent, h->info, (cuComplex *)bv, + (cuComplex *)xv, h->policy, h->pBuffer); if (CUSPARSE_STATUS_SUCCESS != status) std::cout << "solve status error name " << (status) << std::endl; @@ -539,13 +554,13 @@ void sptrsvcuSPARSE_solve_streams( "CUSPARSE requires local ordinals to be integer.\n"); } else { const scalar_type alpha = scalar_type(1.0); - std::vector sptrsv_handle_v(nstreams); - std::vector h_v(nstreams); - std::vector rm_v(nstreams); - std::vector ent_v(nstreams); - std::vector vals_v(nstreams); - std::vector bv_v(nstreams); - std::vector xv_v(nstreams); + std::vector sptrsv_handle_v(nstreams); + std::vector h_v(nstreams); + std::vector rm_v(nstreams); + std::vector ent_v(nstreams); + std::vector vals_v(nstreams); + std::vector bv_v(nstreams); + std::vector xv_v(nstreams); for (int i = 0; i < nstreams; i++) { sptrsv_handle_v[i] = handle_v[i].get_sptrsv_handle(); @@ -560,8 +575,8 @@ void sptrsvcuSPARSE_solve_streams( } rm_v[i] = !std::is_same::value ? sptrsv_handle_v[i]->get_int_rowmap_ptr() - : reinterpret_cast(row_map_v[i].data()); - ent_v[i] = reinterpret_cast(entries_v[i].data()); + : reinterpret_cast(row_map_v[i].data()); + ent_v[i] = reinterpret_cast(entries_v[i].data()); vals_v[i] = values_v[i].data(); bv_v[i] = rhs_v[i].data(); xv_v[i] = lhs_v[i].data(); @@ -573,42 +588,42 @@ void sptrsvcuSPARSE_solve_streams( if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseScsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseZcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else if (std::is_same >::value) { KOKKOS_CUSPARSE_SAFE_CALL(cusparseCcsrsv2_solve( h_v[i]->handle, h_v[i]->transpose, nrows, nnz, - reinterpret_cast(&alpha), h_v[i]->descr, - reinterpret_cast(vals_v[i]), - reinterpret_cast(rm_v[i]), - reinterpret_cast(ent_v[i]), h_v[i]->info, - reinterpret_cast(bv_v[i]), - reinterpret_cast(xv_v[i]), h_v[i]->policy, + reinterpret_cast(&alpha), h_v[i]->descr, + reinterpret_cast(vals_v[i]), + reinterpret_cast(rm_v[i]), + reinterpret_cast(ent_v[i]), h_v[i]->info, + reinterpret_cast(bv_v[i]), + reinterpret_cast(xv_v[i]), h_v[i]->policy, h_v[i]->pBuffer)); } else { throw std::runtime_error("CUSPARSE wrapper error: unsupported type.\n"); diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index b7e7fa1650..c2ae0759f9 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -2891,16 +2891,15 @@ void upper_tri_solve_cg(TriSolveHandle &thandle, const RowMapType row_map, #endif -template -void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void lower_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -2913,8 +2912,10 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); auto nodes_grouped_by_level_host = thandle.get_host_nodes_grouped_by_level(); -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) +#if defined( \ + KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) // Runs on the default stream using namespace KokkosSparse::Experimental; + using execution_space = typename TriSolveHandle::execution_space; using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; @@ -2972,8 +2973,10 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedRPSolverFunctor( @@ -2981,8 +2984,8 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm:: SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; - int team_size = thandle.get_team_size(); + using team_policy_t = Kokkos::TeamPolicy; + int team_size = thandle.get_team_size(); #ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED TriLvlSchedTP1SolverFunctor; + using team_policy_type = Kokkos::TeamPolicy; using supernode_view_type = Kokkos::View; @@ -3070,9 +3081,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; @@ -3109,7 +3123,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, auto Ljj = Kokkos::subview( viewL, range_type(0, nsrow), Kokkos::ALL()); // s-th supernocal column of L - KokkosBlas::gemv("N", one, Ljj, Xj, zero, Y); + KokkosBlas::gemv(space, "N", one, Ljj, Xj, zero, Y); } else { auto Xj = Kokkos::subview( lhs, @@ -3122,7 +3136,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (invert_diagonal) { auto Y = Kokkos::subview( work, range_type(workoffset, workoffset + nscol)); - KokkosBlas::gemv("N", one, Ljj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ljj, Y, zero, Xj); } else { char unit_diag = (unit_diagonal ? 'U' : 'N'); // NOTE: we currently supports only default_layout = @@ -3130,7 +3144,9 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "N", &unit_diag, one, Ljj, Xjj); + KokkosBlas::trsm(space, "L", "L", "N", &unit_diag, one, Ljj, + Xjj); + // TODO: space.fence(); Kokkos::fence(); } // update off-diagonal blocks @@ -3146,7 +3162,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, viewL, range_type(nscol, nsrow), Kokkos::ALL()); // off-diagonal blocks of s-th supernodal // column of L - KokkosBlas::gemv("N", one, Lij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Lij, Xj, zero, Z); } } } @@ -3156,9 +3172,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3169,9 +3188,12 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, supercols, row_map, entries, values, lvl, kernel_type, diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3191,7 +3213,7 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, #endif // initialize input & output vectors - using team_policy_type = Kokkos::TeamPolicy; + using team_policy_type = Kokkos::TeamPolicy; // update with spmv (one or two SpMV) bool transpose_spmv = @@ -3201,36 +3223,45 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // copy lhs corresponding to diagonal blocks to work and zero out in // lhs SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update off-diagonals (potentiall combined with solve with // diagonals) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); // reinitialize workspace SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3263,16 +3294,16 @@ void lower_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, } // end lower_tri_solve -template -void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs) { +template +void upper_tri_solve(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, + LHSType &lhs) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; - + using memory_space = typename ExecutionSpace::memory_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3287,8 +3318,10 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) +#if defined( \ + KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) // Runs on the default stream using namespace KokkosSparse::Experimental; + using execution_space = typename TriSolveHandle::execution_space; using memory_space = typename TriSolveHandle::memory_space; using integer_view_t = typename TriSolveHandle::integer_view_t; using integer_view_host_t = typename TriSolveHandle::integer_view_host_t; @@ -3349,14 +3382,16 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP) { Kokkos::parallel_for( "parfor_fixed_lvl", - Kokkos::RangePolicy(node_count, - node_count + lvl_nodes), + Kokkos::Experimental::require( + Kokkos::RangePolicy(space, node_count, + node_count + lvl_nodes), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedRPSolverFunctor( row_map, entries, values, lhs, rhs, nodes_grouped_by_level)); } else if (thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1) { - typedef Kokkos::TeamPolicy policy_type; + using team_policy_t = Kokkos::TeamPolicy; int team_size = thandle.get_team_size(); @@ -3372,11 +3407,19 @@ void upper_tri_solve(TriSolveHandle &thandle, const RowMapType row_map, node_count); #endif if (team_size == -1) - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, Kokkos::AUTO), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); else - Kokkos::parallel_for("parfor_u_team", - policy_type(lvl_nodes, team_size), tstf); + Kokkos::parallel_for( + "parfor_u_team", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, team_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } // TP2 algorithm has issues with some offset-ordinal combo to be addressed /* @@ -3441,9 +3484,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-2, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } for (size_type league_rank = 0; league_rank < lvl_nodes; league_rank++) { @@ -3484,7 +3530,7 @@ tstf); } // end elseif workoffset, workoffset + nsrow)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } else { // extract part of the solution, corresponding to the diagonal // block @@ -3501,14 +3547,14 @@ tstf); } // end elseif workoffset, workoffset + nscol)); // needed for gemv instead of trmv/trsv - KokkosBlas::gemv("N", one, Ujj, Y, zero, Xj); + KokkosBlas::gemv(space, "N", one, Ujj, Y, zero, Xj); } else { // NOTE: we currently supports only default_layout = // LayoutLeft Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "U", "N", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "U", "N", "N", one, Ujj, Xjj); } // update off-diagonal blocks if (nsrow2 > 0) { @@ -3522,7 +3568,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("N", one, Uij, Xj, zero, Z); + KokkosBlas::gemv(space, "N", one, Uij, Xj, zero, Z); } } } @@ -3532,9 +3578,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } @@ -3546,10 +3595,13 @@ tstf); } // end elseif diag_kernel_type, lhs, work, work_offset, nodes_grouped_by_level, node_count); - using policy_type = Kokkos::TeamPolicy; - Kokkos::parallel_for("parfor_usolve_tran_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_tran_supernode", + Kokkos::Experimental::require( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); } else { // U stored in CSR // launching sparse-triangular solve functor UpperTriSupernodalFunctor; - Kokkos::parallel_for("parfor_usolve_supernode", - policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_functor); + using team_policy_t = Kokkos::TeamPolicy; + Kokkos::parallel_for( + "parfor_usolve_supernode", + Kokkos::Experimental::team_policy_t( + team_policy_t(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_functor); if (diag_kernel_type_host(lvl) == 3) { // using device-level kernels (functor is called to gather the input @@ -3618,7 +3673,7 @@ tstf); } // end elseif workoffset + nscol, workoffset + nscol + nsrow2)); // needed with gemv for update&scatter - KokkosBlas::gemv("T", -one, Uij, Z, one, Xj); + KokkosBlas::gemv(space, "T", -one, Uij, Z, one, Xj); } // "triangular-solve" to compute Xj @@ -3626,13 +3681,13 @@ tstf); } // end elseif auto Ujj = Kokkos::subview(viewU, range_type(0, nscol), Kokkos::ALL()); if (invert_diagonal) { - KokkosBlas::gemv("T", one, Ujj, Xj, zero, Y); + KokkosBlas::gemv(space, "T", one, Ujj, Xj, zero, Y); } else { // NOTE: we currently supports only default_layout = LayoutLeft Kokkos::View Xjj(Xj.data(), nscol, 1); - KokkosBlas::trsm("L", "L", "T", "N", one, Ujj, Xjj); + KokkosBlas::trsm(space, "L", "L", "T", "N", one, Ujj, Xjj); } } if (invert_diagonal) { @@ -3641,9 +3696,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, work_offset_data, lhs, work); - Kokkos::parallel_for("parfor_tri_supernode_spmv", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_tri_supernode_spmv", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } } } @@ -3675,28 +3733,34 @@ tstf); } // end elseif if (!invert_offdiagonal) { // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, lhs, one, work); + KokkosSparse::spmv(space, tran, one, digmat, lhs, one, work); // copy from work to lhs corresponding to diagonal blocks SparseTriSupernodalSpMVFunctor sptrsv_init_functor(-1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } else { // zero out lhs corresponding to diagonal blocks in lhs, and copy to // work SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); } // update with off-diagonals (potentiall combined with diagonal // solves) auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, submat, work, one, lhs); } else { if (!invert_offdiagonal) { // zero out lhs corresponding to diagonal blocks in lhs, and copy to @@ -3704,17 +3768,20 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_init_functor(1, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_init_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_init_functor); // update with off-diagonals auto submat = thandle.get_submatrix(lvl); - KokkosSparse::spmv(tran, one, submat, lhs, one, work); + KokkosSparse::spmv(tspace, 0ran, one, submat, lhs, one, work); // solve with diagonals auto digmat = thandle.get_diagblock(lvl); - KokkosSparse::spmv(tran, one, digmat, work, one, lhs); + KokkosSparse::spmv(space, tran, one, digmat, work, one, lhs); } else { std::cout << " ** invert_offdiag with U in CSR not supported **" << std::endl; @@ -3724,9 +3791,12 @@ tstf); } // end elseif SparseTriSupernodalSpMVFunctor sptrsv_finalize_functor(0, node_count, nodes_grouped_by_level, supercols, supercols, lhs, work); - Kokkos::parallel_for("parfor_lsolve_supernode", - team_policy_type(lvl_nodes, Kokkos::AUTO), - sptrsv_finalize_functor); + Kokkos::parallel_for( + "parfor_lsolve_supernode", + Kokkos::Experimental::require( + team_policy_type(space, lvl_nodes, Kokkos::AUTO), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + sptrsv_finalize_functor); #ifdef profile_supernodal_etree Kokkos::fence(); @@ -3749,23 +3819,22 @@ tstf); } // end elseif double sptrsv_time_seconds = sptrsv_timer.seconds(); std::cout << " + SpTrsv(uppper) time: " << sptrsv_time_seconds << std::endl << std::endl; - std::cout << " + Execution space : " << execution_space::name() + std::cout << " + Execution space : " << ExecutionSpace::name() << std::endl; std::cout << " + Memory space : " << memory_space::name() << std::endl; #endif } // end upper_tri_solve -template -void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - const RHSType &rhs, LHSType &lhs, +template +void tri_solve_chain(ExecutionSpace &space, TriSolveHandle &thandle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, const RHSType &rhs, LHSType &lhs, const bool /*is_lowertri_*/) { #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) cudaProfilerStop(); #endif - typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; @@ -3786,9 +3855,9 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, size_type node_count = 0; // REFACTORED to cleanup; next, need debug and timer routines - using policy_type = Kokkos::TeamPolicy; + using policy_type = Kokkos::TeamPolicy; using large_cutoff_policy_type = - Kokkos::TeamPolicy; + Kokkos::TeamPolicy; /* using TP1Functor = TriLvlSchedTP1SolverFunctor; using LTP1Functor = @@ -3849,14 +3918,17 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_l_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_l_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3868,7 +3940,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, if (team_size_singleblock <= 0) { team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -3891,7 +3963,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -3909,11 +3984,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_l_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? @@ -3939,16 +4018,19 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif if (team_size == -1) { team_size = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended(tstf, Kokkos::ParallelForTag()); } // TODO To use cudagraph here, need to know how many non-unit chains // there are, create a graph for each and launch accordingly size_type lvl_nodes = hnodes_per_level(schain); // lvl == echain???? - Kokkos::parallel_for("parfor_u_team_chain1", - policy_type(lvl_nodes, team_size, vector_size), - tstf); + Kokkos::parallel_for( + "parfor_u_team_chain1", + Kokkos::Experimental::require( + policy_type(space, lvl_nodes, team_size, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); node_count += lvl_nodes; } else { @@ -3964,7 +4046,7 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, // values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), // Kokkos::ParallelForTag()); team_size_singleblock = - policy_type(1, 1, vector_size) + policy_type(space, 1, 1, vector_size) .team_size_recommended( SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, @@ -3987,7 +4069,10 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti", - policy_type(1, team_size_singleblock, vector_size), tstf); + Kokkos::Experimental::require( + policy_type(space, 1, team_size_singleblock, vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), + tstf); } else { // team_size_singleblock < cutoff => kernel must allow for a // block-stride internally @@ -4005,11 +4090,15 @@ void tri_solve_chain(TriSolveHandle &thandle, const RowMapType row_map, #endif Kokkos::parallel_for( "parfor_u_team_chainmulti_cutoff", - large_cutoff_policy_type(1, team_size_singleblock, vector_size), + Kokkos::Experimental::require( + large_cutoff_policy_type(1, team_size_singleblock, + vector_size), + Kokkos::Experimental::WorkItemProperty::HintLightWeight), tstf); } node_count += lvl_nodes; } + // TODO: space.fence() Kokkos::fence(); // TODO - is this necessary? that is, can the // parallel_for launch before the s/echain values have // been updated? diff --git a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index e36b9df236..22f40588f7 100644 --- a/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -96,9 +96,9 @@ template ::value> struct SPTRSV_SOLVE { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x); + static void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x); static void sptrsv_solve_streams( const std::vector &execspace_v, @@ -117,9 +117,9 @@ template { - static void sptrsv_solve(KernelHandle *handle, const RowMapType row_map, - const EntriesType entries, const ValuesType values, - BType b, XType x) { + static void sptrsv_solve(ExecutionSpace, &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries, + const ValuesType values, BType b, XType x) { // Call specific algorithm type auto sptrsv_handle = handle->get_sptrsv_handle(); Kokkos::Profiling::pushRegion(sptrsv_handle->is_lower_tri() @@ -127,40 +127,44 @@ struct SPTRSV_SOLVEis_lower_tri()) { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, true); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, true); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::lower_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::lower_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::lower_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } else { if (sptrsv_handle->is_symbolic_complete() == false) { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, + entries); } if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN) { - Experimental::tri_solve_chain(*sptrsv_handle, row_map, entries, values, - b, x, false); + Experimental::tri_solve_chain(space, *sptrsv_handle, row_map, entries, + values, b, x, false); } else { #ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT using ExecSpace = typename RowMapType::memory_space::execution_space; if (std::is_same::value) + // TODO: set stream in thandle's sptrsvCudaGraph Experimental::upper_tri_solve_cg(*sptrsv_handle, row_map, entries, values, b, x); else #endif - Experimental::upper_tri_solve(*sptrsv_handle, row_map, entries, + Experimental::upper_tri_solve(space, *sptrsv_handle, row_map, entries, values, b, x); } } diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 3ef3be8780..3615092400 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -147,9 +147,10 @@ void symbolic_chain_phase(TriSolveHandle& thandle, #endif } // end symbolic_chain_phase -template -void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, - const EntriesType dentries) { +template +void lower_tri_symbolic(ExecSpaceIn& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_lowertri_total; Kokkos::Timer timer; @@ -177,10 +178,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -193,11 +194,12 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // wait for deep copy write to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -227,9 +229,10 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, // Create the chain now if (thandle.algm_requires_symb_chain()) { + // No need to pass in space, chain phase runs on the host symbolic_chain_phase(thandle, nodes_per_level); } - + // TODO: continue thandle.set_symbolic_complete(); // Output check @@ -257,9 +260,9 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -279,6 +282,7 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", @@ -292,7 +296,8 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, } #endif } -#ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV +#ifdef KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV // Runs on default execution + // space else if (thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_NAIVE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_ETREE || thandle.get_algorithm() == SPTRSVAlgorithm::SUPERNODAL_DAG || @@ -598,9 +603,20 @@ void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif } // end lower_tri_symbolic +#if 0 // TODO: remove template -void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, +void lower_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, const EntriesType dentries) { + using ExecutionSpace = typename RowMapType::execution_space; + auto my_exec_space = ExecutionSpace(); + lower_tri_symbolic(my_exec_space, thandle, drow_map, dentries); +} +#endif + +template +void upper_tri_symbolic(ExecutionSpace& space, TriSolveHandle& thandle, + const RowMapType drow_map, const EntriesType dentries) { #ifdef TRISOLVE_SYMB_TIMERS Kokkos::Timer timer_sym_uppertri_total; Kokkos::Timer timer; @@ -626,10 +642,10 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, size_type nrows = drow_map.extent(0) - 1; auto row_map = Kokkos::create_mirror_view(drow_map); - Kokkos::deep_copy(row_map, drow_map); + Kokkos::deep_copy(space, row_map, drow_map); auto entries = Kokkos::create_mirror_view(dentries); - Kokkos::deep_copy(entries, dentries); + Kokkos::deep_copy(space, entries, dentries); // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); @@ -642,11 +658,12 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); - Kokkos::deep_copy(level_list, dlevel_list); + Kokkos::deep_copy(space, level_list, dlevel_list); signed_integral_t level = 0; size_type node_count = 0; + space.fence(); // Wait for deep copy writes to land typename DeviceEntriesType::HostMirror level_ptr( "lp", nrows + 1); // temp View used for index bookkeeping level_ptr(0) = 0; @@ -708,9 +725,9 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif // Deep copy to device views - Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); - Kokkos::deep_copy(dnodes_per_level, nodes_per_level); - Kokkos::deep_copy(dlevel_list, level_list); + Kokkos::deep_copy(space, dnodes_grouped_by_level, nodes_grouped_by_level); + Kokkos::deep_copy(space, dnodes_per_level, nodes_per_level); + Kokkos::deep_copy(space, dlevel_list, level_list); // Extra check: #ifdef LVL_OUTPUT_INFO @@ -730,6 +747,7 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, check_count); std::cout << " host check_count= " << check_count << std::endl; + space.fence(); // wait for deep copy writes to land check_count = 0; // reset Kokkos::parallel_reduce( "check_count device", @@ -1069,6 +1087,16 @@ void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, #endif } // end upper_tri_symbolic +#if 0 // TODO: remove +template +void upper_tri_symbolic(TriSolveHandle& thandle, const RowMapType drow_map, + const EntriesType dentries) { + using ExecutionSpace = typename RowMapType::execution_space; + auto my_exec_space = ExecutionSpace(); + upper_tri_symbolic(my_exec_space, thandle, drow_map, dentries); +} +#endif + } // namespace Experimental } // namespace Impl } // namespace KokkosSparse diff --git a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index 73389d10d0..ccdd2cfff5 100644 --- a/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -67,33 +67,37 @@ namespace Impl { // Unification layer /// \brief Implementation of KokkosSparse::sptrsv_symbolic -template ::value, bool eti_spec_avail = sptrsv_symbolic_eti_spec_avail< KernelHandle, RowMapType, EntriesType>::value> struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, + static void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries); }; #if !defined(KOKKOSKERNELS_ETI_ONLY) || KOKKOSKERNELS_IMPL_COMPILE_LIBRARY //! Full specialization of sptrsv_symbolic // Unification layer -template -struct SPTRSV_SYMBOLIC { - static void sptrsv_symbolic(KernelHandle *handle, const RowMapType row_map, +template +struct SPTRSV_SYMBOLIC { + static void sptrsv_symbolic(ExecSpaceIn &space, KernelHandle *handle, + const RowMapType row_map, const EntriesType entries) { auto sptrsv_handle = handle->get_sptrsv_handle(); auto nrows = row_map.extent(0) - 1; sptrsv_handle->new_init_handle(nrows); if (sptrsv_handle->is_lower_tri()) { - Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::lower_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } else { - Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); + Experimental::upper_tri_symbolic(space, *sptrsv_handle, row_map, entries); sptrsv_handle->set_symbolic_complete(); } } @@ -113,6 +117,7 @@ struct SPTRSV_SYMBOLIC, \ @@ -130,6 +135,7 @@ struct SPTRSV_SYMBOLIC, \ diff --git a/sparse/src/KokkosSparse_sptrsv.hpp b/sparse/src/KokkosSparse_sptrsv.hpp index 859918c58d..8b63bc194e 100644 --- a/sparse/src/KokkosSparse_sptrsv.hpp +++ b/sparse/src/KokkosSparse_sptrsv.hpp @@ -40,10 +40,22 @@ namespace Experimental { std::is_same::type, \ typename std::remove_const::type>::value -template -void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries) { +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ +template +void sptrsv_symbolic(const ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; @@ -95,7 +107,8 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, KokkosSparse::Impl::SPTRSV_SYMBOLIC< const_handle_type, RowMap_Internal, - Entries_Internal>::sptrsv_symbolic(&tmp_handle, rowmap_i, entries_i); + Entries_Internal>::sptrsv_symbolic(space, &tmp_handle, rowmap_i, + entries_i); #ifdef KK_TRISOLVE_TIMERS std::cout << " > sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -103,10 +116,45 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + */ template void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + lno_nnz_view_t_ entries) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries); +} + +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space type + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ +template +void sptrsv_symbolic(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; @@ -180,10 +228,10 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, KokkosSparse::Impl::sptrsvcuSPARSE_symbolic< sptrsvHandleType, RowMap_Internal, Entries_Internal, Values_Internal>( - sh, nrows, rowmap_i, entries_i, values_i, false); + space, sh, nrows, rowmap_i, entries_i, values_i, false); } else { - KokkosSparse::Experimental::sptrsv_symbolic(handle, rowmap, entries); + KokkosSparse::Experimental::sptrsv_symbolic(space, handle, rowmap, entries); } #ifdef KK_TRISOLVE_TIMERS std::cout << " + sptrsv_symbolic time = " << timer_sptrsv.seconds() @@ -191,12 +239,52 @@ void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, #endif } // sptrsv_symbolic +/** + * @brief sptrsv symbolic phase for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + */ template -void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, - lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, - XType x) { + typename lno_nnz_view_t_, typename scalar_nnz_view_t_> +void sptrsv_symbolic(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_symbolic(my_exec_space, handle, rowmap, entries, values); +} + +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam ExecutionSpace This kernels execution space + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param space The execution space instance this kernel will be run on + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(ExecutionSpace &space, KernelHandle *handle, + lno_row_view_t_ rowmap, lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values, BType b, XType x) { typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; @@ -308,7 +396,7 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, KokkosSparse::Impl::sptrsvcuSPARSE_solve( - sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + space, sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); } else { KokkosSparse::Impl::SPTRSV_SOLVE< @@ -320,7 +408,36 @@ void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, } // sptrsv_solve -#if defined(KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) +/** + * @brief sptrsv solve phase of x for linear system Ax=b + * + * @tparam KernelHandle A specialization of + * KokkosKernels::Experimental::KokkosKernelsHandle + * @tparam lno_row_view_t_ The CRS matrix's (A) rowmap type + * @tparam lno_nnz_view_t_ The CRS matrix's (A) entries type + * @tparam scalar_nnz_view_t_ The CRS matrix's (A) values type + * @tparam BType The b vector type + * @tparam XType The x vector type + * @param handle KernelHandle instance + * @param rowmap The CRS matrix's (A) rowmap + * @param entries The CRS matrix's (A) entries + * @param values The CRS matrix's (A) values + * @param b The b vector + * @param x The x vector + */ +template +void sptrsv_solve(KernelHandle *handle, lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, scalar_nnz_view_t_ values, BType b, + XType x) { + using ExecutionSpace = typename KernelHandle::HandleExecSpace; + auto my_exec_space = ExecutionSpace(); + sptrsv_solve(my_exec_space, handle, rowmap, entries, values, b, x); +} + +#if defined( \ + KOKKOSKERNELS_ENABLE_SUPERNODAL_SPTRSV) // Runs on the default stream // --------------------------------------------------------------------- template void sptrsv_solve(KernelHandle *handle, XType x, XType b) {