diff --git a/example/buildlib/compileKokkosKernelsSimple.sh b/example/buildlib/compileKokkosKernelsSimple.sh index b947e2b3e0..20d0a7aef4 100755 --- a/example/buildlib/compileKokkosKernelsSimple.sh +++ b/example/buildlib/compileKokkosKernelsSimple.sh @@ -4,10 +4,10 @@ KOKKOSKERNELS_LAYOUTS=LayoutLeft #the layout types to instantiate. KOKKOSKERNELS_ORDINALS=int #ordinal types to instantiate KOKKOSKERNELS_OFFSETS=int #offset types to instantiate KOKKOSKERNELS_PATH=../.. #path to kokkos-kernels top directory. -CXX=icpc #${KOKKOS_PATH}/config/nvcc_wrapper #icpc # +CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #icpc # KOKKOSKERNELS_OPTIONS=eti-only #options for kokkoskernels -KOKKOS_DEVICES=OpenMP # other devices Cuda,Serial .. -KOKKOS_ARCHS=KNL +KOKKOS_DEVICES=Cuda # other devices Cuda,Serial .. +KOKKOS_ARCHS=SKX,Volta70 CXXFLAGS="-Wall -pedantic -Werror -O3 -g -Wshadow -Wsign-compare -Wtype-limits -Wuninitialized" ../../scripts/generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-options=${KOKKOSKERNELS_OPTIONS} --cxxflags="${CXXFLAGS}" diff --git a/perf_test/Makefile b/perf_test/Makefile index b219dbc7cd..28e1aa0223 100644 --- a/perf_test/Makefile +++ b/perf_test/Makefile @@ -119,7 +119,7 @@ default: $(TEST_TARGETS) build: $(TEST_TARGETS) %.exe:%.o $(KOKKOS_LINK_DEPENDS) $(KOKKOSKERNELS_LINK_DEPENDS) $(TEST_HEADERS) - $(LINK) $(EXTRA_PATH) $< $(KOKKOSKERNELS_LDFLAGS) $(KOKKOSKERNELS_LIBS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o $@ + $(LINK) $(EXTRA_PATH) $< $(KOKKOSKERNELS_LDFLAGS) $(KOKKOSKERNELS_LIBS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(LDLIB) -o $@ %.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(KOKKOSKERNELS_CPP_DEPENDS) $(TEST_HEADERS) $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOSKERNELS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(INC) $(CXXFLAGS) $(EXTRA_INC) -I. -c $< -o $(notdir $@) diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp index c817a1b1d8..5573d10cfb 100644 --- a/perf_test/sparse/KokkosSparse_spiluk.cpp +++ b/perf_test/sparse/KokkosSparse_spiluk.cpp @@ -90,7 +90,7 @@ using namespace KokkosKernels::Experimental; enum {DEFAULT, CUSPARSE, LVLSCHED_RP, LVLSCHED_TP1/*, LVLSCHED_TP2*/}; template -int test_spiluk_perf(std::vector tests, std::string afilename, int k, int team_size, int vector_length, /*int idx_offset,*/ int loop) { +int test_spiluk_perf(std::vector tests, std::string afilename, int K, int team_size, int vector_length, /*int idx_offset,*/ int loop) { typedef Scalar scalar_t; typedef default_lno_t lno_t; @@ -119,12 +119,12 @@ int test_spiluk_perf(std::vector tests, std::string afilename, int k, int t std::cout << "\n\n" << std::endl; if (!afilename.empty()) { - std::cout << "ILU(k) Begin: Read matrix filename " << afilename << std::endl; + std::cout << "ILU(K) Begin: Read matrix filename " << afilename << std::endl; crsmat_t A = KokkosKernels::Impl::read_kokkos_crst_matrix(afilename.c_str()); //in_matrix graph_t graph = A.graph; // in_graph const size_type nrows = graph.numRows(); const int nnz = A.nnz(); - const typename KernelHandle::const_nnz_lno_t fill_lev = lno_t(k) ; + const typename KernelHandle::const_nnz_lno_t fill_lev = lno_t(K) ; #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE //cuSPARSE requires lno_t = size_type = int. For both, int is always used (if enabled) diff --git a/perf_test/sparse/KokkosSparse_sptrsv.cpp b/perf_test/sparse/KokkosSparse_sptrsv.cpp index 81057409c9..bae058d175 100644 --- a/perf_test/sparse/KokkosSparse_sptrsv.cpp +++ b/perf_test/sparse/KokkosSparse_sptrsv.cpp @@ -64,6 +64,8 @@ #include "KokkosSparse_CrsMatrix.hpp" #include +//#define INTERNAL_CUSPARSE + #if defined(KOKKOSKERNELS_INST_ORDINAL_INT) typedef int default_lno_t; #elif defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T) @@ -86,12 +88,51 @@ using namespace KokkosSparse::Experimental; using namespace KokkosKernels; using namespace KokkosKernels::Experimental; -enum {DEFAULT, CUSPARSE, LVLSCHED_RP, LVLSCHED_TP1/*, LVLSCHED_TP2*/}; +//#define PRINTVIEWSSPTRSVPERF +//#define PRINT_HLEVEL_FREQ_PLOT +//#define PRINT_LEVEL_LIST +enum {DEFAULT, CUSPARSE, LVLSCHED_RP, LVLSCHED_TP1, /*LVLSCHED_TP2,*/ LVLSCHED_TP1CHAIN, CUSPARSE_K}; +#ifdef PRINTVIEWSSPTRSVPERF +template +void print_view1d(const ViewType dv) { + auto v = Kokkos::create_mirror_view(dv); + Kokkos::deep_copy(v,dv); + std::cout << "Output for view " << v.label() << std::endl; + for (size_t i = 0; i < v.extent(0); ++i) { + std::cout << "v(" << i << ") = " << v(i) << " , "; + } + std::cout << std::endl; +} +#else +template +void print_view1d(const ViewType /*dv*/) {} +#endif + +template +void check_entries_sorted(const RowMapType drow_map, const EntriesType dentries) { + auto row_map = Kokkos::create_mirror_view(drow_map); + Kokkos::deep_copy(row_map,drow_map); + auto entries = Kokkos::create_mirror_view(dentries); + Kokkos::deep_copy(entries,dentries); + + for (size_t row = 0; row < row_map.extent(0)-1; ++row) { + size_t start = row_map(row); + size_t end = row_map(row+1); + for (size_t offset = start; offset < end-1; ++offset) { + size_t pcol = entries(offset); + size_t ncol = entries(offset+1); + if (pcol > ncol) { + std::cout << " UNSORTED!!" << std::endl; + } + } + } + +} template -int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string& ufilename, int team_size, int vector_length, int idx_offset, int loop) { +int test_sptrsv_perf(std::vector tests, const std::string& lfilename, const std::string& ufilename, const int team_size, const int vector_length, const int idx_offset, const int loop, const int chain_threshold = 0, const float dense_row_percent = -1.0) { typedef Scalar scalar_t; typedef default_lno_t lno_t; @@ -107,8 +148,8 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - scalar_t ZERO = scalar_t(0); - scalar_t ONE = scalar_t(1); + const scalar_t ZERO = scalar_t(0); + const scalar_t ONE = scalar_t(1); // Read lmtx @@ -144,9 +185,24 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string auto entries = graph.entries; auto values = triMtx.values; + std::cout << "Lower Perf: row_map.extent(0) = " << row_map.extent(0) << std::endl; + std::cout << "Lower Perf: entries.extent(0) = " << entries.extent(0) << std::endl; + std::cout << "Lower Perf: values.extent(0) = " << values.extent(0) << std::endl; + std::cout << "Lower Perf: lhs.extent(0) = " << lhs.extent(0) << std::endl; + std::cout << "Lower Perf: rhs.extent(0) = " << rhs.extent(0) << std::endl; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + check_entries_sorted(row_map, entries); + +#ifdef PRINTVIEWSSPTRSVPERF + print_view1d(row_map); + print_view1d(entries); + print_view1d(values); + print_view1d(known_lhs); + print_view1d(rhs); +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) //std::cout << " cusparse: create handle" << std::endl; cusparseStatus_t status; cusparseHandle_t handle = 0; @@ -198,7 +254,7 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string KernelHandle kh; bool is_lower_tri = true; - std::cout << "Create handle" << std::endl; + std::cout << "Create handle (lower)" << std::endl; switch(test) { case LVLSCHED_RP: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, is_lower_tri); @@ -206,22 +262,44 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string break; case LVLSCHED_TP1: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); + std::cout << "TP1 set team_size = " << team_size << std::endl; + if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size); + kh.get_sptrsv_handle()->print_algorithm(); + break; + case LVLSCHED_TP1CHAIN: + printf("TP1 with CHAIN\n"); + printf("chain_threshold %d\n", chain_threshold); + printf("team_size %d\n", team_size); + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, is_lower_tri); + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size); + if (vector_length != -1) kh.get_sptrsv_handle()->set_vector_size(vector_length); kh.get_sptrsv_handle()->print_algorithm(); break; /* case LVLSCHED_TP2: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHED_TP2, nrows, is_lower_tri); + if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size); + if (vector_length != -1) kh.get_sptrsv_handle()->set_vector_size(vector_length); kh.get_sptrsv_handle()->print_algorithm(); break; */ -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + case CUSPARSE_K: + printf("CUSPARSE WRAPPER\n"); + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, is_lower_tri); + kh.get_sptrsv_handle()->print_algorithm(); + break; case CUSPARSE: +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) std::cout << "CUSPARSE: No kk interface added yet" << std::endl; //cusparse_matvec(A, x, y, rows_per_thread, team_size, vector_length); break; +#else + std::cout << "CUSPARSE not enabled: Fall through to defaults" << std::endl; #endif default: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); + if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size); kh.get_sptrsv_handle()->print_algorithm(); } @@ -230,7 +308,14 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string Kokkos::Timer timer; if (test != CUSPARSE) { timer.reset(); - sptrsv_symbolic( &kh, row_map, entries ); + if (test == CUSPARSE_K) { + printf("cusparsek symbolic\n"); + sptrsv_symbolic( &kh, row_map, entries, values ); + printf(" finished cusparsek symbolic\n"); + } + else { + sptrsv_symbolic( &kh, row_map, entries ); + } std::cout << "LTRI Symbolic Time: " << timer.seconds() << std::endl; //std::cout << "TriSolve Solve" << std::endl; @@ -240,16 +325,9 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string std::cout << "LTRI Solve Time: " << timer.seconds() << std::endl; } -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) // step 4: perform analysis else { -#if 0 - double *dvalues = (double *)(values.data()); - int *drow_map = (int *)(row_map.data()); - int *dentries = (int *)(entries.data()); - double *dlhs = (double *)(lhs.data()); - double *drhs = (double *)(rhs.data()); -#endif //int nnz = triMtx.nnz(); //std::cout << " cusparse path: analysis" << std::endl; //status = cusparseDcsrsv2_analysis(handle, trans, nrows, nnz, descr, (double*)dvalues, (int *)drow_map, (int *)dentries, info, policy, pBuffer); @@ -283,13 +361,32 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string } #endif // Error Check - scalar_t sum = 0.0; Kokkos::fence(); + { + scalar_t sum = 0.0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { - tsum += lhs(i); + tsum += (known_lhs(i) - lhs(i))*(known_lhs(i) - lhs(i)); }, sum); + scalar_t norm_ssd = sqrt(sum / lhs.extent(0)); + std::cout << " ssd = " << sum << " norm_sqrt_ssd = " << norm_ssd << std::endl; + + if ( norm_ssd > 1e-8 ) { + std::cout << "Lower Tri Solve FAILURE: norm_ssd = " << norm_ssd << std::endl; + return 1; + } + else { + std::cout << "\nLower Tri Solve Init Test: SUCCESS!\n" << std::endl; + } + + /* + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), + KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += lhs(i); + }, sum); + if ( sum != lhs.extent(0) ) { std::cout << "Lower Tri Solve FAILURE: sum = " << sum << std::endl; auto hsoln = Kokkos::create_mirror_view(lhs); @@ -300,7 +397,9 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string return 1; } else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; + std::cout << "\nLower Tri Solve Init Test: SUCCESS!\n" << std::endl; + } + */ } @@ -310,13 +409,35 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string double max_time = 0.0; double ave_time = 0.0; - for(int i=0;i(0, lhs.extent(0)), + KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += (known_lhs(i) - lhs(i))*(known_lhs(i) - lhs(i)); + }, sum); + + scalar_t norm_ssd = sqrt(sum / lhs.extent(0)); + std::cout << " ssd = " << sum << " norm_sqrt_ssd = " << norm_ssd << std::endl; + if ( norm_ssd > 1e-8 ) { + std::cout << "Lower Tri Solve FAILURE: norm_ssd = " << norm_ssd << std::endl; + return 1; + } + else { + std::cout << "\nLower Tri Solve Init Test: SUCCESS!\n" << std::endl; + } + } + #endif } -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) else { cusparseDcsrsv2_solve(handle, trans, nrows, triMtx.nnz(), &alpha, descr, values.data(), row_map.data(), entries.data(), info, rhs.data(), lhs.data(), policy, pBuffer); } @@ -328,19 +449,92 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string if(time>max_time) max_time = time; if(timeget_host_nodes_per_level(); + auto nlevels = kh.get_sptrsv_handle()->get_num_levels(); + std::string algmstring = kh.get_sptrsv_handle()->return_algorithm_string(); + std::cout << algmstring << std::endl; + // Create filename + std::string filename = "lower_nodes_per_level_" + algmstring + ".txt"; + std::cout << filename << std::endl; + std::cout << " nlevels = " << nlevels << std::endl; + std::ofstream outfile; + outfile.open(filename); + if (outfile.is_open()) { + for ( int i = 0; i < nlevels; ++i ) { + outfile << hnpl(i) << std::endl; + //std::cout << hnpl(i) << std::endl; + } + outfile.close(); + } + else { + std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl; + } + + auto hngpl = kh.get_sptrsv_handle()->get_host_nodes_grouped_by_level(); + filename = "lower_nodes_groupby_level_" + algmstring + ".txt"; + std::cout << filename << std::endl; + outfile.open(filename); + if (outfile.is_open()) { + for ( size_t i = 0; i < hngpl.extent(0); ++i ) + outfile << hngpl(i) << std::endl; + outfile.close(); + } + else { + std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl; + } + + } + #endif + + #ifdef PRINT_LEVEL_LIST + if (test != CUSPARSE) + { + auto level_list = kh.get_sptrsv_handle()->get_level_list(); + auto hlevel_list = Kokkos::create_mirror_view(level_list); + Kokkos::deep_copy(hlevel_list, level_list); + + auto nlevels = kh.get_sptrsv_handle()->get_num_levels(); + + std::string algmstring = kh.get_sptrsv_handle()->return_algorithm_string(); + std::cout << algmstring << std::endl; + // Create filename + std::string filename = "lower_level_list_" + algmstring + ".txt"; + std::cout << filename << std::endl; + std::cout << " nlevels = " << nlevels << " nodes = " << hlevel_list.extent(0) << std::endl; + std::ofstream outfile; + outfile.open(filename); + if (outfile.is_open()) { + for ( size_t i = 0; i < hlevel_list.extent(0); ++i ) + outfile << hlevel_list(i) << std::endl; + outfile.close(); + } + else { + std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl; + } + } + #endif + + kh.destroy_sptrsv_handle(); } -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) // step 6: free resources cudaFree(pBuffer); cusparseDestroyCsrsv2Info(info); cusparseDestroyMatDescr(descr); cusparseDestroy(handle); #endif - } + } // end lowertri + Kokkos::fence(); std::cout << "\n\n" << std::endl; // UPPERTRI @@ -369,7 +563,24 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string auto entries = graph.entries; auto values = triMtx.values; -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + std::cout << "Upper Perf: row_map.extent(0) = " << row_map.extent(0) << std::endl; + std::cout << "Upper Perf: entries.extent(0) = " << entries.extent(0) << std::endl; + std::cout << "Upper Perf: values.extent(0) = " << values.extent(0) << std::endl; + + std::cout << "Upper Perf: lhs.extent(0) = " << lhs.extent(0) << std::endl; + std::cout << "Upper Perf: rhs.extent(0) = " << rhs.extent(0) << std::endl; + + check_entries_sorted(row_map, entries); + +#ifdef PRINTVIEWSSPTRSVPERF + print_view1d(row_map); + print_view1d(entries); + print_view1d(values); + print_view1d(known_lhs); + print_view1d(rhs); +#endif + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) //std::cout << " cusparse: create handle" << std::endl; cusparseStatus_t status; cusparseHandle_t handle = 0; @@ -421,7 +632,7 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string KernelHandle kh; bool is_lower_tri = false; - std::cout << "Create handle" << std::endl; + std::cout << "Create handle (upper)" << std::endl; switch(test) { case LVLSCHED_RP: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, is_lower_tri); @@ -429,22 +640,39 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string break; case LVLSCHED_TP1: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); + std::cout << "TP1 set team_size = " << team_size << std::endl; + if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size); + kh.get_sptrsv_handle()->print_algorithm(); + break; + case LVLSCHED_TP1CHAIN: + printf("TP1 with CHAIN\n"); + printf("chain_threshold %d\n", chain_threshold); + printf("team_size %d\n", team_size); + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, is_lower_tri); + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size); + if (vector_length != -1) kh.get_sptrsv_handle()->set_vector_size(vector_length); kh.get_sptrsv_handle()->print_algorithm(); break; /* case LVLSCHED_TP2: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHED_TP2, nrows, is_lower_tri); + if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size); + if (vector_length != -1) kh.get_sptrsv_handle()->set_vector_size(vector_length); kh.get_sptrsv_handle()->print_algorithm(); break; */ -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE case CUSPARSE: +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) std::cout << "CUSPARSE: No kk interface added yet" << std::endl; //cusparse_matvec(A, x, y, rows_per_thread, team_size, vector_length); break; +#else + std::cout << "CUSPARSE not enabled: Fall through to defaults" << std::endl; #endif default: kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); + if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size); kh.get_sptrsv_handle()->print_algorithm(); } @@ -463,16 +691,9 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string std::cout << "UTRI Solve Time: " << timer.seconds() << std::endl; } -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) // step 4: perform analysis else { -#if 0 - double *dvalues = (double *)(values.data()); - int *drow_map = (int *)(row_map.data()); - int *dentries = (int *)(entries.data()); - double *dlhs = (double *)(lhs.data()); - double *drhs = (double *)(rhs.data()); -#endif //int nnz = triMtx.nnz(); //std::cout << " cusparse path: analysis" << std::endl; //status = cusparseDcsrsv2_analysis(handle, trans, nrows, nnz, descr, (double*)dvalues, (int *)drow_map, (int *)dentries, info, policy, pBuffer); @@ -506,7 +727,26 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string #endif // Error Check Kokkos::fence(); + { scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), + KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += (known_lhs(i) - lhs(i))*(known_lhs(i) - lhs(i)); + }, sum); + + scalar_t norm_ssd = sqrt(sum / lhs.extent(0)); + std::cout << " ssd = " << sum << " norm_sqrt_ssd = " << norm_ssd << std::endl; + + if ( norm_ssd > 1e-8 ) { + std::cout << "Upper Tri Solve FAILURE: norm_ssd = " << norm_ssd << std::endl; + return 1; + } + else { + std::cout << "\nUpper Tri Solve Init Test: SUCCESS!\n" << std::endl; + } + + /* + sum = 0.0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { tsum += lhs(i); @@ -522,7 +762,9 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string return 1; } else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; + std::cout << "\nUpper Tri Solve Init Test: SUCCESS!\n" << std::endl; + } + */ } // Benchmark @@ -531,13 +773,35 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string double max_time = 0.0; double ave_time = 0.0; - for(int i=0;i(0, lhs.extent(0)), + KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) { + tsum += (known_lhs(i) - lhs(i))*(known_lhs(i) - lhs(i)); + }, sum); + + scalar_t norm_ssd = sqrt(sum / lhs.extent(0)); + std::cout << " ssd = " << sum << " norm_sqrt_ssd = " << norm_ssd << std::endl; + if ( norm_ssd > 1e-8 ) { + std::cout << "Upper Tri Solve FAILURE: norm_ssd = " << norm_ssd << std::endl; + return 1; + } + else { + std::cout << "\nUpper Tri Solve Init Test: SUCCESS!\n" << std::endl; + } + } + #endif } -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) else { cusparseDcsrsv2_solve(handle, trans, nrows, triMtx.nnz(), &alpha, descr, values.data(), row_map.data(), entries.data(), info, rhs.data(), lhs.data(), policy, pBuffer); } @@ -553,15 +817,88 @@ int test_sptrsv_perf(std::vector tests, std::string& lfilename, std::string std::cout << "LOOP_AVG_TIME: " << ave_time/loop << std::endl; std::cout << "LOOP_MAX_TIME: " << max_time << std::endl; std::cout << "LOOP_MIN_TIME: " << min_time << std::endl; + + // Output for level frequency plot + #ifdef PRINT_HLEVEL_FREQ_PLOT + if (test != CUSPARSE) + { + auto hnpl = kh.get_sptrsv_handle()->get_host_nodes_per_level(); + auto nlevels = kh.get_sptrsv_handle()->get_num_levels(); + std::string algmstring = kh.get_sptrsv_handle()->return_algorithm_string(); + std::cout << algmstring << std::endl; + // Create filename + std::string filename = "upper_nodes_per_level_" + algmstring + ".txt"; + std::cout << filename << std::endl; + std::cout << " nlevels = " << nlevels << std::endl; + std::ofstream outfile; + outfile.open(filename); + if (outfile.is_open()) { + for ( int i = 0; i < nlevels; ++i ) { + outfile << hnpl(i) << std::endl; + //std::cout << hnpl(i) << std::endl; + } + outfile.close(); + } + else { + std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl; + } + + auto hngpl = kh.get_sptrsv_handle()->get_host_nodes_grouped_by_level(); + filename = "upper_nodes_groupby_level_" + algmstring + ".txt"; + std::cout << filename << std::endl; + outfile.open(filename); + if (outfile.is_open()) { + for ( size_t i = 0; i < hngpl.extent(0); ++i ) + outfile << hngpl(i) << std::endl; + outfile.close(); + } + else { + std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl; + } + + } + #endif + + #ifdef PRINT_LEVEL_LIST + if (test != CUSPARSE) + { + auto level_list = kh.get_sptrsv_handle()->get_level_list(); + auto hlevel_list = Kokkos::create_mirror_view(level_list); + Kokkos::deep_copy(hlevel_list, level_list); + + auto nlevels = kh.get_sptrsv_handle()->get_num_levels(); + + std::string algmstring = kh.get_sptrsv_handle()->return_algorithm_string(); + std::cout << algmstring << std::endl; + // Create filename + std::string filename = "upper_level_list_" + algmstring + ".txt"; + std::cout << filename << std::endl; + std::cout << " nlevels = " << nlevels << " nodes = " << hlevel_list.extent(0) << std::endl; + std::ofstream outfile; + outfile.open(filename); + if (outfile.is_open()) { + for ( size_t i = 0; i < hlevel_list.extent(0); ++i ) + outfile << hlevel_list(i) << std::endl; + outfile.close(); + } + else { + std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl; + } + } + #endif + + kh.destroy_sptrsv_handle(); } -#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + +#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE) // step 6: free resources cudaFree(pBuffer); cusparseDestroyCsrsv2Info(info); cusparseDestroyMatDescr(descr); cusparseDestroy(handle); #endif - } + } // end uppertri + Kokkos::fence(); return 0; } @@ -571,21 +908,23 @@ void print_help_sptrsv() { printf("Options:\n"); printf(" --test [OPTION] : Use different kernel implementations\n"); printf(" Options:\n"); - printf(" lvlrp, lvltp1, lvltp2\n\n"); + printf(" lvlrp, lvltp1, lvltp2, lvltp1chain, lvldensetp1, lvldensetp2\n\n"); printf(" cusparse (Vendor Libraries)\n\n"); - printf(" -lf [file] : Read in Matrix Market formatted text file 'file'.\n"); - printf(" -uf [file] : Read in Matrix Market formatted text file 'file'.\n"); + printf(" -lf [file] : Read in Matrix Market formatted text file 'file'.\n"); + printf(" -uf [file] : Read in Matrix Market formatted text file 'file'.\n"); + printf(" --offset [O] : Subtract O from every index.\n"); + printf(" Useful in case the matrix market file is not 0 based.\n\n"); + printf(" -ts [T] : Number of threads per team.\n"); + printf(" -vl [V] : Vector-length (i.e. how many Cuda threads are a Kokkos 'thread').\n"); + printf(" -ct [V] : Chain threshold: Only has effect of lvltp1chain algorithm.\n"); + printf(" -dr [V] : Dense row percent (as float): Only has effect of lvldensetp1 algorithm.\n"); + printf(" --loop [LOOP] : How many spmv to run to aggregate average time. \n"); +// printf(" --write-lvl-freq: Write output files with number of nodes per level for each matrix and algorithm.\n"); // printf(" -s [N] : generate a semi-random banded (band size 0.01xN) NxN matrix\n"); // printf(" with average of 10 entries per row.\n"); // printf(" --schedule [SCH]: Set schedule for kk variant (static,dynamic,auto [ default ]).\n"); // printf(" -fb [file] : Read in binary Matrix files 'file'.\n"); // printf(" --write-binary : In combination with -f, generate binary files.\n"); - printf(" --offset [O] : Subtract O from every index.\n"); - printf(" Useful in case the matrix market file is not 0 based.\n\n"); - printf(" -rpt [K] : Number of Rows assigned to a thread.\n"); - printf(" -ts [T] : Number of threads per team.\n"); - printf(" -vl [V] : Vector-length (i.e. how many Cuda threads are a Kokkos 'thread').\n"); - printf(" --loop [LOOP] : How many spmv to run to aggregate average time. \n"); } @@ -600,6 +939,8 @@ int main(int argc, char **argv) int team_size = -1; int idx_offset = 0; int loop = 1; + int chain_threshold = 0; + float dense_row_percent = -1.0; // int schedule=AUTO; if(argc == 1) { @@ -617,20 +958,29 @@ int main(int argc, char **argv) if((strcmp(argv[i],"lvltp1")==0)) { tests.push_back( LVLSCHED_TP1 ); } -/* + if((strcmp(argv[i],"lvltp1chain")==0)) { + tests.push_back( LVLSCHED_TP1CHAIN ); + } + /* if((strcmp(argv[i],"lvltp2")==0)) { tests.push_back( LVLSCHED_TP2 ); } -*/ + */ if((strcmp(argv[i],"cusparse")==0)) { tests.push_back( CUSPARSE ); } + if((strcmp(argv[i],"cusparsek")==0)) { + tests.push_back( CUSPARSE_K ); + } continue; } if((strcmp(argv[i],"-lf")==0)) {lfilename = argv[++i]; continue;} if((strcmp(argv[i],"-uf")==0)) {ufilename = argv[++i]; continue;} if((strcmp(argv[i],"-ts")==0)) {team_size=atoi(argv[++i]); continue;} if((strcmp(argv[i],"-vl")==0)) {vector_length=atoi(argv[++i]); continue;} + if((strcmp(argv[i],"-ct")==0)) {chain_threshold=atoi(argv[++i]); continue;} + if((strcmp(argv[i],"-dr")==0)) {dense_row_percent=atof(argv[++i]); continue;} + if((strcmp(argv[i],"-l")==0)) {loop=atoi(argv[++i]); continue;} if((strcmp(argv[i],"--offset")==0)) {idx_offset=atoi(argv[++i]); continue;} if((strcmp(argv[i],"--loop")==0)) {loop=atoi(argv[++i]); continue;} /* @@ -663,7 +1013,7 @@ int main(int argc, char **argv) Kokkos::initialize(argc,argv); { - int total_errors = test_sptrsv_perf(tests,lfilename,ufilename,team_size,vector_length,idx_offset,loop); + int total_errors = test_sptrsv_perf(tests, lfilename, ufilename, team_size, vector_length, idx_offset, loop, chain_threshold, dense_row_percent); if(total_errors == 0) printf("Kokkos::SPTRSV Test: Passed\n"); @@ -677,6 +1027,7 @@ int main(int argc, char **argv) } #else int main() { + std::cout << "KokkosSparse_sptrsv: This perf_test will do nothing when Cuda is enabled without lambda support." << std::endl; return 0; } #endif diff --git a/scripts/generate_makefile.bash b/scripts/generate_makefile.bash index 20be08abfb..c6937ddf40 100755 --- a/scripts/generate_makefile.bash +++ b/scripts/generate_makefile.bash @@ -81,6 +81,9 @@ do --ldflags*) LDFLAGS="${key#*=}" ;; + --ldlib*) + LDLIB="${key#*=}" + ;; --debug|-dbg) KOKKOS_DEBUG=yes ;; @@ -296,6 +299,11 @@ if [ ${#LDFLAGS} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\"" fi +if [ ${#LDLIB} -gt 0 ]; then + echo "LDLIB: $LDLIB" + KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDLIB=\"${LDLIB}\"" +fi + if [ ${#GTEST_PATH} -gt 0 ]; then KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}" else diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp index 7caf38944b..a33fa00164 100644 --- a/src/common/KokkosKernels_Handle.hpp +++ b/src/common/KokkosKernels_Handle.hpp @@ -558,7 +558,7 @@ class KokkosKernelsHandle this->destroy_sptrsv_handle(); this->is_owner_of_the_sptrsv_handle = true; this->sptrsvHandle = new SPTRSVHandleType(algm, nrows, lower_tri); - this->sptrsvHandle->reset_handle(nrows); +// this->sptrsvHandle->init_handle(nrows); this->sptrsvHandle->set_team_size(this->team_work_size); this->sptrsvHandle->set_vector_size(this->vector_size); } diff --git a/src/sparse/KokkosSparse_spgemm_handle.hpp b/src/sparse/KokkosSparse_spgemm_handle.hpp index d45cc7d214..a20624f185 100644 --- a/src/sparse/KokkosSparse_spgemm_handle.hpp +++ b/src/sparse/KokkosSparse_spgemm_handle.hpp @@ -126,8 +126,6 @@ class SPGEMMHandle{ typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type - - #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE struct cuSparseHandleType{ cusparseHandle_t handle; diff --git a/src/sparse/KokkosSparse_sptrsv.hpp b/src/sparse/KokkosSparse_sptrsv.hpp index 94a86f2f8c..2b2ef260ab 100644 --- a/src/sparse/KokkosSparse_sptrsv.hpp +++ b/src/sparse/KokkosSparse_sptrsv.hpp @@ -58,6 +58,8 @@ #include "KokkosSparse_sptrsv_symbolic_spec.hpp" #include "KokkosSparse_sptrsv_solve_spec.hpp" +#include "KokkosSparse_sptrsv_cuSPARSE_impl.hpp" + namespace KokkosSparse { namespace Experimental { @@ -112,6 +114,82 @@ namespace Experimental { } // sptrsv_symbolic + template + void sptrsv_symbolic( + KernelHandle *handle, + lno_row_view_t_ rowmap, + lno_nnz_view_t_ entries, + scalar_nnz_view_t_ values) + { + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::nnz_lno_t ordinal_type; + typedef typename KernelHandle::nnz_scalar_t scalar_type; + + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename lno_row_view_t_::non_const_value_type, size_type), + "sptrsv_symbolic: A size_type must match KernelHandle size_type (const doesn't matter)"); + + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename lno_nnz_view_t_::non_const_value_type, ordinal_type), + "sptrsv_symbolic: A entry type must match KernelHandle entry type (aka nnz_lno_t, and const doesn't matter)"); + + static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename scalar_nnz_view_t_::value_type, scalar_type), + "sptrsv_symbolic: A scalar type must match KernelHandle entry type (aka nnz_lno_t, and const doesn't matter)"); + + typedef typename KernelHandle::const_size_type c_size_t; + typedef typename KernelHandle::const_nnz_lno_t c_lno_t; + typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t; + + typedef typename KernelHandle::HandleExecSpace c_exec_t; + typedef typename KernelHandle::HandleTempMemorySpace c_temp_t; + typedef typename KernelHandle::HandlePersistentMemorySpace c_persist_t; + + typedef typename KokkosKernels::Experimental::KokkosKernelsHandle const_handle_type; + const_handle_type tmp_handle (*handle); + + typedef Kokkos::View< + typename lno_row_view_t_::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename lno_row_view_t_::device_type, + Kokkos::MemoryTraits > RowMap_Internal; + + typedef Kokkos::View< + typename lno_nnz_view_t_::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename lno_nnz_view_t_::device_type, + Kokkos::MemoryTraits > Entries_Internal; + + typedef Kokkos::View< + typename scalar_nnz_view_t_::const_value_type*, + typename KokkosKernels::Impl::GetUnifiedLayout::array_layout, + typename scalar_nnz_view_t_::device_type, + Kokkos::MemoryTraits > Values_Internal; + + auto sptrsv_handle = handle->get_sptrsv_handle(); + if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + RowMap_Internal rowmap_i = rowmap; + Entries_Internal entries_i = entries; + Values_Internal values_i = values; + + + typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; + sptrsvHandleType *sh = handle->get_sptrsv_handle(); + auto nrows = sh->get_nrows(); + + KokkosSparse::Impl::sptrsvcuSPARSE_symbolic + < sptrsvHandleType, + RowMap_Internal, + Entries_Internal, + Values_Internal > + (sh, nrows, rowmap_i, entries_i, values_i, false); + + } + else { + KokkosSparse::Experimental::sptrsv_symbolic (handle, rowmap, entries); + } + + } // sptrsv_symbolic template ::sptrsv_solve (&tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i); + auto sptrsv_handle = handle->get_sptrsv_handle(); + if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) { + typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType; + sptrsvHandleType *sh = handle->get_sptrsv_handle(); + auto nrows = sh->get_nrows(); + + KokkosSparse::Impl::sptrsvcuSPARSE_solve + < sptrsvHandleType, + RowMap_Internal, + Entries_Internal, + Values_Internal, + BType_Internal, + XType_Internal > + (sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false); + + } + else { + KokkosSparse::Impl::SPTRSV_SOLVE::sptrsv_solve (&tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i); + } } // sptrsv_solve diff --git a/src/sparse/KokkosSparse_sptrsv_handle.hpp b/src/sparse/KokkosSparse_sptrsv_handle.hpp index 005ad8d828..d92456a41e 100644 --- a/src/sparse/KokkosSparse_sptrsv_handle.hpp +++ b/src/sparse/KokkosSparse_sptrsv_handle.hpp @@ -46,14 +46,22 @@ #include #include -#ifndef _SPTRSVHANDLE_HPP -#define _SPTRSVHANDLE_HPP +#ifndef KOKKOSSPARSE_SPTRSVHANDLE_HPP +#define KOKKOSSPARSE_SPTRSVHANDLE_HPP + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#include "cusparse.h" +#endif + +#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH) +#define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT +#endif namespace KokkosSparse { namespace Experimental { -// TP2 algorithm has issues with some offset-ordinal combo to be addressed -enum class SPTRSVAlgorithm { SEQLVLSCHD_RP, SEQLVLSCHD_TP1/*, SEQLVLSCHED_TP2*/ }; +// TODO TP2 algorithm had issues with some offset-ordinal combo to be addressed when compiled in Trilinos... +enum class SPTRSVAlgorithm { SEQLVLSCHD_RP, SEQLVLSCHD_TP1, /*SEQLVLSCHED_TP2,*/ SEQLVLSCHD_TP1CHAIN, SPTRSV_CUSPARSE }; template ::type nnz_lno_t; typedef const nnz_lno_t const_nnz_lno_t; - typedef typename std::remove_const::type nnz_scalar_t; - typedef const nnz_scalar_t const_nnz_scalar_t; + typedef typename std::remove_const::type scalar_t; + typedef const scalar_t const_nnz_scalar_t; + // row_map type (managed memory) typedef typename Kokkos::View nnz_row_view_temp_t; typedef typename Kokkos::View nnz_row_view_t; + typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t; // typedef typename row_lno_persistent_work_view_t::HostMirror row_lno_persistent_work_host_view_t; //Host view type + typedef typename Kokkos::View> nnz_row_unmanaged_view_t; // for rank1 subviews - typedef typename Kokkos::View nnz_scalar_view_temp_t; - typedef typename Kokkos::View nnz_scalar_view_t; - + // values type (managed memory) + typedef typename Kokkos::View nnz_scalar_view_temp_t; + typedef typename Kokkos::View nnz_scalar_view_t; + typedef typename nnz_scalar_view_t::HostMirror host_nnz_scalar_view_t; + typedef typename Kokkos::View> nnz_scalar_unmanaged_view_t; // for rank1 subviews + // entries type (managed memory) typedef typename Kokkos::View nnz_lno_view_temp_t; typedef typename Kokkos::View nnz_lno_view_t; + typedef typename nnz_lno_view_t::HostMirror host_nnz_lno_view_t; + typedef typename Kokkos::View> nnz_lno_unmanaged_view_t; // for rank1 subviews // typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type typedef typename std::make_signed::type signed_integral_t; typedef Kokkos::View< signed_integral_t*, typename nnz_row_view_t::array_layout, typename nnz_row_view_t::device_type, typename nnz_row_view_t::memory_traits > signed_nnz_lno_view_t; + typedef typename signed_nnz_lno_view_t::HostMirror host_signed_nnz_lno_view_t; + + typedef typename Kokkos::View mtx_scalar_view_t; + + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + struct cuSparseHandleType { + cusparseHandle_t handle; + cusparseOperation_t transpose; + csrsv2Info_t info {0}; + cusparseMatDescr_t descr; + cusparseSolvePolicy_t policy; + void *pBuffer {nullptr}; + + cuSparseHandleType(bool transpose_, bool is_lower){ + cusparseStatus_t status; + status= cusparseCreate(&handle); + if (status != CUSPARSE_STATUS_SUCCESS) { + throw std::runtime_error ("cusparseCreate ERROR\n"); + } + cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST); + + if (transpose_){ + transpose = CUSPARSE_OPERATION_TRANSPOSE; + } + else { + transpose = CUSPARSE_OPERATION_NON_TRANSPOSE; + } + + status = cusparseCreateMatDescr(&descr); + if (status != CUSPARSE_STATUS_SUCCESS) { + throw std::runtime_error ("cusparseCreateMatDescr descr ERROR\n"); + } + cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO); + + if (is_lower) + cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER); + else + cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_UPPER); + + cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_NON_UNIT); + + policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL; + } + + ~cuSparseHandleType() { + if (pBuffer != nullptr) { + cudaFree(pBuffer); + pBuffer = nullptr; + } + cusparseDestroyMatDescr(descr); + cusparseDestroy(handle); + } + }; + typedef cuSparseHandleType SPTRSVcuSparseHandleType; +#endif + + +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + bool cudagraphCreated; // Move this later + struct cudaGraphWrapperType { + cudaGraph_t cudagraph; + cudaGraphExec_t cudagraphinstance; + cudaStream_t stream; + + //cudaGraphWrapperType() { } + //~cudaGraphWrapperType() { } + }; + + typedef cudaGraphWrapperType SPTRSVcudaGraphWrapperType; + + void create_SPTRSVcudaGraphWrapperType() { + destroy_SPTRSVcudaGraphWrapperType(); + sptrsvCudaGraph = new SPTRSVcudaGraphWrapperType; + cudaStreamCreate(&sptrsvCudaGraph->stream); + } + + void destroy_SPTRSVcudaGraphWrapperType() { + if(sptrsvCudaGraph != nullptr) { + //cudaGraphExecDestroy(sptrsvCudaGraph->cudagraphinstance); + //cudaGraphDestroy(sptrsvCudaGraph->cudagraph); + cudaStreamDestroy(sptrsvCudaGraph->stream); + delete sptrsvCudaGraph; + sptrsvCudaGraph = nullptr; + } + } + + SPTRSVcudaGraphWrapperType* get_sptrsvCudaGraph() { + return sptrsvCudaGraph; + } +#endif private: - signed_nnz_lno_view_t level_list; - nnz_lno_view_t nodes_per_level; - nnz_lno_view_t nodes_grouped_by_level; +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + SPTRSVcudaGraphWrapperType *sptrsvCudaGraph; +#endif size_type nrows; - size_type nlevel; bool lower_tri; - bool symbolic_complete; - SPTRSVAlgorithm algm; + // Symbolic: Level scheduling data + signed_nnz_lno_view_t level_list; + nnz_lno_view_t nodes_per_level; + host_nnz_lno_view_t hnodes_per_level; // NEW + nnz_lno_view_t nodes_grouped_by_level; + host_nnz_lno_view_t hnodes_grouped_by_level; // NEW + size_type nlevel; + int team_size; int vector_size; + bool stored_diagonal; + nnz_lno_view_t diagonal_offsets; + nnz_scalar_view_t diagonal_values; // inserted by rowid + + host_nnz_lno_view_t hdiagonal_offsets; + host_nnz_scalar_view_t hdiagonal_values; // inserted by rowid + + // Symbolic: Single-block chain data + host_signed_nnz_lno_view_t h_chain_ptr; + size_type num_chain_entries; + signed_integral_t chain_threshold; + + bool symbolic_complete; + bool require_symbolic_lvlsched_phase; + bool require_symbolic_chain_phase; + + void set_if_algm_require_symb_lvlsched () { + if (algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP + || algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1 + /*|| algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2*/ + || algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN + ) + { + require_symbolic_lvlsched_phase = true; + } + else { + require_symbolic_lvlsched_phase = false; + } + } + + void set_if_algm_require_symb_chain () { + if (algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN + ) + { + require_symbolic_chain_phase = true; + } + else { + require_symbolic_chain_phase = false; + } + } + + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + SPTRSVcuSparseHandleType *cuSPARSEHandle; +#endif public: - SPTRSVHandle ( SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, bool symbolic_complete_ = false ) : + SPTRSVHandle(SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, bool symbolic_complete_ = false) : +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + cudagraphCreated(false), + sptrsvCudaGraph(nullptr), +#endif + nrows(nrows_), + lower_tri(lower_tri_), + algm(choice), level_list(), nodes_per_level(), + hnodes_per_level(), nodes_grouped_by_level(), - nrows(nrows_), + hnodes_grouped_by_level(), nlevel(0), - lower_tri( lower_tri_ ), - symbolic_complete( symbolic_complete_ ), - algm(choice), team_size(-1), - vector_size(-1) - {} - -#if 0 - SPTRSVHandle ( SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, bool symbolic_complete_ = false ) : - level_list( Kokkos::ViewAllocateWithoutInitializing("level_list"), nrows), - nodes_per_level("nodes_per_level", nrows), - nodes_grouped_by_level("nodes_grouped_by_level", nrows), - nrows(nrows_), - nlevel(0), - lower_tri( lower_tri_ ), - symbolic_complete( symbolic_complete_ ), - algm(choice) + vector_size(-1), + stored_diagonal(false), + diagonal_offsets(), + diagonal_values(), // inserted by rowid + hdiagonal_offsets(), + hdiagonal_values(), + h_chain_ptr(), + num_chain_entries(0), + chain_threshold(-1), + symbolic_complete(symbolic_complete_), + require_symbolic_lvlsched_phase(false), + require_symbolic_chain_phase(false) +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + ,cuSPARSEHandle(nullptr) +#endif { - // WithoutInitializing - Kokkos::deep_copy( level_list, signed_integral_t(-1) ); + this->set_if_algm_require_symb_lvlsched(); + this->set_if_algm_require_symb_chain(); } -/* - template - SPTRSVHandle ( SPTRSVHandle< rhslno_row_view_t_, rhslno_nnz_view_t_, rhsscalar_nnz_view_t_, rhsExecutionSpace, rhsMemorySpace > & rhs ) { - - this->level_list = rhs.level_list; - this->nodes_per_level = rhs.nodes_per_level; - this->nodes_grouped_by_level = rhs.nodes_grouped_by_level; - this->nrows = rhs.nrows; - this->nlevel = rhs.nlevel; - this->lower_tri = rhs.lower_tri; - this->symbolic_complete = rhs.symbolic_complete; - this->algm = rhs.algm; - } + // Requires nrows_ input + // Allocates all views + void new_init_handle(const size_type nrows_) { + //set_nrows(nrows_); + nrows = nrows_; + // Assumed that level scheduling occurs during symbolic phase for all algorithms, for now + + // TODO: Set sizes differently/smaller, resize during symbolic to save space + if ( this->require_symbolic_lvlsched_phase == true ) + { + set_num_levels(0); + level_list = signed_nnz_lno_view_t(Kokkos::ViewAllocateWithoutInitializing("level_list"), nrows_); + Kokkos::deep_copy( level_list, signed_integral_t(-1) ); + nodes_per_level = nnz_lno_view_t("nodes_per_level", nrows_); + hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); + nodes_grouped_by_level = nnz_lno_view_t("nodes_grouped_by_level", nrows_); + hnodes_grouped_by_level = Kokkos::create_mirror_view(nodes_grouped_by_level); + +#if 0 + std::cout << " newinit_handle: level schedule allocs" << std::endl; + std::cout << " ll.extent = " << level_list.extent(0) << std::endl; + std::cout << " npl.extent = " << nodes_per_level.extent(0) << std::endl; + std::cout << " hnpl.extent = " << hnodes_per_level.extent(0) << std::endl; + std::cout << " ngbl.extent = " << nodes_grouped_by_level.extent(0) << std::endl; + std::cout << " hngbl.extent = " << hnodes_grouped_by_level.extent(0) << std::endl; +#endif + } + + if (stored_diagonal) { + diagonal_offsets = nnz_lno_view_t(Kokkos::ViewAllocateWithoutInitializing("diagonal_offsets"), nrows_); + diagonal_values = nnz_scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("diagonal_values"), nrows_); // inserted by rowid + hdiagonal_offsets = Kokkos::create_mirror_view(diagonal_offsets); + hdiagonal_values = Kokkos::create_mirror_view(diagonal_values); + } + + if (this->require_symbolic_chain_phase == true) + { + if (this->chain_threshold == -1) { + // Need default if chain_threshold not set + // 0 means every level, regardless of number of nodes, is launched within a kernel + if (team_size == -1) { + this->chain_threshold = 0; + h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows); + } + else { + std::cout << " Warning: chain_threshold was not set - will default to team_size = " << this->team_size << " chain_threshold = " << this->chain_threshold << std::endl; + this->chain_threshold = this->team_size; + h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows); + } + } + else { + if (this->team_size >= this->chain_threshold) { + h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows); + } + else if (this->team_size == -1 && chain_threshold > 0) { + std::cout << " Warning: team_size was not set; chain_threshold = " << this->chain_threshold << std::endl; + std::cout << " Automatically setting team_size to chain_threshold - if this exceeds the hardware limitations relaunch with reduced chain_threshold or set a valid team_size" << std::endl; + this->team_size = this->chain_threshold; + h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows); + } + else { + std::cout << " EXPERIMENTAL: team_size less than chain size. team_size = " << this->team_size << " chain_threshold = " << this->chain_threshold << std::endl; + h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows); + } + } + } + else { + h_chain_ptr = host_signed_nnz_lno_view_t(); + this->chain_threshold = -1; + } - template - SPTRSVHandle & operator= ( SPTRSVHandle< rhslno_row_view_t_, rhslno_nnz_view_t_, rhsscalar_nnz_view_t_, rhsExecutionSpace, rhsMemorySpace > & rhs ) { - - this->level_list = rhs.level_list; - this->nodes_per_level = rhs.nodes_per_level; - this->nodes_grouped_by_level = rhs.nodes_grouped_by_level; - this->nrows = rhs.nrows; - this->nlevel = rhs.nlevel; - this->lower_tri = rhs.lower_tri; - this->symbolic_complete = rhs.symbolic_complete; - this->algm = rhs.algm; - return *this; +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + create_SPTRSVcudaGraphWrapperType(); +#endif + + set_num_chain_entries(0); + set_symbolic_incomplete(); } -*/ + virtual ~SPTRSVHandle() { +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + destroy_SPTRSVcudaGraphWrapperType(); #endif - void reset_handle( const size_type nrows_ ) { - set_nrows(nrows_); - set_num_levels(0); - level_list = signed_nnz_lno_view_t( Kokkos::ViewAllocateWithoutInitializing("level_list"), nrows_), - Kokkos::deep_copy( level_list, signed_integral_t(-1) ); - nodes_per_level = nnz_lno_view_t("nodes_per_level", nrows_), - nodes_grouped_by_level = nnz_lno_view_t("nodes_grouped_by_level", nrows_), - reset_symbolic_complete(); +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + this->destroy_cuSPARSE_Handle(); +#endif + }; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + void create_cuSPARSE_Handle(bool transpose, bool is_lower){ + this->destroy_cuSPARSE_Handle(); + this->cuSPARSEHandle = new cuSparseHandleType(transpose, is_lower); + } + void destroy_cuSPARSE_Handle(){ + if (this->cuSPARSEHandle != nullptr){ + delete this->cuSPARSEHandle; + this->cuSPARSEHandle = nullptr; + } } - virtual ~SPTRSVHandle() {}; + SPTRSVcuSparseHandleType *get_cuSparseHandle(){ + return this->cuSPARSEHandle; + } +#endif + + + bool algm_requires_symb_lvlsched() const { return require_symbolic_lvlsched_phase; } + bool algm_requires_symb_chain() const { return require_symbolic_chain_phase; } - void set_algorithm(SPTRSVAlgorithm choice) { algm = choice; } + // Can change the algorithm to a "Compatible algorithms" - for ease in some testing cases + void set_algorithm(SPTRSVAlgorithm choice) { + if (algm != choice) { + algm = choice; + } + } + KOKKOS_INLINE_FUNCTION SPTRSVAlgorithm get_algorithm() { return algm; } KOKKOS_INLINE_FUNCTION signed_nnz_lno_view_t get_level_list() const { return level_list; } + inline + host_signed_nnz_lno_view_t get_host_level_list() const { + auto hlevel_list = Kokkos::create_mirror_view(this->level_list); + Kokkos::deep_copy(hlevel_list, this->level_list); + return hlevel_list; + } + + void set_stored_diagonal(const bool stored_diagonal_) { + stored_diagonal = stored_diagonal_; + } + + KOKKOS_INLINE_FUNCTION + nnz_lno_view_t get_diagonal_offsets() const { return diagonal_offsets; } + + KOKKOS_INLINE_FUNCTION + nnz_scalar_view_t get_diagonal_values() const { return diagonal_values; } + + KOKKOS_INLINE_FUNCTION + host_nnz_lno_view_t get_host_diagonal_offsets() const { return hdiagonal_offsets; } + + KOKKOS_INLINE_FUNCTION + host_nnz_scalar_view_t get_host_diagonal_values() const { return hdiagonal_values; } + + inline + host_signed_nnz_lno_view_t get_host_chain_ptr() const { return h_chain_ptr; } + KOKKOS_INLINE_FUNCTION nnz_lno_view_t get_nodes_per_level() const { return nodes_per_level; } + inline + host_nnz_lno_view_t get_host_nodes_per_level() const { + return hnodes_per_level; + } + KOKKOS_INLINE_FUNCTION nnz_lno_view_t get_nodes_grouped_by_level() const { return nodes_grouped_by_level; } + inline + host_nnz_lno_view_t get_host_nodes_grouped_by_level() const { return hnodes_grouped_by_level; } + KOKKOS_INLINE_FUNCTION size_type get_nrows() const { return nrows; } + void set_nrows(const size_type nrows_) { this->nrows = nrows_; } + + + void reset_chain_threshold(const signed_integral_t threshold) { + if (threshold != this->chain_threshold || h_chain_ptr.span() == 0) { + this->chain_threshold = threshold; + if (this->team_size >= this->chain_threshold) { + // h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows); + } + else if (this->team_size == -1 && chain_threshold > 0) { + //std::cout << " Warning: team_size was not set team_size = " << this->team_size << " chain_threshold = " << this->chain_threshold << std::endl; + //std::cout << " Automatically setting team_size to chain_threshold - if this exceeds the hardware limitation a runtime error will occur during kernel launch - reduce chain_threshold in that case" << std::endl; + this->team_size = this->chain_threshold; + // h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows); + } + else { + std::cout << " EXPERIMENTAL: team_size < chain_size: team_size = " << this->team_size << " chain_threshold = " << this->chain_threshold << std::endl; + } + } + } KOKKOS_INLINE_FUNCTION - void set_nrows(const size_type nrows_) { this->nrows = nrows_; } + signed_integral_t get_chain_threshold () const { return this->chain_threshold; } + bool is_lower_tri() const { return lower_tri; } bool is_upper_tri() const { return !lower_tri; } bool is_symbolic_complete() const { return symbolic_complete; } + bool is_stored_diagonal() const { return stored_diagonal; } + + KOKKOS_INLINE_FUNCTION size_type get_num_levels() const { return nlevel; } + void set_num_levels(size_type nlevels_) { this->nlevel = nlevels_; } void set_symbolic_complete() { this->symbolic_complete = true; } - void reset_symbolic_complete() { this->symbolic_complete = false; } + void set_symbolic_incomplete() { this->symbolic_complete = false; } - void set_team_size(const int ts) {this->team_size = ts;} + KOKKOS_INLINE_FUNCTION int get_team_size() const {return this->team_size;} + // Called by user at setup - should only set a value, no alloc + void set_team_size(const int ts) {this->team_size = ts;} - void set_vector_size(const int vs) {this->vector_size = vs;} + KOKKOS_INLINE_FUNCTION int get_vector_size() const {return this->vector_size;} + // Called by user at setup - should only set a value, no alloc + void set_vector_size(const int vs) {this->vector_size = vs;} + + KOKKOS_INLINE_FUNCTION + int get_num_chain_entries() const {return this->num_chain_entries;} + void set_num_chain_entries(const int nce) {this->num_chain_entries = nce;} void print_algorithm() { if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_RP ) @@ -239,20 +536,49 @@ class SPTRSVHandle { if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1 ) std::cout << "SEQLVLSCHD_TP1" << std::endl;; - - /* +/* if ( algm == SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { std::cout << "SEQLVLSCHED_TP2" << std::endl;; std::cout << "WARNING: With CUDA this is currently only reliable with int-int ordinal-offset pair" << std::endl; } - */ +*/ + if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN ) + std::cout << "SEQLVLSCHD_TP1CHAIN" << std::endl;; + + if ( algm == SPTRSVAlgorithm::SPTRSV_CUSPARSE ) + std::cout << "SPTRSV_CUSPARSE" << std::endl;; + } + + + std::string return_algorithm_string() { + std::string ret_string; + + if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_RP ) + ret_string = "SEQLVLSCHD_RP"; + + if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1 ) + ret_string = "SEQLVLSCHD_TP1"; +/* + if ( algm == SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) + ret_string = "SEQLVLSCHED_TP2"; +*/ + if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN ) + ret_string = "SEQLVLSCHD_TP1CHAIN"; + + if ( algm == SPTRSVAlgorithm::SPTRSV_CUSPARSE ) + ret_string = "SPTRSV_CUSPARSE"; + + return ret_string; } + inline SPTRSVAlgorithm StringToSPTRSVAlgorithm(std::string & name) { - if(name=="SPTRSV_DEFAULT") return SPTRSVAlgorithm::SEQLVLSCHD_RP; - else if(name=="SPTRSV_RANGEPOLICY") return SPTRSVAlgorithm::SEQLVLSCHD_RP; - else if(name=="SPTRSV_TEAMPOLICY1") return SPTRSVAlgorithm::SEQLVLSCHD_TP1; - /*else if(name=="SPTRSV_TEAMPOLICY2") return SPTRSVAlgorithm::SEQLVLSCHED_TP2;*/ + if(name=="SPTRSV_DEFAULT") return SPTRSVAlgorithm::SEQLVLSCHD_RP; + else if(name=="SPTRSV_RANGEPOLICY") return SPTRSVAlgorithm::SEQLVLSCHD_RP; + else if(name=="SPTRSV_TEAMPOLICY1") return SPTRSVAlgorithm::SEQLVLSCHD_TP1; + /*else if(name=="SPTRSV_TEAMPOLICY2") return SPTRSVAlgorithm::SEQLVLSCHED_TP2;*/ + else if(name=="SPTRSV_TEAMPOLICY1CHAIN") return SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN; + else if(name=="SPTRSV_CUSPARSE") return SPTRSVAlgorithm::SPTRSV_CUSPARSE; else throw std::runtime_error("Invalid SPTRSVAlgorithm name"); } diff --git a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp new file mode 100644 index 0000000000..ef92b493b5 --- /dev/null +++ b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp @@ -0,0 +1,296 @@ +/* +//@HEADER +// ************************************************************************ +// +// KokkosKernels 0.9: Linear Algebra and Graph Kernels +// Copyright 2017 Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef _KOKKOSSPTRSVCUSPARSE_HPP +#define _KOKKOSSPTRSVCUSPARSE_HPP + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#include "cusparse.h" +#endif +namespace KokkosSparse{ +namespace Impl{ + + template + void sptrsvcuSPARSE_symbolic( + KernelHandle *sptrsv_handle, + typename KernelHandle::nnz_lno_t nrows, + ain_row_index_view_type row_map, + ain_nonzero_index_view_type entries, + ain_values_scalar_view_type values, + bool trans + ) + { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + typedef typename KernelHandle::nnz_lno_t idx_type; + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::scalar_t scalar_type; + + if (std::is_same::value) { + bool is_lower = sptrsv_handle->is_lower_tri(); + sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower); + + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + + cusparseStatus_t status; + status = cusparseCreateCsrsv2Info(&(h->info)); + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "csrsv2info create status error name " << (status) << std::endl; + + // query how much memory used in csrsv2, and allocate the buffer + int nnz = entries.extent_int(0); + int pBufferSize; + + const scalar_type* vals = values.data(); + const size_type* rm = row_map.data(); + const idx_type* ent = entries.data(); + + if (std::is_same::value) { + cusparseDcsrsv2_bufferSize( + h->handle, + h->transpose, + nrows, + nnz, + h->descr, + (double*)vals, + (int*)rm, + (int*)ent, + h->info, + &pBufferSize); + + + // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. + cudaError_t error_t; + error_t = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + + if (cudaSuccess != error_t) + std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(error_t) << std::endl; + + status = cusparseDcsrsv2_analysis( + h->handle, + h->transpose, + nrows, + nnz, + h->descr, + (double*)vals, + (int*)rm, + (int*)ent, + h->info, + h->policy, + h->pBuffer); + + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "analysis status error name " << (status) << std::endl; + } + else if (std::is_same>::value) { + cusparseZcsrsv2_bufferSize( + h->handle, + h->transpose, + nrows, + nnz, + h->descr, + (cuDoubleComplex*)vals, + (int*)rm, + (int*)ent, + h->info, + &pBufferSize); + + // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes. + cudaError_t error_t; + error_t = cudaMalloc((void**)&(h->pBuffer), pBufferSize); + + if (cudaSuccess != error_t) + std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(error_t) << std::endl; + + status = cusparseZcsrsv2_analysis( + h->handle, + h->transpose, + nrows, + nnz, + h->descr, + (cuDoubleComplex*)vals, + (int*)rm, + (int*)ent, + h->info, + h->policy, + h->pBuffer); + + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "analysis status error name " << (status) << std::endl; + } + else { + throw std::runtime_error ("This CUSPARSE wrapper currently only supports double and complex.\n"); + } + } + else { + throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n"); + } +#else + (void)sptrsv_handle; + (void)nrows; + (void)row_map; + (void)entries; + (void)values; + (void)trans; + throw std::runtime_error ("CUSPARSE IS NOT DEFINED\n"); + //return; +#endif + + } + + + template + void sptrsvcuSPARSE_solve( + KernelHandle *sptrsv_handle, + typename KernelHandle::nnz_lno_t nrows, + ain_row_index_view_type row_map, + ain_nonzero_index_view_type entries, + ain_values_scalar_view_type values, + b_values_scalar_view_type rhs, + x_values_scalar_view_type lhs, + bool trans + ) + { + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + typedef typename KernelHandle::nnz_lno_t idx_type; + typedef typename KernelHandle::size_type size_type; + typedef typename KernelHandle::scalar_t scalar_type; + + if (std::is_same::value) { + bool is_lower = sptrsv_handle->is_lower_tri(); + + cusparseStatus_t status; + + typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle(); + + int nnz = entries.extent_int(0); + + const scalar_type* vals = values.data(); + const size_type* rm = row_map.data(); + const idx_type* ent = entries.data(); + const scalar_type* bv = rhs.data(); + scalar_type* xv = lhs.data(); + + const scalar_type alpha = scalar_type(1); + + if (std::is_same::value) { + + if (h->pBuffer == nullptr) { std::cout << " pBuffer invalid" << std::endl; } + + status = cusparseDcsrsv2_solve( + h->handle, + h->transpose, + nrows, + nnz, + &alpha, + h->descr, + (double*)vals, + (int*)rm, + (int*)ent, + h->info, + (double*)bv, + (double*)xv, + h->policy, + h->pBuffer); + + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "solve status error name " << (status) << std::endl; + } + else if (std::is_same>::value) { + cuDoubleComplex cualpha; + cualpha.x = 1.0; + cualpha.y = 0.0; + status = cusparseZcsrsv2_solve( + h->handle, + h->transpose, + nrows, + nnz, + &cualpha, + h->descr, + (cuDoubleComplex*)vals, + (int*)rm, + (int*)ent, + h->info, + (cuDoubleComplex*)bv, + (cuDoubleComplex*)xv, + h->policy, + h->pBuffer); + + if (CUSPARSE_STATUS_SUCCESS != status) + std::cout << "solve status error name " << (status) << std::endl; + } + else { + throw std::runtime_error ("This CUSPARSE wrapper currently only supports double and complex.\n"); + } + + } + else { + throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n"); + } +#else + (void)sptrsv_handle; + (void)nrows; + (void)row_map; + (void)entries; + (void)values; + (void)rhs; + (void)lhs; + (void)trans; + throw std::runtime_error ("CUSPARSE IS NOT DEFINED\n"); +#endif + + } + +} +} + +#endif diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp index 417acc223a..293a8d33ec 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp @@ -50,16 +50,301 @@ #include #include #include +#include +#include -//#define LVL_OUTPUT_INFO +#include +#include + +//#define TRISOLVE_TIMERS +//#define SERIAL_FOR_LOOP + +#define KOKKOSKERNELS_SPTRSV_TRILVLSCHED + +//#define KOKKOSPSTRSV_SOLVE_IMPL_PROFILE 1 +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) +#include "cuda_profiler_api.h" +#endif namespace KokkosSparse { namespace Impl { namespace Experimental { +#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH) + #define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT +#endif struct UnsortedTag {}; +struct LargerCutoffTag {}; + +struct UnsortedLargerCutoffTag {}; + +template +void print_view1d_solve(const ViewType dv, size_t range = 0) { + auto v = Kokkos::create_mirror_view(dv); + Kokkos::deep_copy(v, dv); + std::cout << "Output for view " << v.label() << std::endl; + range = range == 0 ? dv.extent(0) : range; + for (size_t i = 0; i < range; ++i) { + std::cout << "v(" << i << ") = " << v(i) << " , "; + } + std::cout << std::endl; +} + +// Needed for cudagraphs +struct EmptyFunctor { + KOKKOS_INLINE_FUNCTION + void operator()(const int) const {} +}; + + +// This functor unifies the lower and upper implementations, the hope is the "is_lowertri" check does not add noticable time on larger problems +template +struct TriLvlSchedTP1SolverFunctor +{ + typedef typename RowMapType::execution_space execution_space; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type member_type; + typedef typename EntriesType::non_const_value_type lno_t; + typedef typename ValuesType::non_const_value_type scalar_t; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + NGBLType nodes_grouped_by_level; + + const bool is_lowertri; + + long node_count; // like "block" offset into ngbl, my_league is the "local" offset + long dense_nrows; + + + TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, const bool is_lowertri_, long node_count_, long dense_nrows_ = 0) : + row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), is_lowertri(is_lowertri_), node_count(node_count_), dense_nrows(dense_nrows_) {} + + + KOKKOS_INLINE_FUNCTION + void operator()( const member_type & team ) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid+1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) { + auto colid = entries(ptr); + + auto val = values(ptr); + if ( colid != rowid ) { + tdiff = tdiff - val*lhs(colid); + } + }, diff ); + + team.team_barrier(); + + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if ( my_rank == 0 ) + { + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : (rhs_rowid+diff)/values(soffset); + } + } + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag&, const member_type & team) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid+1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + + Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff = tdiff - val*lhs(colid); + } + else { + diag = ptr; + } + }, diff ); + team.team_barrier(); + + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if ( my_rank == 0 ) + { + lhs(rowid) = (rhs_rowid+diff)/values(diag); + } + } +}; + + +template +struct TriLvlSchedTP1SolverFunctorDiagValues +{ + typedef typename RowMapType::execution_space execution_space; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type member_type; + typedef typename EntriesType::non_const_value_type lno_t; + typedef typename ValuesType::non_const_value_type scalar_t; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + NGBLType nodes_grouped_by_level; + ValuesType diagonal_values; // inserted according to rowid + + const bool is_lowertri; + + long node_count; // like "block" offset into ngbl, my_league is the "local" offset + long dense_nrows; + + + TriLvlSchedTP1SolverFunctorDiagValues( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, const ValuesType &diagonal_values_, const bool is_lowertri_, long node_count_, long dense_nrows_ = 0) : + row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), diagonal_values(diagonal_values_), is_lowertri(is_lowertri_), node_count(node_count_), dense_nrows(dense_nrows_) {} + + + KOKKOS_INLINE_FUNCTION + void operator()( const member_type & team ) const { + auto my_league = team.league_rank(); // map to rowid + auto rowid = nodes_grouped_by_level(my_league + node_count); + auto my_rank = team.team_rank(); + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid+1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff = tdiff - val*lhs(colid); + } + }, diff ); + + team.team_barrier(); + + // At end, finalize rowid == colid + // only one thread should do this; can also use Kokkos::single + if ( my_rank == 0 ) + { + //lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : (rhs_rowid+diff)/values(soffset); + lhs(rowid) = (rhs_rowid+diff)/diagonal_values(rowid); + } + } + +}; + + +template +struct TriLvlSchedTP2SolverFunctor +{ + typedef typename RowMapType::execution_space execution_space; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type member_type; + typedef typename EntriesType::non_const_value_type lno_t; + typedef typename ValuesType::non_const_value_type scalar_t; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + NGBLType nodes_grouped_by_level; + + const bool is_lowertri; + long node_count; // like "block" offset into ngbl, my_league is the "local" offset + long node_groups; + long dense_nrows; + + + TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, const bool is_lowertri_, long node_count_, long node_groups_ = 0, long dense_nrows_ = 0) : + row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), is_lowertri(is_lowertri_), node_count(node_count_), node_groups(node_groups_), dense_nrows(dense_nrows_) {} + + + KOKKOS_INLINE_FUNCTION + void operator()(const member_type & team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, node_groups ), [&] ( const long ng ) { + auto rowid = nodes_grouped_by_level(node_count + my_league*node_groups + ng); + if ( size_t(rowid) < nrows ) { + + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid+1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff = tdiff - val*lhs(colid); + } + }, diff ); + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : (rhs_rowid+diff)/values(soffset); + } // end if + }); // end TeamThreadRange + + team.team_barrier(); + } + + KOKKOS_INLINE_FUNCTION + void operator()(const UnsortedTag&, const member_type & team) const { + auto my_league = team.league_rank(); // map to rowid + + size_t nrows = row_map.extent(0) - 1; + + Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, node_groups ), [&] ( const long ng ) { + auto rowid = nodes_grouped_by_level(node_count + my_league*node_groups + ng); + if ( size_t(rowid) < nrows ) { + auto soffset = row_map(rowid); + auto eoffset = row_map(rowid+1); + auto rhs_rowid = rhs(rowid); + scalar_t diff = scalar_t(0.0); + + auto diag = -1; + Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff = tdiff - val*lhs(colid); + } + else { + diag = ptr; + } + }, diff ); + + lhs(rowid) = (rhs_rowid+diff)/values(diag); + } // end if + }); // end TeamThreadRange + + team.team_barrier(); + } +}; + + +// Lower vs Upper Multi-block Functors template struct LowerTriLvlSchedRPSolverFunctor @@ -120,6 +405,7 @@ struct LowerTriLvlSchedRPSolverFunctor }; + template struct LowerTriLvlSchedTP1SolverFunctor { @@ -136,7 +422,7 @@ struct LowerTriLvlSchedTP1SolverFunctor RHSType rhs; NGBLType nodes_grouped_by_level; - long node_count; + long node_count; // like "block" offset into ngbl, my_league is the "local" offset long node_groups; @@ -148,7 +434,7 @@ struct LowerTriLvlSchedTP1SolverFunctor void operator()( const member_type & team ) const { auto my_league = team.league_rank(); // map to rowid auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_team = team.team_rank(); + auto my_rank = team.team_rank(); auto soffset = row_map(rowid); auto eoffset = row_map(rowid+1); @@ -167,7 +453,7 @@ struct LowerTriLvlSchedTP1SolverFunctor // At end, finalize rowid == colid // only one thread should do this; can also use Kokkos::single - if ( my_team == 0 ) + if ( my_rank == 0 ) { // ASSUMPTION: sorted diagonal value located at eoffset - 1 lhs(rowid) = (rhs_rowid+diff)/values(eoffset-1); @@ -178,7 +464,7 @@ struct LowerTriLvlSchedTP1SolverFunctor void operator()(const UnsortedTag&, const member_type & team) const { auto my_league = team.league_rank(); // map to rowid auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_team = team.team_rank(); + auto my_rank = team.team_rank(); auto soffset = row_map(rowid); auto eoffset = row_map(rowid+1); @@ -197,19 +483,18 @@ struct LowerTriLvlSchedTP1SolverFunctor diag = ptr; } }, diff ); - team.team_barrier(); // At end, finalize rowid == colid // only one thread should do this; can also use Kokkos::single - if ( my_team == 0 ) + if ( my_rank == 0 ) { - // ASSUMPTION: sorted diagonal value located at eoffset - 1 lhs(rowid) = (rhs_rowid+diff)/values(diag); } } }; + // FIXME CUDA: This algorithm not working with all integral type combos // In any case, this serves as a skeleton for 3-level hierarchical parallelism for alg dev template @@ -228,11 +513,11 @@ struct LowerTriLvlSchedTP2SolverFunctor RHSType rhs; NGBLType nodes_grouped_by_level; - long node_count; + long node_count; // like "block" offset into ngbl, my_league is the "local" offset long node_groups; - LowerTriLvlSchedTP2SolverFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) : + LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) : row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), node_count(node_count_), node_groups(node_groups_) {} @@ -359,6 +644,7 @@ struct UpperTriLvlSchedRPSolverFunctor }; + template struct UpperTriLvlSchedTP1SolverFunctor { @@ -375,7 +661,7 @@ struct UpperTriLvlSchedTP1SolverFunctor RHSType rhs; NGBLType nodes_grouped_by_level; - long node_count; + long node_count; // like "block" offset into ngbl, my_league is the "local" offset long node_groups; @@ -387,7 +673,7 @@ struct UpperTriLvlSchedTP1SolverFunctor void operator()(const member_type & team) const { auto my_league = team.league_rank(); // map to rowid auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_team = team.team_rank(); + auto my_rank = team.team_rank(); auto soffset = row_map(rowid); auto eoffset = row_map(rowid+1); @@ -406,7 +692,7 @@ struct UpperTriLvlSchedTP1SolverFunctor // At end, finalize rowid == colid // only one thread should do this, also can use Kokkos::single - if ( my_team == 0 ) + if ( my_rank == 0 ) { // ASSUMPTION: sorted diagonal value located at start offset lhs(rowid) = (rhs_rowid+diff)/values(soffset); @@ -417,7 +703,7 @@ struct UpperTriLvlSchedTP1SolverFunctor void operator()(const UnsortedTag&, const member_type & team) const { auto my_league = team.league_rank(); // map to rowid auto rowid = nodes_grouped_by_level(my_league + node_count); - auto my_team = team.team_rank(); + auto my_rank = team.team_rank(); auto soffset = row_map(rowid); auto eoffset = row_map(rowid+1); @@ -440,13 +726,11 @@ struct UpperTriLvlSchedTP1SolverFunctor // At end, finalize rowid == colid // only one thread should do this, also can use Kokkos::single - if ( my_team == 0 ) + if ( my_rank == 0 ) { - // ASSUMPTION: sorted diagonal value located at start offset lhs(rowid) = (rhs_rowid+diff)/values(diag); } } - }; @@ -468,16 +752,16 @@ struct UpperTriLvlSchedTP2SolverFunctor RHSType rhs; NGBLType nodes_grouped_by_level; - long node_count; + long node_count; // like "block" offset into ngbl, my_league is the "local" offset long node_groups; - UpperTriLvlSchedTP2SolverFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0 ) : + UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) : row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), node_count(node_count_), node_groups(node_groups_) {} KOKKOS_INLINE_FUNCTION - void operator()( const member_type & team ) const { + void operator()(const member_type & team) const { auto my_league = team.league_rank(); // map to rowid size_t nrows = row_map.extent(0) - 1; @@ -533,7 +817,6 @@ struct UpperTriLvlSchedTP2SolverFunctor } }, diff ); - // ASSUMPTION: sorted diagonal value located at start offset lhs(rowid) = (rhs_rowid+diff)/values(diag); } // end if }); // end TeamThreadRange @@ -544,29 +827,1191 @@ struct UpperTriLvlSchedTP2SolverFunctor }; -template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > -void lower_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { - - typedef typename TriSolveHandle::execution_space execution_space; - typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; - - auto nlevels = thandle.get_num_levels(); - // Keep this a host View, create device version and copy to back to host during scheduling - auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); - Kokkos::deep_copy(hnodes_per_level, nodes_per_level); +// -------------------------------- +// Single-block functors +// -------------------------------- - auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); +template +struct LowerTriLvlSchedTP1SingleBlockFunctor +{ + typedef typename RowMapType::execution_space execution_space; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type member_type; + typedef typename EntriesType::non_const_value_type lno_t; + typedef typename ValuesType::non_const_value_type scalar_t; - size_type node_count = 0; + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + NGBLType nodes_grouped_by_level; + NGBLType nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" offset + long lvl_start; + long lvl_end; + long cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + + LowerTriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0 ) : + row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), nodes_per_level(nodes_per_level_), node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map thread to row + + KOKKOS_INLINE_FUNCTION + void operator()( const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + }, diff); +#endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 + lhs(rowid) = (rhs_val+diff)/values(eoffset-1); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()( const UnsortedTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + else { + diag = ptr; + } + }, diff); +#endif + lhs(rowid) = (rhs_val+diff)/values(diag); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + + KOKKOS_INLINE_FUNCTION + void operator()( const LargerCutoffTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + },diff); +#endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri + lhs(rowid) = (rhs_val+diff)/values(eoffset-1); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()( const UnsortedLargerCutoffTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + else { + diag = ptr; + } + },diff); +#endif + lhs(rowid) = (rhs_val+diff)/values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + +}; + + +template +struct UpperTriLvlSchedTP1SingleBlockFunctor +{ + typedef typename RowMapType::execution_space execution_space; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type member_type; + typedef typename EntriesType::non_const_value_type lno_t; + typedef typename ValuesType::non_const_value_type scalar_t; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + NGBLType nodes_grouped_by_level; + NGBLType nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" offset + long lvl_start; + long lvl_end; + long cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + + UpperTriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0 ) : + row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), nodes_per_level(nodes_per_level_), node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map thread to row + + KOKKOS_INLINE_FUNCTION + void operator()( const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + }, diff); +#endif + // ASSUMPTION: sorted diagonal value located at soffset + lhs(rowid) = (rhs_val+diff)/values(soffset); + } // end if + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl each thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()( const UnsortedTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + else { + diag = ptr; + } + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + else { + diag = ptr; + } + }, diff); +#endif + lhs(rowid) = (rhs_val+diff)/values(diag); + } // end if + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl each thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + + KOKKOS_INLINE_FUNCTION + void operator()( const LargerCutoffTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + }, diff); +#endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri + lhs(rowid) = (rhs_val+diff)/values(soffset); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()( const UnsortedLargerCutoffTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + else { + diag = ptr; + } + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + else { + diag = ptr; + } + }, diff); +#endif + lhs(rowid) = (rhs_val+diff)/values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator +}; + + +template +struct TriLvlSchedTP1SingleBlockFunctor +{ + typedef typename RowMapType::execution_space execution_space; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type member_type; + typedef typename EntriesType::non_const_value_type lno_t; + typedef typename ValuesType::non_const_value_type scalar_t; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + NGBLType nodes_grouped_by_level; + NGBLType nodes_per_level; + + long node_count; // like "block" offset into ngbl, my_league is the "local" offset + long lvl_start; + long lvl_end; + const bool is_lowertri; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + + TriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0 ) : + row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), nodes_per_level(nodes_per_level_), node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), is_lowertri(is_lower_), dense_nrows(dense_nrows_), cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map thread to row + + KOKKOS_INLINE_FUNCTION + void operator()( const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + }, diff); +#endif + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri + if (is_lowertri) + lhs(rowid) = (rhs_val+diff)/values(eoffset-1); + else + lhs(rowid) = (rhs_val+diff)/values(soffset); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + KOKKOS_INLINE_FUNCTION + void operator()( const UnsortedTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + else { + diag = ptr; + } + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + else { + diag = ptr; + } + }, diff); +#endif + lhs(rowid) = (rhs_val+diff)/values(diag); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + + KOKKOS_INLINE_FUNCTION + void operator()( const LargerCutoffTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + }, diff); +#endif + + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri + if (is_lowertri) + lhs(rowid) = (rhs_val+diff)/values(eoffset-1); + else + lhs(rowid) = (rhs_val+diff)/values(soffset); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + + KOKKOS_INLINE_FUNCTION + void operator()( const UnsortedLargerCutoffTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + auto diag = -1; + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + else { + diag = ptr; + } + } +#else + auto trange = eoffset - soffset; + auto diag = -1; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + else { + diag = ptr; + } + }, diff); +#endif + lhs(rowid) = (rhs_val+diff)/values(diag); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + +}; + + +template +struct TriLvlSchedTP1SingleBlockFunctorDiagValues +{ + typedef typename RowMapType::execution_space execution_space; + typedef Kokkos::TeamPolicy policy_type; + typedef typename policy_type::member_type member_type; + typedef typename EntriesType::non_const_value_type lno_t; + typedef typename ValuesType::non_const_value_type scalar_t; + + RowMapType row_map; + EntriesType entries; + ValuesType values; + LHSType lhs; + RHSType rhs; + NGBLType nodes_grouped_by_level; + NGBLType nodes_per_level; + ValuesType diagonal_values; + + long node_count; // like "block" offset into ngbl, my_league is the "local" offset + long lvl_start; + long lvl_end; + const bool is_lowertri; + const int dense_nrows; + const int cutoff; + // team_size: each team can be assigned a row, if there are enough rows... + + + TriLvlSchedTP1SingleBlockFunctorDiagValues( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, const NGBLType &nodes_per_level_, const ValuesType &diagonal_values_, long node_count_, const long lvl_start_, const long lvl_end_, const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0 ) : + row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), nodes_per_level(nodes_per_level_), diagonal_values(diagonal_values_), node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), is_lowertri(is_lower_), dense_nrows(dense_nrows_), cutoff(cutoff_) {} + + // SingleBlock: Only one block (or league) executing; team_rank used to map thread to row + + KOKKOS_INLINE_FUNCTION + void operator()( const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_rank = team.team_rank(); + diff = scalar_t(0.0); + + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + }, diff); +#endif + // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri + lhs(rowid) = (rhs_val+diff)/diagonal_values(rowid); + } // end if team.team_rank() < nodes_this_lvl + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end operator + + + KOKKOS_INLINE_FUNCTION + void operator()( const LargerCutoffTag&, const member_type & team ) const { + long mut_node_count = node_count; + typename NGBLType::non_const_value_type rowid {0}; + typename RowMapType::non_const_value_type soffset {0}; + typename RowMapType::non_const_value_type eoffset {0}; + typename RHSType::non_const_value_type rhs_val {0}; + scalar_t diff = scalar_t(0.0); + + for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) { + auto nodes_this_lvl = nodes_per_level(lvl); + int my_team_rank = team.team_rank(); + // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc. + for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) { + diff = scalar_t(0.0); + if (my_rank < nodes_this_lvl) { + // THIS is where the mapping of threadid to rowid happens + rowid = nodes_grouped_by_level(my_rank + mut_node_count); + soffset = row_map(rowid); + eoffset = row_map(rowid+1); + rhs_val = rhs(rowid); + +#ifdef SERIAL_FOR_LOOP + for (auto ptr = soffset; ptr < eoffset; ++ptr) { + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + diff -= val*lhs(colid); + } + } +#else + auto trange = eoffset - soffset; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff) + { + auto ptr = soffset + loffset; + auto colid = entries(ptr); + auto val = values(ptr); + if ( colid != rowid ) { + tdiff -= val*lhs(colid); + } + }, diff); +#endif + lhs(rowid) = (rhs_val+diff)/diagonal_values(rowid); + } // end if team.team_rank() < nodes_this_lvl + } // end for my_rank loop + { + // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread + mut_node_count += nodes_this_lvl; + } + team.team_barrier(); + } // end for lvl + } // end tagged operator + +}; + + +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT +template +struct ReturnTeamPolicyType; + +#ifdef KOKKOS_ENABLE_SERIAL +template <> +struct ReturnTeamPolicyType { + using PolicyType = Kokkos::TeamPolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType ) { + return PolicyType(nt,ts); + //return PolicyType(ExecInstanceType(),nt,ts); + } +}; +#endif +#ifdef KOKKOS_ENABLE_OPENMP +template <> +struct ReturnTeamPolicyType { + using PolicyType = Kokkos::TeamPolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType ) { + return PolicyType(nt,ts); + //return PolicyType(ExecInstanceType(),nt,ts); + } +}; +#endif +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct ReturnTeamPolicyType { + using PolicyType = Kokkos::TeamPolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { + return PolicyType(stream,nt,ts); + } +}; +#endif + +template +struct ReturnRangePolicyType; + +#ifdef KOKKOS_ENABLE_SERIAL +template <> +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType ) { + return PolicyType(nt,ts); + //return PolicyType(ExecInstanceType(),nt,ts); + } +}; +#endif +#ifdef KOKKOS_ENABLE_OPENMP +template <> +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType ) { + return PolicyType(nt,ts); + //return PolicyType(ExecInstanceType(),nt,ts); + } +}; +#endif +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct ReturnRangePolicyType { + using PolicyType = Kokkos::RangePolicy; + + static inline + PolicyType get_policy(int nt, int ts) { + return PolicyType(nt,ts); + } + + template + static inline + PolicyType get_policy(int nt, int ts, ExecInstanceType stream) { + return PolicyType(stream,nt,ts); + } +}; +#endif + +template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > +void lower_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { + + typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; + typedef typename TriSolveHandle::execution_space execution_space; + typedef typename TriSolveHandle::size_type size_type; + typename TriSolveHandle::SPTRSVcudaGraphWrapperType* lcl_cudagraph = thandle.get_sptrsvCudaGraph(); + + auto nlevels = thandle.get_num_levels(); + + auto stream1 = lcl_cudagraph->stream; + Kokkos::Cuda cuda1(stream1); + auto graph = lcl_cudagraph->cudagraph; + + Kokkos::parallel_for("Init", Kokkos::RangePolicy(0,1), EmptyFunctor()); + Kokkos::Cuda().fence(); + cudaStreamSynchronize(stream1); + //Kokkos::fence(); + + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + + size_type node_count = 0; + + int team_size = thandle.get_team_size(); + team_size = team_size == -1 ? 64 : team_size; + + // Start capturing stream + if(thandle.cudagraphCreated == false) { + Kokkos::fence(); + cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); + { + for (int iter = 0; iter < nlevels; ++iter) { + size_type lvl_nodes = hnodes_per_level(iter); + + using policy_type = ReturnTeamPolicyType; + + Kokkos::parallel_for("parfor_l_team_cudagraph", Kokkos::Experimental::require(ReturnTeamPolicyType::get_policy(lvl_nodes,team_size,cuda1), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedTP1SolverFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); + + node_count += hnodes_per_level(iter); + } + } + cudaStreamEndCapture(stream1, &graph); + + // Create graphExec + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, 0); + thandle.cudagraphCreated = true; + } + // Run graph + Kokkos::fence(); + cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); + + cudaStreamSynchronize(stream1); + Kokkos::fence(); +} // end lower_tri_solve_cg + + +template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > +void upper_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { + + typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; + typedef typename TriSolveHandle::execution_space execution_space; + typedef typename TriSolveHandle::size_type size_type; + typename TriSolveHandle::SPTRSVcudaGraphWrapperType* lcl_cudagraph = thandle.get_sptrsvCudaGraph(); + + auto nlevels = thandle.get_num_levels(); + + auto stream1 = lcl_cudagraph->stream; + Kokkos::Cuda cuda1(stream1); + auto graph = lcl_cudagraph->cudagraph; + + Kokkos::parallel_for("Init", Kokkos::RangePolicy(0,1), EmptyFunctor()); + Kokkos::Cuda().fence(); + cudaStreamSynchronize(stream1); + + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + + size_type node_count = 0; + + int team_size = thandle.get_team_size(); + team_size = team_size == -1 ? 64 : team_size; + + // Start capturing stream + if(thandle.cudagraphCreated == false) { + Kokkos::fence(); + cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal); + { + for (int iter = 0; iter < nlevels; ++iter) { + size_type lvl_nodes = hnodes_per_level(iter); + + using policy_type = ReturnTeamPolicyType; + + Kokkos::parallel_for("parfor_u_team_cudagraph", Kokkos::Experimental::require(ReturnTeamPolicyType::get_policy(lvl_nodes,team_size,cuda1), Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedTP1SolverFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count)); + + node_count += hnodes_per_level(iter); + } + } + cudaStreamEndCapture(stream1, &graph); + + // Create graphExec + cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, 0); + thandle.cudagraphCreated = true; + } + // Run graph + Kokkos::fence(); + cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1); + + cudaStreamSynchronize(stream1); + Kokkos::fence(); +} // end upper_tri_solve_cg + +#endif + + +template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > +void lower_tri_solve(TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) +cudaProfilerStop(); +#endif + + typedef typename TriSolveHandle::execution_space execution_space; + typedef typename TriSolveHandle::size_type size_type; + typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; + + auto nlevels = thandle.get_num_levels(); + // Keep this a host View, create device version and copy to back to host during scheduling + // This requires making sure the host view in the handle is properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + + size_type node_count = 0; - // This must stay serial; would be nice to try out Cuda's graph stuff to reduce kernel launch overhead for ( size_type lvl = 0; lvl < nlevels; ++lvl ) { + { size_type lvl_nodes = hnodes_per_level(lvl); if ( lvl_nodes != 0 ) { +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) +cudaProfilerStart(); +#endif if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP ) { Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::RangePolicy( node_count, node_count+lvl_nodes ), LowerTriLvlSchedRPSolverFunctor (row_map, entries, values, lhs, rhs, nodes_grouped_by_level) ); } @@ -574,57 +2019,78 @@ void lower_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const typedef Kokkos::TeamPolicy policy_type; int team_size = thandle.get_team_size(); +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count); +#else LowerTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); +#endif if ( team_size == -1 ) Kokkos::parallel_for("parfor_l_team", policy_type( lvl_nodes , Kokkos::AUTO ), tstf); else Kokkos::parallel_for("parfor_l_team", policy_type( lvl_nodes , team_size ), tstf); } - /* // TP2 algorithm has issues with some offset-ordinal combo to be addressed + /* else if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef Kokkos::TeamPolicy tvt_policy_type; int team_size = thandle.get_team_size(); if ( team_size == -1 ) { - team_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 128; + team_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 64; } int vector_size = thandle.get_team_size(); if ( vector_size == -1 ) { vector_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 4; } - // This impl: "chunk" lvl_nodes into node_groups; a league_rank is responsible for processing that many nodes - // TeamThreadRange over number of node_groups - // To avoid masking threads, 1 thread (team) per node in node_group + // This impl: "chunk" lvl_nodes into node_groups; a league_rank is responsible for processing team_size # nodes + // TeamThreadRange over number nodes of node_groups + // To avoid masking threads, 1 thread (team) per node in node_group (thread has full ownership of a node) // ThreadVectorRange responsible for the actual solve computation - const int node_groups = team_size; + //const int node_groups = team_size; + const int node_groups = vector_size; +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count, vector_size, 0); +#else LowerTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups); +#endif Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size ), tstf); } // end elseif */ node_count += lvl_nodes; +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) +cudaProfilerStop(); +#endif } // end if + } // scope for if-block + } // end for lvl } // end lower_tri_solve + template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > -void upper_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { +void upper_tri_solve(TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) { +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) +cudaProfilerStop(); +#endif typedef typename TriSolveHandle::execution_space execution_space; typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; + auto nlevels = thandle.get_num_levels(); // Keep this a host View, create device version and copy to back to host during scheduling + // This requires making sure the host view in the handle is properly updated after the symbolic phase auto nodes_per_level = thandle.get_nodes_per_level(); - auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); - Kokkos::deep_copy(hnodes_per_level, nodes_per_level); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + //auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level); + //Kokkos::deep_copy(hnodes_per_level, nodes_per_level); auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); @@ -635,6 +2101,9 @@ void upper_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const size_type lvl_nodes = hnodes_per_level(lvl); if ( lvl_nodes != 0 ) { +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) +cudaProfilerStart(); +#endif if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP ) { Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::RangePolicy( node_count, node_count+lvl_nodes ), UpperTriLvlSchedRPSolverFunctor (row_map, entries, values, lhs, rhs, nodes_grouped_by_level) ); @@ -644,20 +2113,24 @@ void upper_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const int team_size = thandle.get_team_size(); +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, false, node_count); +#else UpperTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); +#endif if ( team_size == -1 ) Kokkos::parallel_for("parfor_u_team", policy_type( lvl_nodes , Kokkos::AUTO ), tstf); else Kokkos::parallel_for("parfor_u_team", policy_type( lvl_nodes , team_size ), tstf); } - /* // TP2 algorithm has issues with some offset-ordinal combo to be addressed + /* else if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) { typedef Kokkos::TeamPolicy tvt_policy_type; int team_size = thandle.get_team_size(); if ( team_size == -1 ) { - team_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 128; + team_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 64; } int vector_size = thandle.get_team_size(); if ( vector_size == -1 ) { @@ -665,24 +2138,209 @@ void upper_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const } // This impl: "chunk" lvl_nodes into node_groups; a league_rank is responsible for processing that many nodes - // TeamThreadRange over number of node_groups - // To avoid masking threads, 1 thread (team) per node in node_group + // TeamThreadRange over number nodes of node_groups + // To avoid masking threads, 1 thread (team) per node in node_group (thread has full ownership of a node) // ThreadVectorRange responsible for the actual solve computation - const int node_groups = team_size; + //const int node_groups = team_size; + const int node_groups = vector_size; +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, false, node_count, vector_size, 0); +#else UpperTriLvlSchedTP2SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups); +#endif + Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size ), tstf); } // end elseif */ node_count += lvl_nodes; +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) +cudaProfilerStop(); +#endif } // end if } // end for lvl } // end upper_tri_solve +template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType > +void tri_solve_chain(TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs, const bool is_lowertri_) { + +#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE) +cudaProfilerStop(); +#endif + typedef typename TriSolveHandle::execution_space execution_space; + typedef typename TriSolveHandle::size_type size_type; + typedef typename TriSolveHandle::nnz_lno_view_t NGBLType; + + // Algorithm is checked before this function is called + auto h_chain_ptr = thandle.get_host_chain_ptr(); + size_type num_chain_entries = thandle.get_num_chain_entries(); + + // Keep this a host View, create device version and copy to back to host during scheduling + // This requires making sure the host view in the handle is properly updated after the symbolic phase + auto nodes_per_level = thandle.get_nodes_per_level(); + auto hnodes_per_level = thandle.get_host_nodes_per_level(); + + auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); + + const bool is_lowertri = thandle.is_lower_tri(); + + size_type node_count = 0; + +// REFACTORED to cleanup; next, need debug and timer routines + using policy_type = Kokkos::TeamPolicy; + using large_cutoff_policy_type = Kokkos::TeamPolicy; +/* + using TP1Functor = TriLvlSchedTP1SolverFunctor; + using LTP1Functor = LowerTriLvlSchedTP1SolverFunctor; + using UTP1Functor = UpperTriLvlSchedTP1SolverFunctor; + using LSingleBlockFunctor = LowerTriLvlSchedTP1SingleBlockFunctor; + using USingleBlockFunctor = UpperTriLvlSchedTP1SingleBlockFunctor; +*/ + using SingleBlockFunctor = TriLvlSchedTP1SingleBlockFunctor; + + int team_size = thandle.get_team_size(); + int vector_size = thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1; + + auto cutoff = thandle.get_chain_threshold(); + int team_size_singleblock = team_size; + + // Enumerate options + // ts -1,0 | cu 0 - select default ts == 1 + // ts -1,0 | cu > 0 - select default ts; restriction: ts <= tsmax (auto) + // ts > 0 | cu 0 - set + // ts > 0 | cu > 0 - set + // Controls ts,cu > 0 + // co > ts - not all rows can be mapped to a thread - must call largercutoff impl + // co <= ts - okay, kernel must be careful not to access out-of-bounds; some threads idol + if (team_size_singleblock <= 0 && cutoff == 0) { + team_size_singleblock = 1; + // If cutoff == 0, no single-block calls will be made, team_size_singleblock is unimportant + } + + // This is only necessary for Lower,UpperTri functor versions; else, is_lowertri can be passed as arg to the generic Tri functor... + if (is_lowertri) { + + for ( size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink ) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink+1); + + if ( echain - schain == 1 ) { + + // if team_size is -1 (unset), get recommended size from Kokkos +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count); +#else + LowerTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); +#endif + if (team_size == - 1) { + team_size = policy_type(1, 1, vector_size).team_size_recommended(tstf, Kokkos::ParallelForTag()); + } + + size_type lvl_nodes = hnodes_per_level(schain); //lvl == echain???? + Kokkos::parallel_for("parfor_l_team_chain1", policy_type(lvl_nodes , team_size, vector_size), tstf); + node_count += lvl_nodes; + + } + else { + size_type lvl_nodes = 0; + + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); + } + + if (team_size_singleblock <= 0) { + team_size_singleblock = policy_type(1, 1, vector_size).team_size_recommended(SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri), Kokkos::ParallelForTag()); + } + + if (cutoff <= team_size_singleblock) { +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, true); +#else + LowerTriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain); +#endif + Kokkos::parallel_for("parfor_l_team_chainmulti", policy_type(1, team_size_singleblock, vector_size), tstf); + } + else { + // team_size_singleblock < cutoff => kernel must allow for a block-stride internally +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, true, 0, cutoff); +#else + LowerTriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, cutoff); +#endif + Kokkos::parallel_for("parfor_l_team_chainmulti_cutoff", large_cutoff_policy_type(1, team_size_singleblock, vector_size), tstf); + } + node_count += lvl_nodes; + } + Kokkos::fence(); // TODO - is this necessary? that is, can the parallel_for launch before the s/echain values have been updated? + } + + } + else { + + for ( size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink ) { + size_type schain = h_chain_ptr(chainlink); + size_type echain = h_chain_ptr(chainlink+1); + + if ( echain - schain == 1 ) { + + // if team_size is -1 (unset), get recommended size from Kokkos +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count); +#else + UpperTriLvlSchedTP1SolverFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count); +#endif + if (team_size == - 1) { + team_size = policy_type(1, 1, vector_size).team_size_recommended(tstf, Kokkos::ParallelForTag()); + } + + // TODO To use cudagraph here, need to know how many non-unit chains there are, create a graph for each and launch accordingly + size_type lvl_nodes = hnodes_per_level(schain); //lvl == echain???? + Kokkos::parallel_for("parfor_u_team_chain1", policy_type(lvl_nodes , team_size, vector_size), tstf); + node_count += lvl_nodes; + + } + else { + size_type lvl_nodes = 0; + + for (size_type i = schain; i < echain; ++i) { + lvl_nodes += hnodes_per_level(i); + } + + if (team_size_singleblock <= 0) { + //team_size_singleblock = policy_type(1, 1, 1).team_size_recommended(SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), Kokkos::ParallelForTag()); + team_size_singleblock = policy_type(1, 1, vector_size).team_size_recommended(SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri), Kokkos::ParallelForTag()); + } + + if (cutoff <= team_size_singleblock) { +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri); +#else + UpperTriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain); +#endif + Kokkos::parallel_for("parfor_u_team_chainmulti", policy_type(1, team_size_singleblock, vector_size), tstf); + } + else { + // team_size_singleblock < cutoff => kernel must allow for a block-stride internally +#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED + TriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri, 0, cutoff); +#else + UpperTriLvlSchedTP1SingleBlockFunctor tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, cutoff); +#endif + Kokkos::parallel_for("parfor_u_team_chainmulti_cutoff", large_cutoff_policy_type(1, team_size_singleblock, vector_size), tstf); + } + node_count += lvl_nodes; + } + Kokkos::fence(); // TODO - is this necessary? that is, can the parallel_for launch before the s/echain values have been updated? + } + + } + +} // end tri_solve_chain + } // namespace Experimental } // namespace Impl } // namespace KokkosSparse diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp index 7375aa4afb..ce6f2bc599 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp @@ -102,6 +102,10 @@ struct sptrsv_solve_eti_spec_avail { namespace KokkosSparse { namespace Impl { +#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH) + #define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT +#endif + // Unification layer /// \brief Implementation of KokkosSparse::sptrsv_solve @@ -155,19 +159,40 @@ struct SPTRSV_SOLVEget_sptrsv_handle(); if ( sptrsv_handle->is_lower_tri() ) { if ( sptrsv_handle->is_symbolic_complete() == false ) { Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); } - Experimental::lower_tri_solve( *sptrsv_handle, row_map, entries, values, b, x); + if ( sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN ) { + Experimental::tri_solve_chain( *sptrsv_handle, row_map, entries, values, b, x, true); + } + else { +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + if ( std::is_same::value) + Experimental::lower_tri_solve_cg( *sptrsv_handle, row_map, entries, values, b, x); + else +#endif + Experimental::lower_tri_solve( *sptrsv_handle, row_map, entries, values, b, x); + } } else { if ( sptrsv_handle->is_symbolic_complete() == false ) { Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries); } - Experimental::upper_tri_solve( *sptrsv_handle, row_map, entries, values, b, x); + if ( sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN ) { + Experimental::tri_solve_chain( *sptrsv_handle, row_map, entries, values, b, x, false); + } + else { +#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT + if ( std::is_same::value) + Experimental::upper_tri_solve_cg( *sptrsv_handle, row_map, entries, values, b, x); + else +#endif + Experimental::upper_tri_solve( *sptrsv_handle, row_map, entries, values, b, x); + } } } diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp index 2335f68dea..721b0377ac 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp @@ -51,33 +51,138 @@ #include #include +//#define TRISOLVE_SYMB_TIMERS //#define LVL_OUTPUT_INFO +//#define CHAIN_LVL_OUTPUT_INFO + +// TODO Pass values array and store diagonal entries - should this always be done or optional? namespace KokkosSparse { namespace Impl { namespace Experimental { +template +void print_view1d_symbolic(const ViewType dv, size_t range = 0) { + auto v = Kokkos::create_mirror_view(dv); + Kokkos::deep_copy(v, dv); + std::cout << "Output for view " << v.label() << std::endl; + range = range == 0 ? dv.extent(0) : range; + for (size_t i = 0; i < range; ++i) { + std::cout << "v(" << i << ") = " << v(i) << " , "; + } + std::cout << std::endl; +} + + +// Usage: + // for c in [0, num_chain_entries) + // s = h_chain_ptr(c); e = h_chain_ptr(c+1); + // num_levels_in_current_chain = e - s; + // if nlicc > 256 + // call current_alg + // else + // call single_block(s,e) + +template < class TriSolveHandle, class NPLViewType > +void symbolic_chain_phase(TriSolveHandle &thandle, const NPLViewType &nodes_per_level) { + +#ifdef TRISOLVE_SYMB_TIMERS + Kokkos::Timer timer_sym_chain_total; +#endif + typedef typename TriSolveHandle::size_type size_type; -template < class TriSolveHandle, class RowMapType, class EntriesType > -void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries) { + size_type nlevels = thandle.get_num_levels(); + + // Create the chain now + // FIXME Implementations will need to be templated on exec space it seems... + auto cutoff_threshold = thandle.get_chain_threshold(); + if ( thandle.algm_requires_symb_chain() ) { + auto h_chain_ptr = thandle.get_host_chain_ptr(); + h_chain_ptr(0) = 0; + size_type chainlinks_length = 0; + size_type num_chain_entries = 0; + int chain_state = 0; + const int cutoff = cutoff_threshold; + for ( size_type i = 0; i < nlevels; ++i ) { + auto cnpl = nodes_per_level(i); + if (cnpl <= cutoff) { + // this nlevels may be part of a chain passed to the "single_block" solver to reduce kernel launches + chainlinks_length += 1; + } + else { + // Too many levels to run on single block... + // If first lvl <= cutoff but next nlevels isn't, the two aren't separately updated and info is lost... + // if chainlinks_length > 0, take path so that chain-links updated, then current too large chain updated (i.e. 2 updates); if chainlinks_length == 0, then no previous chains and only one update required (npl too large for single-block + chain_state = chainlinks_length > 0 ? 2 : 1; + } + + // if we hit final nlevels before a trigger to update the chain, than override it + // in this case, there was not a larger value to miss cutoff and reset the update + if ( chain_state == 0 && i == nlevels-1 ) { chain_state = 1; } + + if (chain_state == 1) { + num_chain_entries += 1; + if (chainlinks_length == 0) { + h_chain_ptr(num_chain_entries) = h_chain_ptr(num_chain_entries-1) + 1; + } + else { + h_chain_ptr(num_chain_entries) = h_chain_ptr(num_chain_entries-1) + chainlinks_length; + } + chainlinks_length = 0; //reset + chain_state = 0; //reset + } + // Two updates required - should only occur if chainlinks_length > 0 + // We have found two things: a non-one length chain, and a subsequent one length chain + if (chain_state == 2) { + if (chainlinks_length == 0) { std::runtime_error("MAJOR LOGIC ERROR! TERMINATE!"); } + + num_chain_entries += 1; + h_chain_ptr(num_chain_entries) = h_chain_ptr(num_chain_entries-1) + chainlinks_length; + + num_chain_entries += 1; + h_chain_ptr(num_chain_entries) = h_chain_ptr(num_chain_entries-1) + 1; + + chainlinks_length = 0; //reset + chain_state = 0; //reset + } + } + thandle.set_num_chain_entries(num_chain_entries); + +#ifdef CHAIN_LVL_OUTPUT_INFO + std::cout << " num_chain_entries = " << thandle.get_num_chain_entries() << std::endl; + for ( size_type i = 0; i < num_chain_entries+1; ++i ) + { + std::cout << "chain_ptr(" << i << "): " << h_chain_ptr(i) << std::endl; + } +#endif + } - if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP || - thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1 ) -/* || thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 )*/ +#ifdef TRISOLVE_SYMB_TIMERS + std::cout << " Symbolic Chain Phase Total Time: " << timer_sym_chain_total.seconds() << std::endl;; +#endif +} // end symbolic_chain_phase + + +template < class TriSolveHandle, class RowMapType, class EntriesType > +void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries) { +#ifdef TRISOLVE_SYMB_TIMERS + Kokkos::Timer timer_sym_lowertri_total; +#endif + if ( thandle.algm_requires_symb_lvlsched() ) { - // Scheduling currently compute on host - need host copy of all views + // Scheduling currently computes on host - need host copy of all views typedef typename TriSolveHandle::size_type size_type; typedef typename TriSolveHandle::nnz_lno_view_t DeviceEntriesType; - typedef typename TriSolveHandle::nnz_lno_view_t::HostMirror HostEntriesType; typedef typename TriSolveHandle::signed_nnz_lno_view_t DeviceSignedEntriesType; typedef typename TriSolveHandle::signed_nnz_lno_view_t::HostMirror HostSignedEntriesType; typedef typename TriSolveHandle::signed_integral_t signed_integral_t; - size_type nrows = thandle.get_nrows(); + // Necessary for partitioned persisting sparse matrix + size_type nrows = drow_map.extent(0)-1; auto row_map = Kokkos::create_mirror_view(drow_map); Kokkos::deep_copy(row_map, drow_map); @@ -85,13 +190,13 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co auto entries = Kokkos::create_mirror_view(dentries); Kokkos::deep_copy(entries, dentries); + // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); - HostEntriesType nodes_per_level = Kokkos::create_mirror_view(dnodes_per_level); - Kokkos::deep_copy(nodes_per_level, dnodes_per_level); + auto nodes_per_level = thandle.get_host_nodes_per_level(); + // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - HostEntriesType nodes_grouped_by_level = Kokkos::create_mirror_view(dnodes_grouped_by_level); - Kokkos::deep_copy(nodes_grouped_by_level, dnodes_grouped_by_level); + auto nodes_grouped_by_level = thandle.get_host_nodes_grouped_by_level(); DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); @@ -100,19 +205,21 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co HostSignedEntriesType previous_level_list( Kokkos::ViewAllocateWithoutInitializing("previous_level_list"), nrows ); Kokkos::deep_copy( previous_level_list, signed_integral_t(-1) ); + const bool stored_diagonal = thandle.is_stored_diagonal(); + // diagonal_offsets is uninitialized - deep_copy unnecessary at the beginning, only needed at the end + auto diagonal_offsets = thandle.get_diagonal_offsets(); + auto hdiagonal_offsets = thandle.get_host_diagonal_offsets(); - // node 0 is trivially independent in lower tri solve, start with it in level 0 size_type level = 0; auto starting_node = 0; - level_list(starting_node) = 0; - size_type node_count = 1; //lower tri: starting with node 0 already in level 0 + auto ending_node = nrows; - nodes_per_level(0) = 1; - nodes_grouped_by_level(0) = starting_node; + size_type node_count = 0; while (node_count < nrows) { - for ( size_type row = 1; row < nrows; ++row ) { // row 0 already included + for ( size_type row = starting_node; row < ending_node; ++row ) + { if ( level_list(row) == -1 ) { // unmarked bool is_root = true; signed_integral_t ptrstart = row_map(row); @@ -126,6 +233,14 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co break; } } + else if ( col == row ) { + if (stored_diagonal) + hdiagonal_offsets(row) = offset; + } + else if ( col > row ) { + std::cout << "\nrow = " << row << " col = " << col << " offset = " << offset << std::endl; + std::runtime_error("SYMB ERROR: Lower tri with colid > rowid - SHOULD NOT HAPPEN!!!"); + } } // end for offset , i.e. cols of this row if ( is_root == true ) { @@ -146,9 +261,15 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co level += 1; } // end while - thandle.set_symbolic_complete(); thandle.set_num_levels(level); + // Create the chain now + if ( thandle.algm_requires_symb_chain() ) { + symbolic_chain_phase(thandle, nodes_per_level); + } + + thandle.set_symbolic_complete(); + // Output check #ifdef LVL_OUTPUT_INFO std::cout << " set symbolic complete: " << thandle.is_symbolic_complete() << std::endl; @@ -165,33 +286,62 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co { std::cout << "i: " << i << " nodes_grouped_by_level = " << nodes_grouped_by_level(i) << std::endl; } #endif + // Deep copy to device views Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy(dnodes_per_level, nodes_per_level); Kokkos::deep_copy(dlevel_list, level_list); + if (stored_diagonal) + Kokkos::deep_copy(diagonal_offsets, hdiagonal_offsets); + + // Extra check: +#ifdef LVL_OUTPUT_INFO + { + std::cout << " End symb - extra checks" << std::endl; + std::cout << " node_count = " << node_count << std::endl; + std::cout << " nlevel = " << level << std::endl; + std::cout << " npl.extent = " << nodes_per_level.extent(0) << std::endl; + long check_count = 0; + Kokkos::parallel_reduce("check_count host", Kokkos::RangePolicy(0, nodes_per_level.extent(0)), + KOKKOS_LAMBDA (const long i, long& update) { + update+=nodes_per_level(i); + }, check_count); + std::cout << " host check_count= " << check_count << std::endl; + + check_count = 0; // reset + Kokkos::parallel_reduce("check_count device", Kokkos::RangePolicy(0, dnodes_per_level.extent(0)), + KOKKOS_LAMBDA (const long i, long& update) { + update+=dnodes_per_level(i); + }, check_count); + std::cout << " devicecheck_count= " << check_count << std::endl; + } +#endif } -} // end lowertri_level_sched + +#ifdef TRISOLVE_SYMB_TIMERS + std::cout << " Symbolic (lower tri) Total Time: " << timer_sym_lowertri_total.seconds() << std::endl;; +#endif +} // end lower_tri_symbolic template < class TriSolveHandle, class RowMapType, class EntriesType > void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries ) { +#ifdef TRISOLVE_SYMB_TIMERS + Kokkos::Timer timer_sym_uppertri_total; +#endif - if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP || - thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1 ) -/* || thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 )*/ + if ( thandle.algm_requires_symb_lvlsched() ) { // Scheduling currently compute on host - need host copy of all views typedef typename TriSolveHandle::size_type size_type; - typedef typename TriSolveHandle::nnz_lno_view_t DeviceEntriesType; - typedef typename TriSolveHandle::nnz_lno_view_t::HostMirror HostEntriesType; - typedef typename TriSolveHandle::signed_nnz_lno_view_t DeviceSignedEntriesType; typedef typename TriSolveHandle::signed_nnz_lno_view_t::HostMirror HostSignedEntriesType; - typedef typename TriSolveHandle::signed_integral_t signed_integral_t; - size_type nrows = thandle.get_nrows(); +// size_type nrows = thandle.get_nrows(); + // Necessary for partitioned persisting sparse matrix + size_type nrows = drow_map.extent(0)-1; auto row_map = Kokkos::create_mirror_view(drow_map); Kokkos::deep_copy(row_map, drow_map); @@ -199,13 +349,13 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co auto entries = Kokkos::create_mirror_view(dentries); Kokkos::deep_copy(entries, dentries); + // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level(); - HostEntriesType nodes_per_level = Kokkos::create_mirror_view(dnodes_per_level); - Kokkos::deep_copy(nodes_per_level, dnodes_per_level); + auto nodes_per_level = thandle.get_host_nodes_per_level(); + // get device view - will deep_copy to it at end of this host routine DeviceEntriesType dnodes_grouped_by_level = thandle.get_nodes_grouped_by_level(); - HostEntriesType nodes_grouped_by_level = Kokkos::create_mirror_view(dnodes_grouped_by_level); - Kokkos::deep_copy(nodes_grouped_by_level, dnodes_grouped_by_level); + auto nodes_grouped_by_level = thandle.get_host_nodes_grouped_by_level(); DeviceSignedEntriesType dlevel_list = thandle.get_level_list(); HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list); @@ -214,19 +364,21 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co HostSignedEntriesType previous_level_list( Kokkos::ViewAllocateWithoutInitializing("previous_level_list"), nrows); Kokkos::deep_copy( previous_level_list, signed_integral_t(-1) ); + const bool stored_diagonal = thandle.is_stored_diagonal(); + // diagonal_offsets is uninitialized - deep_copy unnecessary at the beginning, only needed at the end + auto diagonal_offsets = thandle.get_diagonal_offsets(); + auto hdiagonal_offsets = thandle.get_host_diagonal_offsets(); - // final row is trivially independent in upper tri solve, start with it in level 0 size_type level = 0; auto starting_node = nrows - 1; - level_list(starting_node) = 0; - size_type node_count = 1; //upper tri: starting with node n already in level 0 + auto ending_node = 0; - nodes_per_level(0) = 1; - nodes_grouped_by_level(0) = starting_node; + size_type node_count = 0; while (node_count < nrows) { - for ( signed_integral_t row = nrows-2; row >= 0; --row ) { // row 0 already included + for ( signed_integral_t row = starting_node; row >= ending_node; --row ) + { if ( level_list(row) == -1 ) { // unmarked bool is_root = true; signed_integral_t ptrstart = row_map(row); @@ -234,12 +386,17 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co for (signed_integral_t offset = ptrend-1; offset >= ptrstart; --offset) { signed_integral_t col = entries(offset); - if ( previous_level_list(col) == -1 && col != row ) { // unmarked + + if (previous_level_list(col) == -1 && col != row) { // unmarked if ( col > row ) { is_root = false; break; } } + else if ( col == row ) { + if (stored_diagonal) + hdiagonal_offsets(row) = offset; + } } // end for offset , i.e. cols of this row if ( is_root == true ) { @@ -260,9 +417,15 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co level += 1; } // end while - thandle.set_symbolic_complete(); thandle.set_num_levels(level); + // Create the chain now + if ( thandle.algm_requires_symb_chain() ) { + symbolic_chain_phase(thandle, nodes_per_level); + } + + thandle.set_symbolic_complete(); + // Output check #ifdef LVL_OUTPUT_INFO std::cout << " set symbolic complete: " << thandle.is_symbolic_complete() << std::endl; @@ -279,15 +442,54 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co { std::cout << "i: " << i << " nodes_grouped_by_level = " << nodes_grouped_by_level(i) << std::endl; } #endif + // Deep copy to device views Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level); Kokkos::deep_copy(dnodes_per_level, nodes_per_level); Kokkos::deep_copy(dlevel_list, level_list); + if (stored_diagonal) + Kokkos::deep_copy(diagonal_offsets, hdiagonal_offsets); + + // Extra check: +#ifdef LVL_OUTPUT_INFO + { + std::cout << " End symb - extra checks" << std::endl; + std::cout << " node_count = " << node_count << std::endl; + std::cout << " nlevel = " << level << std::endl; + std::cout << " npl.extent = " << nodes_per_level.extent(0) << std::endl; + long check_count = 0; + Kokkos::parallel_reduce("check_count host", Kokkos::RangePolicy(0, nodes_per_level.extent(0)), + KOKKOS_LAMBDA (const long i, long& update) { + update+=nodes_per_level(i); + }, check_count); + std::cout << " host check_count= " << check_count << std::endl; + + check_count = 0; // reset + Kokkos::parallel_reduce("check_count device", Kokkos::RangePolicy(0, dnodes_per_level.extent(0)), + KOKKOS_LAMBDA (const long i, long& update) { + update+=dnodes_per_level(i); + }, check_count); + std::cout << " devicecheck_count= " << check_count << std::endl; + } +#endif + } -} // end uppertri_level_sched + +#ifdef TRISOLVE_SYMB_TIMERS + std::cout << " Symbolic (upper tri) Total Time: " << timer_sym_uppertri_total.seconds() << std::endl;; +#endif +} // end upper_tri_symbolic } // namespace Experimental } // namespace Impl } // namespace KokkosSparse +#ifdef LVL_OUTPUT_INFO +#undef LVL_OUTPUT_INFO +#endif + +#ifdef CHAIN_LVL_OUTPUT_INFO +#undef CHAIN_LVL_OUTPUT_INFO +#endif + #endif diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp index f0103a2171..e960a404c1 100644 --- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp +++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp @@ -124,6 +124,8 @@ struct SPTRSV_SYMBOLICget_sptrsv_handle(); + auto nrows = row_map.extent(0)-1; + sptrsv_handle->new_init_handle(nrows); if ( sptrsv_handle->is_lower_tri() ) { Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries); diff --git a/unit_test/sparse/Test_Sparse_sptrsv.hpp b/unit_test/sparse/Test_Sparse_sptrsv.hpp index 27b447cd2c..a5113db472 100644 --- a/unit_test/sparse/Test_Sparse_sptrsv.hpp +++ b/unit_test/sparse/Test_Sparse_sptrsv.hpp @@ -52,7 +52,7 @@ #include "KokkosKernels_SparseUtils.hpp" #include "KokkosSparse_spmv.hpp" #include "KokkosSparse_CrsMatrix.hpp" -#include +#include #include "KokkosSparse_sptrsv.hpp" @@ -344,7 +344,7 @@ void run_test_sptrsv() { typedef Kokkos::View< size_type*, device > RowMapType; typedef Kokkos::View< lno_t*, device > EntriesType; - typedef Kokkos::View< scalar_t*, device > ValuesType; + typedef Kokkos::View< scalar_t*, device > ValuesType; // Lower tri { @@ -391,10 +391,6 @@ void run_test_sptrsv() { typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - KernelHandle kh; - bool is_lower_tri = true; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs ValuesType known_lhs("known_lhs", nrows); // Create known solution lhs set to all 1's @@ -410,60 +406,107 @@ void run_test_sptrsv() { crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); - sptrsv_symbolic( &kh, row_map, entries ); - Kokkos::fence(); - - kh.get_sptrsv_handle()->print_algorithm(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; + { + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); + + sptrsv_symbolic( &kh, row_map, entries ); + Kokkos::fence(); + + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + Kokkos::deep_copy(lhs, 0); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + //FIXME Issues with various integral type combos - algorithm currently unavailable and commented out until fixed + /* + Kokkos::deep_copy(lhs, 0); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + */ + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - kh.get_sptrsv_handle()->print_algorithm(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; + { + Kokkos::deep_copy(lhs, 0); + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, is_lower_tri); + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + + sptrsv_symbolic( &kh, row_map, entries ); + Kokkos::fence(); + + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - -/* -//FIXME Issues with various integral type combos - algorithm currently unavailable and commented out until fixed - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - kh.get_sptrsv_handle()->print_algorithm(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Lower Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Lower Tri Solve SUCCESS!" << std::endl; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if (std::is_same::value && std::is_same::value) + { + Kokkos::deep_copy(lhs, 0); + KernelHandle kh; + bool is_lower_tri = true; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Lower Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); -*/ - - - kh.destroy_sptrsv_handle(); +#endif } // Upper tri { @@ -510,10 +553,6 @@ void run_test_sptrsv() { typedef KokkosKernels::Experimental::KokkosKernelsHandle KernelHandle; - KernelHandle kh; - bool is_lower_tri = false; - kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); - // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs ValuesType known_lhs("known_lhs", nrows); // Create known solution lhs set to all 1's @@ -529,60 +568,108 @@ void run_test_sptrsv() { crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries); KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs); - sptrsv_symbolic( &kh, row_map, entries ); - Kokkos::fence(); - - kh.get_sptrsv_handle()->print_algorithm(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - scalar_t sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; + { + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri); + + sptrsv_symbolic( &kh, row_map, entries ); + Kokkos::fence(); + + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + Kokkos::deep_copy(lhs, 0); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + //FIXME Issues with various integral type combos - algorithm currently unavailable and commented out until fixed + /* + Kokkos::deep_copy(lhs, 0); + kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + */ + + + kh.destroy_sptrsv_handle(); } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; - } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP); - kh.get_sptrsv_handle()->print_algorithm(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; + { + Kokkos::deep_copy(lhs, 0); + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, is_lower_tri); + auto chain_threshold = 1; + kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold); + + sptrsv_symbolic( &kh, row_map, entries ); + Kokkos::fence(); + + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); - -/* -//FIXME Issues with various integral type combos - algorithm currently unavailable and commented out until fixed - Kokkos::deep_copy(lhs, 0); - kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2); - kh.get_sptrsv_handle()->print_algorithm(); - sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); - Kokkos::fence(); - sum = 0.0; - Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); - if ( sum != lhs.extent(0) ) { - std::cout << "Upper Tri Solve FAILURE" << std::endl; - } - else { - std::cout << "Upper Tri Solve SUCCESS!" << std::endl; +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if (std::is_same::value && std::is_same::value) + { + Kokkos::deep_copy(lhs, 0); + KernelHandle kh; + bool is_lower_tri = false; + kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, is_lower_tri); + + sptrsv_symbolic(&kh, row_map, entries, values); + Kokkos::fence(); + + sptrsv_solve( &kh, row_map, entries, values, rhs, lhs ); + Kokkos::fence(); + + scalar_t sum = 0.0; + Kokkos::parallel_reduce( Kokkos::RangePolicy(0, lhs.extent(0)), ReductionCheck(lhs), sum); + if ( sum != lhs.extent(0) ) { + std::cout << "Upper Tri Solve FAILURE" << std::endl; + kh.get_sptrsv_handle()->print_algorithm(); + } + EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); + + kh.destroy_sptrsv_handle(); } - EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) ); -*/ - - - kh.destroy_sptrsv_handle(); +#endif } }