diff --git a/example/buildlib/compileKokkosKernelsSimple.sh b/example/buildlib/compileKokkosKernelsSimple.sh
index b947e2b3e0..20d0a7aef4 100755
--- a/example/buildlib/compileKokkosKernelsSimple.sh
+++ b/example/buildlib/compileKokkosKernelsSimple.sh
@@ -4,10 +4,10 @@ KOKKOSKERNELS_LAYOUTS=LayoutLeft #the layout types to instantiate.
 KOKKOSKERNELS_ORDINALS=int #ordinal types to instantiate
 KOKKOSKERNELS_OFFSETS=int #offset types to instantiate
 KOKKOSKERNELS_PATH=../.. #path to kokkos-kernels top directory.
-CXX=icpc #${KOKKOS_PATH}/config/nvcc_wrapper #icpc #
+CXX=${KOKKOS_PATH}/bin/nvcc_wrapper #icpc #
 KOKKOSKERNELS_OPTIONS=eti-only #options for kokkoskernels  
-KOKKOS_DEVICES=OpenMP # other devices Cuda,Serial ..
-KOKKOS_ARCHS=KNL
+KOKKOS_DEVICES=Cuda # other devices Cuda,Serial ..
+KOKKOS_ARCHS=SKX,Volta70
 CXXFLAGS="-Wall -pedantic -Werror -O3 -g -Wshadow -Wsign-compare -Wtype-limits -Wuninitialized"
 
 ../../scripts/generate_makefile.bash --kokkoskernels-path=${KOKKOSKERNELS_PATH} --with-scalars=${KOKKOSKERNELS_SCALARS} --with-ordinals=${KOKKOSKERNELS_ORDINALS} --with-offsets=${KOKKOSKERNELS_OFFSETS} --kokkos-path=${KOKKOS_PATH} --with-devices=${KOKKOS_DEVICES} --arch=${KOKKOS_ARCHS} --compiler=${CXX} --with-options=${KOKKOSKERNELS_OPTIONS}  --cxxflags="${CXXFLAGS}"
diff --git a/perf_test/Makefile b/perf_test/Makefile
index b219dbc7cd..28e1aa0223 100644
--- a/perf_test/Makefile
+++ b/perf_test/Makefile
@@ -119,7 +119,7 @@ default: $(TEST_TARGETS)
 build: $(TEST_TARGETS)
 
 %.exe:%.o $(KOKKOS_LINK_DEPENDS) $(KOKKOSKERNELS_LINK_DEPENDS) $(TEST_HEADERS)
-	$(LINK) $(EXTRA_PATH) $< $(KOKKOSKERNELS_LDFLAGS) $(KOKKOSKERNELS_LIBS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o $@
+	$(LINK) $(EXTRA_PATH) $< $(KOKKOSKERNELS_LDFLAGS) $(KOKKOSKERNELS_LIBS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(LDLIB) -o $@
 
 %.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(KOKKOSKERNELS_CPP_DEPENDS) $(TEST_HEADERS)
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOSKERNELS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(INC) $(CXXFLAGS) $(EXTRA_INC) -I. -c $< -o $(notdir $@)
diff --git a/perf_test/sparse/KokkosSparse_spiluk.cpp b/perf_test/sparse/KokkosSparse_spiluk.cpp
index c817a1b1d8..5573d10cfb 100644
--- a/perf_test/sparse/KokkosSparse_spiluk.cpp
+++ b/perf_test/sparse/KokkosSparse_spiluk.cpp
@@ -90,7 +90,7 @@ using namespace KokkosKernels::Experimental;
 enum {DEFAULT, CUSPARSE, LVLSCHED_RP, LVLSCHED_TP1/*, LVLSCHED_TP2*/};
 
 template<typename Scalar>
-int test_spiluk_perf(std::vector<int> tests, std::string afilename, int k, int team_size, int vector_length, /*int idx_offset,*/ int loop) {
+int test_spiluk_perf(std::vector<int> tests, std::string afilename, int K, int team_size, int vector_length, /*int idx_offset,*/ int loop) {
 
   typedef Scalar scalar_t;
   typedef default_lno_t lno_t;
@@ -119,12 +119,12 @@ int test_spiluk_perf(std::vector<int> tests, std::string afilename, int k, int t
 
   std::cout << "\n\n" << std::endl;
   if (!afilename.empty()) {
-    std::cout << "ILU(k) Begin: Read matrix filename " << afilename << std::endl;
+    std::cout << "ILU(K) Begin: Read matrix filename " << afilename << std::endl;
     crsmat_t A    = KokkosKernels::Impl::read_kokkos_crst_matrix<crsmat_t>(afilename.c_str()); //in_matrix
     graph_t graph = A.graph; // in_graph
     const size_type nrows = graph.numRows();
     const int       nnz   = A.nnz();
-    const typename KernelHandle::const_nnz_lno_t fill_lev = lno_t(k) ;
+    const typename KernelHandle::const_nnz_lno_t fill_lev = lno_t(K) ;
 
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
     //cuSPARSE requires lno_t = size_type = int. For both, int is always used (if enabled)
diff --git a/perf_test/sparse/KokkosSparse_sptrsv.cpp b/perf_test/sparse/KokkosSparse_sptrsv.cpp
index 81057409c9..bae058d175 100644
--- a/perf_test/sparse/KokkosSparse_sptrsv.cpp
+++ b/perf_test/sparse/KokkosSparse_sptrsv.cpp
@@ -64,6 +64,8 @@
 #include "KokkosSparse_CrsMatrix.hpp"
 #include <KokkosKernels_IOUtils.hpp>
 
+//#define INTERNAL_CUSPARSE
+
 #if defined(KOKKOSKERNELS_INST_ORDINAL_INT)
   typedef int default_lno_t;
 #elif defined(KOKKOSKERNELS_INST_ORDINAL_INT64_T)
@@ -86,12 +88,51 @@ using namespace KokkosSparse::Experimental;
 using namespace KokkosKernels;
 using namespace KokkosKernels::Experimental;
 
-enum {DEFAULT, CUSPARSE, LVLSCHED_RP, LVLSCHED_TP1/*, LVLSCHED_TP2*/};
+//#define PRINTVIEWSSPTRSVPERF
+//#define PRINT_HLEVEL_FREQ_PLOT
+//#define PRINT_LEVEL_LIST
 
+enum {DEFAULT, CUSPARSE, LVLSCHED_RP, LVLSCHED_TP1, /*LVLSCHED_TP2,*/ LVLSCHED_TP1CHAIN, CUSPARSE_K};
 
+#ifdef PRINTVIEWSSPTRSVPERF
+template <class ViewType>
+void print_view1d(const ViewType dv) {
+  auto v = Kokkos::create_mirror_view(dv);
+  Kokkos::deep_copy(v,dv);
+  std::cout << "Output for view " << v.label() << std::endl;
+  for (size_t i = 0; i < v.extent(0); ++i) {
+    std::cout << "v(" << i << ") = " << v(i) << " , ";
+  }
+  std::cout << std::endl;
+}
+#else
+template <class ViewType>
+void print_view1d(const ViewType /*dv*/) {}
+#endif
+
+template <class RowMapType, class EntriesType>
+void check_entries_sorted(const RowMapType drow_map, const EntriesType dentries) {
+  auto row_map = Kokkos::create_mirror_view(drow_map);
+  Kokkos::deep_copy(row_map,drow_map);
+  auto entries = Kokkos::create_mirror_view(dentries);
+  Kokkos::deep_copy(entries,dentries);
+
+  for (size_t row = 0; row < row_map.extent(0)-1; ++row) {
+    size_t start = row_map(row);
+    size_t end = row_map(row+1);
+    for (size_t offset = start; offset < end-1; ++offset) {
+      size_t pcol = entries(offset);
+      size_t ncol = entries(offset+1);
+      if (pcol > ncol) {
+        std::cout << "  UNSORTED!!" << std::endl;
+      }
+    }
+  }
+
+}
 
 template<typename Scalar>
-int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string& ufilename, int team_size, int vector_length, int idx_offset, int loop) {
+int test_sptrsv_perf(std::vector<int> tests, const std::string& lfilename, const std::string& ufilename, const int team_size, const int vector_length, const int idx_offset, const int loop, const int chain_threshold = 0, const float dense_row_percent = -1.0) {
 
   typedef Scalar scalar_t;
   typedef default_lno_t lno_t;
@@ -107,8 +148,8 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
   typedef KokkosKernels::Experimental::KokkosKernelsHandle <size_type, lno_t, scalar_t,
     execution_space, memory_space, memory_space > KernelHandle;
 
-  scalar_t ZERO = scalar_t(0);
-  scalar_t ONE = scalar_t(1);
+  const scalar_t ZERO = scalar_t(0);
+  const scalar_t ONE  = scalar_t(1);
 
 
 // Read lmtx
@@ -144,9 +185,24 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     auto entries = graph.entries;
     auto values  = triMtx.values;
 
+    std::cout << "Lower Perf: row_map.extent(0) = " << row_map.extent(0) << std::endl;
+    std::cout << "Lower Perf: entries.extent(0) = " << entries.extent(0) << std::endl;
+    std::cout << "Lower Perf: values.extent(0) = " << values.extent(0) << std::endl;
 
+    std::cout << "Lower Perf: lhs.extent(0) = " << lhs.extent(0) << std::endl;
+    std::cout << "Lower Perf: rhs.extent(0) = " << rhs.extent(0) << std::endl;
 
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    check_entries_sorted(row_map, entries);
+
+#ifdef PRINTVIEWSSPTRSVPERF
+    print_view1d(row_map);
+    print_view1d(entries);
+    print_view1d(values);
+    print_view1d(known_lhs);
+    print_view1d(rhs);
+#endif
+
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
   //std::cout << "  cusparse: create handle" << std::endl;
   cusparseStatus_t status;
   cusparseHandle_t handle = 0;
@@ -198,7 +254,7 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     KernelHandle kh;
     bool is_lower_tri = true;
 
-    std::cout << "Create handle" << std::endl;
+    std::cout << "Create handle (lower)" << std::endl;
     switch(test) {
       case LVLSCHED_RP:
         kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, is_lower_tri);
@@ -206,22 +262,44 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
         break;
       case LVLSCHED_TP1:
         kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
+        std::cout << "TP1 set team_size = " << team_size << std::endl;
+        if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size);
+        kh.get_sptrsv_handle()->print_algorithm();
+        break;
+      case LVLSCHED_TP1CHAIN:
+        printf("TP1 with CHAIN\n");
+        printf("chain_threshold %d\n", chain_threshold);
+        printf("team_size %d\n", team_size);
+        kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, is_lower_tri);
+        kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold);
+        if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size);
+        if (vector_length != -1) kh.get_sptrsv_handle()->set_vector_size(vector_length);
         kh.get_sptrsv_handle()->print_algorithm();
         break;
 /*
       case LVLSCHED_TP2:
         kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHED_TP2, nrows, is_lower_tri);
+        if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size);
+        if (vector_length != -1) kh.get_sptrsv_handle()->set_vector_size(vector_length);
         kh.get_sptrsv_handle()->print_algorithm();
         break;
 */
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+      case CUSPARSE_K:
+        printf("CUSPARSE WRAPPER\n");
+        kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, is_lower_tri);
+        kh.get_sptrsv_handle()->print_algorithm();
+        break;
       case CUSPARSE:
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
         std::cout << "CUSPARSE: No kk interface added yet" << std::endl;
         //cusparse_matvec(A, x, y, rows_per_thread, team_size, vector_length);
         break;
+#else
+        std::cout << "CUSPARSE not enabled: Fall through to defaults" << std::endl;
 #endif
       default:
         kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
+        if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size);
         kh.get_sptrsv_handle()->print_algorithm();
     }
 
@@ -230,7 +308,14 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     Kokkos::Timer timer;
     if (test != CUSPARSE) {
     timer.reset();
-    sptrsv_symbolic( &kh, row_map, entries );
+    if (test == CUSPARSE_K) {
+      printf("cusparsek symbolic\n");
+      sptrsv_symbolic( &kh, row_map, entries, values );
+      printf("  finished cusparsek symbolic\n");
+    }
+    else {
+      sptrsv_symbolic( &kh, row_map, entries );
+    }
     std::cout << "LTRI Symbolic Time: " << timer.seconds() << std::endl;
 
     //std::cout << "TriSolve Solve" << std::endl;
@@ -240,16 +325,9 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     std::cout << "LTRI Solve Time: " << timer.seconds() << std::endl;
   
     }
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
 // step 4: perform analysis
     else {
-#if 0
-      double *dvalues = (double *)(values.data());
-      int *drow_map = (int *)(row_map.data());
-      int *dentries = (int *)(entries.data());
-      double *dlhs = (double *)(lhs.data());
-      double *drhs = (double *)(rhs.data());
-#endif
       //int nnz = triMtx.nnz();
       //std::cout << "  cusparse path: analysis" << std::endl;
       //status = cusparseDcsrsv2_analysis(handle, trans, nrows, nnz, descr, (double*)dvalues, (int *)drow_map, (int *)dentries, info, policy, pBuffer);
@@ -283,13 +361,32 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     }
 #endif
     // Error Check
-    scalar_t sum = 0.0;
     Kokkos::fence();
+    {
+    scalar_t sum = 0.0;
     Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, lhs.extent(0)), 
       KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) {
-        tsum += lhs(i);
+        tsum += (known_lhs(i) - lhs(i))*(known_lhs(i) - lhs(i));
       }, sum);
   
+    scalar_t norm_ssd = sqrt(sum / lhs.extent(0));
+    std::cout << "  ssd = " << sum << "  norm_sqrt_ssd = " << norm_ssd << std::endl;
+
+    if ( norm_ssd > 1e-8 ) {
+      std::cout << "Lower Tri Solve FAILURE: norm_ssd = " << norm_ssd << std::endl;
+      return 1;
+    }
+    else {
+     std::cout << "\nLower Tri Solve Init Test: SUCCESS!\n" << std::endl;
+    }
+
+      /*
+    sum = 0.0;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, lhs.extent(0)), 
+      KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) {
+        tsum += lhs(i);
+      }, sum);
+
     if ( sum != lhs.extent(0) ) {
       std::cout << "Lower Tri Solve FAILURE: sum = " << sum << std::endl;
       auto hsoln = Kokkos::create_mirror_view(lhs);
@@ -300,7 +397,9 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
       return 1;
     }
     else {
-     std::cout << "Lower Tri Solve SUCCESS!" << std::endl;
+     std::cout << "\nLower Tri Solve Init Test: SUCCESS!\n" << std::endl;
+    }
+      */
     }
 
   
@@ -310,13 +409,35 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     double max_time = 0.0;
     double ave_time = 0.0;
 
-    for(int i=0;i<loop;i++) {
+    for(int titer=0;titer<loop;titer++) {
       timer.reset();
   
     if (test != CUSPARSE) {
+    #ifdef CHECKALLRUNRESULTS
+      Kokkos::deep_copy(lhs,0,0);
+    #endif
       sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+    #ifdef CHECKALLRUNRESULTS
+        {
+        scalar_t sum = 0.0;
+        Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, lhs.extent(0)), 
+          KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) {
+            tsum += (known_lhs(i) - lhs(i))*(known_lhs(i) - lhs(i));
+          }, sum);
+  
+        scalar_t norm_ssd = sqrt(sum / lhs.extent(0));
+        std::cout << "  ssd = " << sum << "  norm_sqrt_ssd = " << norm_ssd << std::endl;
+        if ( norm_ssd > 1e-8 ) {
+          std::cout << "Lower Tri Solve FAILURE: norm_ssd = " << norm_ssd << std::endl;
+          return 1;
+        }
+        else {
+         std::cout << "\nLower Tri Solve Init Test: SUCCESS!\n" << std::endl;
+        }
+        }
+    #endif
     }
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
     else {
       cusparseDcsrsv2_solve(handle, trans, nrows, triMtx.nnz(), &alpha, descr, values.data(), row_map.data(), entries.data(), info, rhs.data(), lhs.data(), policy, pBuffer);
     }
@@ -328,19 +449,92 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
       if(time>max_time) max_time = time;
       if(time<min_time) min_time = time;
     }
+
     std::cout << "LOOP_AVG_TIME:  " << ave_time/loop << std::endl;
     std::cout << "LOOP_MAX_TIME:  " << max_time << std::endl;
     std::cout << "LOOP_MIN_TIME:  " << min_time << std::endl;
 
+    // Output for level frequency plot
+    #ifdef PRINT_HLEVEL_FREQ_PLOT
+    if (test != CUSPARSE)
+    {
+    auto hnpl = kh.get_sptrsv_handle()->get_host_nodes_per_level();
+    auto nlevels = kh.get_sptrsv_handle()->get_num_levels();
+    std::string algmstring = kh.get_sptrsv_handle()->return_algorithm_string();
+    std::cout << algmstring << std::endl;
+    // Create filename
+    std::string filename = "lower_nodes_per_level_" + algmstring + ".txt";
+    std::cout << filename << std::endl;
+    std::cout << "  nlevels = " << nlevels << std::endl;
+    std::ofstream outfile;
+    outfile.open(filename);
+    if (outfile.is_open()) {
+      for ( int i = 0; i < nlevels; ++i ) {
+        outfile << hnpl(i) << std::endl;
+        //std::cout  << hnpl(i) << std::endl;
+      }
+      outfile.close();
+    }
+    else {
+      std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl;
+    }
+
+    auto hngpl = kh.get_sptrsv_handle()->get_host_nodes_grouped_by_level();
+    filename = "lower_nodes_groupby_level_" + algmstring + ".txt";
+    std::cout << filename << std::endl;
+    outfile.open(filename);
+    if (outfile.is_open()) {
+      for ( size_t i = 0; i < hngpl.extent(0); ++i )
+        outfile << hngpl(i) << std::endl;
+      outfile.close();
+    }
+    else {
+      std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl;
+    }
+
+    }
+    #endif
+
+    #ifdef PRINT_LEVEL_LIST
+    if (test != CUSPARSE)
+    {
+    auto level_list = kh.get_sptrsv_handle()->get_level_list();
+    auto hlevel_list = Kokkos::create_mirror_view(level_list);
+    Kokkos::deep_copy(hlevel_list, level_list);
+
+    auto nlevels = kh.get_sptrsv_handle()->get_num_levels();
+
+    std::string algmstring = kh.get_sptrsv_handle()->return_algorithm_string();
+    std::cout << algmstring << std::endl;
+    // Create filename
+    std::string filename = "lower_level_list_" + algmstring + ".txt";
+    std::cout << filename << std::endl;
+    std::cout << "  nlevels = " << nlevels << "  nodes = " << hlevel_list.extent(0) << std::endl;
+    std::ofstream outfile;
+    outfile.open(filename);
+    if (outfile.is_open()) {
+      for ( size_t i = 0; i < hlevel_list.extent(0); ++i )
+        outfile << hlevel_list(i) << std::endl;
+      outfile.close();
+    }
+    else {
+      std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl;
+    }
+    }
+    #endif
+
+    kh.destroy_sptrsv_handle();
   }
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
 // step 6: free resources
     cudaFree(pBuffer);
     cusparseDestroyCsrsv2Info(info);
     cusparseDestroyMatDescr(descr);
     cusparseDestroy(handle);
 #endif
-  }
+  } // end lowertri
+  Kokkos::fence();
 
   std::cout << "\n\n" << std::endl;
 // UPPERTRI
@@ -369,7 +563,24 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     auto entries = graph.entries;
     auto values  = triMtx.values;
 
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    std::cout << "Upper Perf: row_map.extent(0) = " << row_map.extent(0) << std::endl;
+    std::cout << "Upper Perf: entries.extent(0) = " << entries.extent(0) << std::endl;
+    std::cout << "Upper Perf: values.extent(0) = " << values.extent(0) << std::endl;
+
+    std::cout << "Upper Perf: lhs.extent(0) = " << lhs.extent(0) << std::endl;
+    std::cout << "Upper Perf: rhs.extent(0) = " << rhs.extent(0) << std::endl;
+
+    check_entries_sorted(row_map, entries);
+
+#ifdef PRINTVIEWSSPTRSVPERF
+    print_view1d(row_map);
+    print_view1d(entries);
+    print_view1d(values);
+    print_view1d(known_lhs);
+    print_view1d(rhs);
+#endif
+
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
   //std::cout << "  cusparse: create handle" << std::endl;
   cusparseStatus_t status;
   cusparseHandle_t handle = 0;
@@ -421,7 +632,7 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     KernelHandle kh;
     bool is_lower_tri = false;
 
-    std::cout << "Create handle" << std::endl;
+    std::cout << "Create handle (upper)" << std::endl;
     switch(test) {
       case LVLSCHED_RP:
         kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_RP, nrows, is_lower_tri);
@@ -429,22 +640,39 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
         break;
       case LVLSCHED_TP1:
         kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
+        std::cout << "TP1 set team_size = " << team_size << std::endl;
+        if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size);
+        kh.get_sptrsv_handle()->print_algorithm();
+        break;
+      case LVLSCHED_TP1CHAIN:
+        printf("TP1 with CHAIN\n");
+        printf("chain_threshold %d\n", chain_threshold);
+        printf("team_size %d\n", team_size);
+        kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, is_lower_tri);
+        kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold);
+        if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size);
+        if (vector_length != -1) kh.get_sptrsv_handle()->set_vector_size(vector_length);
         kh.get_sptrsv_handle()->print_algorithm();
         break;
 /*
       case LVLSCHED_TP2:
         kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHED_TP2, nrows, is_lower_tri);
+        if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size);
+        if (vector_length != -1) kh.get_sptrsv_handle()->set_vector_size(vector_length);
         kh.get_sptrsv_handle()->print_algorithm();
         break;
 */
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
       case CUSPARSE:
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
         std::cout << "CUSPARSE: No kk interface added yet" << std::endl;
         //cusparse_matvec(A, x, y, rows_per_thread, team_size, vector_length);
         break;
+#else
+        std::cout << "CUSPARSE not enabled: Fall through to defaults" << std::endl;
 #endif
       default:
         kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
+        if (team_size != -1) kh.get_sptrsv_handle()->set_team_size(team_size);
         kh.get_sptrsv_handle()->print_algorithm();
     }
 
@@ -463,16 +691,9 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     std::cout << "UTRI Solve Time: " << timer.seconds() << std::endl;
   
     }
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
 // step 4: perform analysis
     else {
-#if 0
-      double *dvalues = (double *)(values.data());
-      int *drow_map = (int *)(row_map.data());
-      int *dentries = (int *)(entries.data());
-      double *dlhs = (double *)(lhs.data());
-      double *drhs = (double *)(rhs.data());
-#endif
       //int nnz = triMtx.nnz();
       //std::cout << "  cusparse path: analysis" << std::endl;
       //status = cusparseDcsrsv2_analysis(handle, trans, nrows, nnz, descr, (double*)dvalues, (int *)drow_map, (int *)dentries, info, policy, pBuffer);
@@ -506,7 +727,26 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
 #endif
     // Error Check
     Kokkos::fence();
+    {
     scalar_t sum = 0.0;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, lhs.extent(0)), 
+      KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) {
+        tsum += (known_lhs(i) - lhs(i))*(known_lhs(i) - lhs(i));
+      }, sum);
+  
+    scalar_t norm_ssd = sqrt(sum / lhs.extent(0));
+    std::cout << "  ssd = " << sum << "  norm_sqrt_ssd = " << norm_ssd << std::endl;
+
+    if ( norm_ssd > 1e-8 ) {
+      std::cout << "Upper Tri Solve FAILURE: norm_ssd = " << norm_ssd << std::endl;
+      return 1;
+    }
+    else {
+     std::cout << "\nUpper Tri Solve Init Test: SUCCESS!\n" << std::endl;
+    }
+
+      /*
+    sum = 0.0;
     Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, lhs.extent(0)), 
       KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) {
         tsum += lhs(i);
@@ -522,7 +762,9 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
       return 1;
     }
     else {
-     std::cout << "Upper Tri Solve SUCCESS!" << std::endl;
+     std::cout << "\nUpper Tri Solve Init Test: SUCCESS!\n" << std::endl;
+    }
+      */
     }
   
     // Benchmark
@@ -531,13 +773,35 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     double max_time = 0.0;
     double ave_time = 0.0;
 
-    for(int i=0;i<loop;i++) {
+    for(int titer=0;titer<loop;titer++) {
       timer.reset();
   
     if (test != CUSPARSE) {
+    #ifdef CHECKALLRUNRESULTS
+      Kokkos::deep_copy(lhs,0,0);
+    #endif
       sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+    #ifdef CHECKALLRUNRESULTS
+        {
+        scalar_t sum = 0.0;
+        Kokkos::parallel_reduce( Kokkos::RangePolicy<execution_space>(0, lhs.extent(0)), 
+          KOKKOS_LAMBDA ( const lno_t i, scalar_t &tsum ) {
+            tsum += (known_lhs(i) - lhs(i))*(known_lhs(i) - lhs(i));
+          }, sum);
+  
+        scalar_t norm_ssd = sqrt(sum / lhs.extent(0));
+        std::cout << "  ssd = " << sum << "  norm_sqrt_ssd = " << norm_ssd << std::endl;
+        if ( norm_ssd > 1e-8 ) {
+          std::cout << "Upper Tri Solve FAILURE: norm_ssd = " << norm_ssd << std::endl;
+          return 1;
+        }
+        else {
+         std::cout << "\nUpper Tri Solve Init Test: SUCCESS!\n" << std::endl;
+        }
+        }
+    #endif
     }
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
     else {
       cusparseDcsrsv2_solve(handle, trans, nrows, triMtx.nnz(), &alpha, descr, values.data(), row_map.data(), entries.data(), info, rhs.data(), lhs.data(), policy, pBuffer);
     }
@@ -553,15 +817,88 @@ int test_sptrsv_perf(std::vector<int> tests, std::string& lfilename, std::string
     std::cout << "LOOP_AVG_TIME:  " << ave_time/loop << std::endl;
     std::cout << "LOOP_MAX_TIME:  " << max_time << std::endl;
     std::cout << "LOOP_MIN_TIME:  " << min_time << std::endl;
+
+    // Output for level frequency plot
+    #ifdef PRINT_HLEVEL_FREQ_PLOT
+    if (test != CUSPARSE)
+    {
+    auto hnpl = kh.get_sptrsv_handle()->get_host_nodes_per_level();
+    auto nlevels = kh.get_sptrsv_handle()->get_num_levels();
+    std::string algmstring = kh.get_sptrsv_handle()->return_algorithm_string();
+    std::cout << algmstring << std::endl;
+    // Create filename
+    std::string filename = "upper_nodes_per_level_" + algmstring + ".txt";
+    std::cout << filename << std::endl;
+    std::cout << "  nlevels = " << nlevels << std::endl;
+    std::ofstream outfile;
+    outfile.open(filename);
+    if (outfile.is_open()) {
+      for ( int i = 0; i < nlevels; ++i ) {
+        outfile << hnpl(i) << std::endl;
+        //std::cout  << hnpl(i) << std::endl;
+      }
+      outfile.close();
+    }
+    else {
+      std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl;
+    }
+
+    auto hngpl = kh.get_sptrsv_handle()->get_host_nodes_grouped_by_level();
+    filename = "upper_nodes_groupby_level_" + algmstring + ".txt";
+    std::cout << filename << std::endl;
+    outfile.open(filename);
+    if (outfile.is_open()) {
+      for ( size_t i = 0; i < hngpl.extent(0); ++i )
+        outfile << hngpl(i) << std::endl;
+      outfile.close();
+    }
+    else {
+      std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl;
+    }
+
+    }
+    #endif
+
+    #ifdef PRINT_LEVEL_LIST
+    if (test != CUSPARSE)
+    {
+    auto level_list = kh.get_sptrsv_handle()->get_level_list();
+    auto hlevel_list = Kokkos::create_mirror_view(level_list);
+    Kokkos::deep_copy(hlevel_list, level_list);
+
+    auto nlevels = kh.get_sptrsv_handle()->get_num_levels();
+
+    std::string algmstring = kh.get_sptrsv_handle()->return_algorithm_string();
+    std::cout << algmstring << std::endl;
+    // Create filename
+    std::string filename = "upper_level_list_" + algmstring + ".txt";
+    std::cout << filename << std::endl;
+    std::cout << "  nlevels = " << nlevels << "  nodes = " << hlevel_list.extent(0) << std::endl;
+    std::ofstream outfile;
+    outfile.open(filename);
+    if (outfile.is_open()) {
+      for ( size_t i = 0; i < hlevel_list.extent(0); ++i )
+        outfile << hlevel_list(i) << std::endl;
+      outfile.close();
+    }
+    else {
+      std::cout << "OUTFILE DID NOT OPEN!!!" << std::endl;
+    }
+    }
+    #endif
+
+    kh.destroy_sptrsv_handle();
   }
-#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+
+#if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined (INTERNAL_CUSPARSE)
 // step 6: free resources
     cudaFree(pBuffer);
     cusparseDestroyCsrsv2Info(info);
     cusparseDestroyMatDescr(descr);
     cusparseDestroy(handle);
 #endif
-  }
+  } // end uppertri
+  Kokkos::fence();
 
   return 0;
 }
@@ -571,21 +908,23 @@ void print_help_sptrsv() {
   printf("Options:\n");
   printf("  --test [OPTION] : Use different kernel implementations\n");
   printf("                    Options:\n");
-  printf("                      lvlrp, lvltp1, lvltp2\n\n");
+  printf("                      lvlrp, lvltp1, lvltp2, lvltp1chain, lvldensetp1, lvldensetp2\n\n");
   printf("                      cusparse           (Vendor Libraries)\n\n");
-  printf("  -lf [file]       : Read in Matrix Market formatted text file 'file'.\n");
-  printf("  -uf [file]       : Read in Matrix Market formatted text file 'file'.\n");
+  printf("  -lf [file]      : Read in Matrix Market formatted text file 'file'.\n");
+  printf("  -uf [file]      : Read in Matrix Market formatted text file 'file'.\n");
+  printf("  --offset [O]    : Subtract O from every index.\n");
+  printf("                    Useful in case the matrix market file is not 0 based.\n\n");
+  printf("  -ts [T]         : Number of threads per team.\n");
+  printf("  -vl [V]         : Vector-length (i.e. how many Cuda threads are a Kokkos 'thread').\n");
+  printf("  -ct [V]         : Chain threshold: Only has effect of lvltp1chain algorithm.\n");
+  printf("  -dr [V]         : Dense row percent (as float): Only has effect of lvldensetp1 algorithm.\n");
+  printf("  --loop [LOOP]   : How many spmv to run to aggregate average time. \n");
+//  printf("  --write-lvl-freq: Write output files with number of nodes per level for each matrix and algorithm.\n");
 //  printf("  -s [N]          : generate a semi-random banded (band size 0.01xN) NxN matrix\n");
 //  printf("                    with average of 10 entries per row.\n");
 //  printf("  --schedule [SCH]: Set schedule for kk variant (static,dynamic,auto [ default ]).\n");
 //  printf("  -fb [file]      : Read in binary Matrix files 'file'.\n");
 //  printf("  --write-binary  : In combination with -f, generate binary files.\n");
-  printf("  --offset [O]    : Subtract O from every index.\n");
-  printf("                    Useful in case the matrix market file is not 0 based.\n\n");
-  printf("  -rpt [K]        : Number of Rows assigned to a thread.\n");
-  printf("  -ts [T]         : Number of threads per team.\n");
-  printf("  -vl [V]         : Vector-length (i.e. how many Cuda threads are a Kokkos 'thread').\n");
-  printf("  --loop [LOOP]       : How many spmv to run to aggregate average time. \n");
 }
 
 
@@ -600,6 +939,8 @@ int main(int argc, char **argv)
  int team_size = -1;
  int idx_offset = 0;
  int loop = 1;
+ int chain_threshold = 0;
+ float dense_row_percent = -1.0;
 // int schedule=AUTO;
 
  if(argc == 1) {
@@ -617,20 +958,29 @@ int main(int argc, char **argv)
     if((strcmp(argv[i],"lvltp1")==0)) {
       tests.push_back( LVLSCHED_TP1 );
     }
-/*
+    if((strcmp(argv[i],"lvltp1chain")==0)) {
+      tests.push_back( LVLSCHED_TP1CHAIN );
+    }
+    /*
     if((strcmp(argv[i],"lvltp2")==0)) {
       tests.push_back( LVLSCHED_TP2 );
     }
-*/
+    */
     if((strcmp(argv[i],"cusparse")==0)) {
       tests.push_back( CUSPARSE );
     }
+    if((strcmp(argv[i],"cusparsek")==0)) {
+      tests.push_back( CUSPARSE_K );
+    }
     continue;
   }
   if((strcmp(argv[i],"-lf")==0)) {lfilename = argv[++i]; continue;}
   if((strcmp(argv[i],"-uf")==0)) {ufilename = argv[++i]; continue;}
   if((strcmp(argv[i],"-ts")==0)) {team_size=atoi(argv[++i]); continue;}
   if((strcmp(argv[i],"-vl")==0)) {vector_length=atoi(argv[++i]); continue;}
+  if((strcmp(argv[i],"-ct")==0)) {chain_threshold=atoi(argv[++i]); continue;}
+  if((strcmp(argv[i],"-dr")==0)) {dense_row_percent=atof(argv[++i]); continue;}
+  if((strcmp(argv[i],"-l")==0)) {loop=atoi(argv[++i]); continue;}
   if((strcmp(argv[i],"--offset")==0)) {idx_offset=atoi(argv[++i]); continue;}
   if((strcmp(argv[i],"--loop")==0)) {loop=atoi(argv[++i]); continue;}
 /*
@@ -663,7 +1013,7 @@ int main(int argc, char **argv)
 
  Kokkos::initialize(argc,argv);
  {
-   int total_errors = test_sptrsv_perf<double>(tests,lfilename,ufilename,team_size,vector_length,idx_offset,loop);
+   int total_errors = test_sptrsv_perf<double>(tests, lfilename, ufilename, team_size, vector_length, idx_offset, loop, chain_threshold, dense_row_percent);
 
    if(total_errors == 0)
    printf("Kokkos::SPTRSV Test: Passed\n");
@@ -677,6 +1027,7 @@ int main(int argc, char **argv)
 }
 #else
 int main() {
+  std::cout << "KokkosSparse_sptrsv: This perf_test will do nothing when Cuda is enabled without lambda support." << std::endl;
   return 0;
 }
 #endif
diff --git a/scripts/generate_makefile.bash b/scripts/generate_makefile.bash
index 20be08abfb..c6937ddf40 100755
--- a/scripts/generate_makefile.bash
+++ b/scripts/generate_makefile.bash
@@ -81,6 +81,9 @@ do
     --ldflags*)
       LDFLAGS="${key#*=}"
       ;;
+    --ldlib*)
+      LDLIB="${key#*=}"
+      ;;
     --debug|-dbg)
       KOKKOS_DEBUG=yes
       ;;
@@ -296,6 +299,11 @@ if [ ${#LDFLAGS} -gt 0 ]; then
   KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDFLAGS=\"${LDFLAGS}\""
 fi
 
+if [ ${#LDLIB} -gt 0 ]; then
+  echo "LDLIB: $LDLIB"
+  KOKKOS_SETTINGS="${KOKKOS_SETTINGS} LDLIB=\"${LDLIB}\""
+fi
+
 if [ ${#GTEST_PATH} -gt 0 ]; then
   KOKKOS_SETTINGS="${KOKKOS_SETTINGS} GTEST_PATH=${GTEST_PATH}"
 else
diff --git a/src/common/KokkosKernels_Handle.hpp b/src/common/KokkosKernels_Handle.hpp
index 7caf38944b..a33fa00164 100644
--- a/src/common/KokkosKernels_Handle.hpp
+++ b/src/common/KokkosKernels_Handle.hpp
@@ -558,7 +558,7 @@ class KokkosKernelsHandle
     this->destroy_sptrsv_handle();
     this->is_owner_of_the_sptrsv_handle = true;
     this->sptrsvHandle = new SPTRSVHandleType(algm, nrows, lower_tri);
-    this->sptrsvHandle->reset_handle(nrows);
+//    this->sptrsvHandle->init_handle(nrows);
     this->sptrsvHandle->set_team_size(this->team_work_size);
     this->sptrsvHandle->set_vector_size(this->vector_size);
   }
diff --git a/src/sparse/KokkosSparse_spgemm_handle.hpp b/src/sparse/KokkosSparse_spgemm_handle.hpp
index d45cc7d214..a20624f185 100644
--- a/src/sparse/KokkosSparse_spgemm_handle.hpp
+++ b/src/sparse/KokkosSparse_spgemm_handle.hpp
@@ -126,8 +126,6 @@ class SPGEMMHandle{
   typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type
 
 
-
-
 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
   struct cuSparseHandleType{
     cusparseHandle_t handle;
diff --git a/src/sparse/KokkosSparse_sptrsv.hpp b/src/sparse/KokkosSparse_sptrsv.hpp
index 94a86f2f8c..2b2ef260ab 100644
--- a/src/sparse/KokkosSparse_sptrsv.hpp
+++ b/src/sparse/KokkosSparse_sptrsv.hpp
@@ -58,6 +58,8 @@
 #include "KokkosSparse_sptrsv_symbolic_spec.hpp"
 #include "KokkosSparse_sptrsv_solve_spec.hpp"
 
+#include "KokkosSparse_sptrsv_cuSPARSE_impl.hpp"
+
 namespace KokkosSparse {
 namespace Experimental {
 
@@ -112,6 +114,82 @@ namespace Experimental {
 
   } // sptrsv_symbolic
 
+  template <typename KernelHandle,
+            typename lno_row_view_t_,
+            typename lno_nnz_view_t_,
+            typename scalar_nnz_view_t_>
+  void sptrsv_symbolic(
+      KernelHandle *handle, 
+      lno_row_view_t_ rowmap,
+      lno_nnz_view_t_ entries,
+      scalar_nnz_view_t_ values)
+  {
+    typedef typename KernelHandle::size_type size_type;
+    typedef typename KernelHandle::nnz_lno_t ordinal_type;
+    typedef typename KernelHandle::nnz_scalar_t scalar_type;
+
+    static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename lno_row_view_t_::non_const_value_type, size_type),
+        "sptrsv_symbolic: A size_type must match KernelHandle size_type (const doesn't matter)");
+
+    static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename lno_nnz_view_t_::non_const_value_type, ordinal_type),
+        "sptrsv_symbolic: A entry type must match KernelHandle entry type (aka nnz_lno_t, and const doesn't matter)");
+
+    static_assert(KOKKOSKERNELS_SPTRSV_SAME_TYPE(typename scalar_nnz_view_t_::value_type, scalar_type),
+        "sptrsv_symbolic: A scalar type must match KernelHandle entry type (aka nnz_lno_t, and const doesn't matter)");
+
+    typedef typename KernelHandle::const_size_type c_size_t;
+    typedef typename KernelHandle::const_nnz_lno_t c_lno_t;
+    typedef typename KernelHandle::const_nnz_scalar_t c_scalar_t;
+
+    typedef typename KernelHandle::HandleExecSpace c_exec_t;
+    typedef typename KernelHandle::HandleTempMemorySpace c_temp_t;
+    typedef typename KernelHandle::HandlePersistentMemorySpace c_persist_t;
+
+    typedef typename  KokkosKernels::Experimental::KokkosKernelsHandle<c_size_t, c_lno_t, c_scalar_t, c_exec_t, c_temp_t, c_persist_t> const_handle_type;
+    const_handle_type tmp_handle (*handle);
+
+    typedef Kokkos::View<
+          typename lno_row_view_t_::const_value_type*,
+          typename KokkosKernels::Impl::GetUnifiedLayout<lno_row_view_t_>::array_layout,
+          typename lno_row_view_t_::device_type,
+          Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess> > RowMap_Internal;
+
+    typedef Kokkos::View<
+          typename lno_nnz_view_t_::const_value_type*,
+          typename KokkosKernels::Impl::GetUnifiedLayout<lno_nnz_view_t_>::array_layout,
+          typename lno_nnz_view_t_::device_type,
+          Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess> > Entries_Internal;
+
+    typedef Kokkos::View<
+          typename scalar_nnz_view_t_::const_value_type*,
+          typename KokkosKernels::Impl::GetUnifiedLayout<scalar_nnz_view_t_>::array_layout,
+          typename scalar_nnz_view_t_::device_type,
+          Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess> > Values_Internal;
+
+    auto sptrsv_handle = handle->get_sptrsv_handle();
+    if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) {
+      RowMap_Internal rowmap_i = rowmap;
+      Entries_Internal entries_i = entries;
+      Values_Internal values_i = values;
+
+
+      typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType;
+      sptrsvHandleType *sh = handle->get_sptrsv_handle();
+      auto nrows = sh->get_nrows();
+
+      KokkosSparse::Impl::sptrsvcuSPARSE_symbolic
+      < sptrsvHandleType,
+        RowMap_Internal,
+        Entries_Internal,
+        Values_Internal >
+        (sh, nrows, rowmap_i, entries_i, values_i, false);
+
+    }
+    else {
+      KokkosSparse::Experimental::sptrsv_symbolic (handle, rowmap, entries);
+    }
+
+  } // sptrsv_symbolic
 
 
   template <typename KernelHandle,
@@ -210,7 +288,25 @@ namespace Experimental {
     BType_Internal b_i = b;
     XType_Internal x_i = x;
 
-    KokkosSparse::Impl::SPTRSV_SOLVE<const_handle_type, RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, XType_Internal>::sptrsv_solve (&tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i);
+    auto sptrsv_handle = handle->get_sptrsv_handle();
+    if (sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SPTRSV_CUSPARSE) {
+      typedef typename KernelHandle::SPTRSVHandleType sptrsvHandleType;
+      sptrsvHandleType *sh = handle->get_sptrsv_handle();
+      auto nrows = sh->get_nrows();
+
+      KokkosSparse::Impl::sptrsvcuSPARSE_solve
+      < sptrsvHandleType,
+        RowMap_Internal,
+        Entries_Internal,
+        Values_Internal,
+        BType_Internal,
+        XType_Internal >
+        (sh, nrows, rowmap_i, entries_i, values_i, b_i, x_i, false);
+
+    }
+    else {
+      KokkosSparse::Impl::SPTRSV_SOLVE<const_handle_type, RowMap_Internal, Entries_Internal, Values_Internal, BType_Internal, XType_Internal>::sptrsv_solve (&tmp_handle, rowmap_i, entries_i, values_i, b_i, x_i);
+    }
 
   } // sptrsv_solve
 
diff --git a/src/sparse/KokkosSparse_sptrsv_handle.hpp b/src/sparse/KokkosSparse_sptrsv_handle.hpp
index 005ad8d828..d92456a41e 100644
--- a/src/sparse/KokkosSparse_sptrsv_handle.hpp
+++ b/src/sparse/KokkosSparse_sptrsv_handle.hpp
@@ -46,14 +46,22 @@
 #include <iostream>
 #include <string>
 
-#ifndef _SPTRSVHANDLE_HPP
-#define _SPTRSVHANDLE_HPP
+#ifndef KOKKOSSPARSE_SPTRSVHANDLE_HPP
+#define KOKKOSSPARSE_SPTRSVHANDLE_HPP
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#include "cusparse.h"
+#endif
+
+#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH)
+#define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+#endif
 
 namespace KokkosSparse {
 namespace Experimental {
 
-// TP2 algorithm has issues with some offset-ordinal combo to be addressed
-enum class SPTRSVAlgorithm { SEQLVLSCHD_RP, SEQLVLSCHD_TP1/*, SEQLVLSCHED_TP2*/ };
+// TODO TP2 algorithm had issues with some offset-ordinal combo to be addressed when compiled in Trilinos...
+enum class SPTRSVAlgorithm { SEQLVLSCHD_RP, SEQLVLSCHD_TP1, /*SEQLVLSCHED_TP2,*/ SEQLVLSCHD_TP1CHAIN, SPTRSV_CUSPARSE };
 
 template <class size_type_, class lno_t_, class scalar_t_,
           class ExecutionSpace,
@@ -76,162 +84,451 @@ class SPTRSVHandle {
   typedef typename std::remove_const<lno_t_>::type  nnz_lno_t;
   typedef const nnz_lno_t const_nnz_lno_t;
 
-  typedef typename std::remove_const<scalar_t_>::type  nnz_scalar_t;
-  typedef const nnz_scalar_t const_nnz_scalar_t;
+  typedef typename std::remove_const<scalar_t_>::type  scalar_t;
+  typedef const scalar_t const_nnz_scalar_t;
 
 
+  // row_map type (managed memory)
   typedef typename Kokkos::View<size_type *, HandleTempMemorySpace> nnz_row_view_temp_t;
   typedef typename Kokkos::View<size_type *, HandlePersistentMemorySpace> nnz_row_view_t;
+  typedef typename nnz_row_view_t::HostMirror host_nnz_row_view_t;
  // typedef typename row_lno_persistent_work_view_t::HostMirror row_lno_persistent_work_host_view_t; //Host view type
+  typedef typename Kokkos::View<const size_type *, HandlePersistentMemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess>> nnz_row_unmanaged_view_t; // for rank1 subviews
 
-  typedef typename Kokkos::View<nnz_scalar_t *, HandleTempMemorySpace> nnz_scalar_view_temp_t;
-  typedef typename Kokkos::View<nnz_scalar_t *, HandlePersistentMemorySpace> nnz_scalar_view_t;
-
+  // values type (managed memory)
+  typedef typename Kokkos::View<scalar_t *, HandleTempMemorySpace> nnz_scalar_view_temp_t;
+  typedef typename Kokkos::View<scalar_t *, HandlePersistentMemorySpace> nnz_scalar_view_t;
+  typedef typename nnz_scalar_view_t::HostMirror host_nnz_scalar_view_t;
+  typedef typename Kokkos::View<const scalar_t *, HandlePersistentMemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess>> nnz_scalar_unmanaged_view_t; // for rank1 subviews
 
+  // entries type (managed memory)
   typedef typename Kokkos::View<nnz_lno_t *, HandleTempMemorySpace> nnz_lno_view_temp_t;
   typedef typename Kokkos::View<nnz_lno_t *, HandlePersistentMemorySpace> nnz_lno_view_t;
+  typedef typename nnz_lno_view_t::HostMirror host_nnz_lno_view_t;
+  typedef typename Kokkos::View<const nnz_lno_t *, HandlePersistentMemorySpace, Kokkos::MemoryTraits<Kokkos::Unmanaged|Kokkos::RandomAccess>> nnz_lno_unmanaged_view_t; // for rank1 subviews
  // typedef typename nnz_lno_persistent_work_view_t::HostMirror nnz_lno_persistent_work_host_view_t; //Host view type
 
 
   typedef typename std::make_signed<typename nnz_row_view_t::non_const_value_type>::type signed_integral_t;
   typedef Kokkos::View< signed_integral_t*, typename nnz_row_view_t::array_layout, typename nnz_row_view_t::device_type, typename nnz_row_view_t::memory_traits > signed_nnz_lno_view_t;
+  typedef typename signed_nnz_lno_view_t::HostMirror host_signed_nnz_lno_view_t;
+
+  typedef typename Kokkos::View<scalar_t **, HandlePersistentMemorySpace> mtx_scalar_view_t;
+
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+  struct cuSparseHandleType {
+    cusparseHandle_t handle;
+    cusparseOperation_t transpose;
+    csrsv2Info_t info {0};
+    cusparseMatDescr_t descr;
+    cusparseSolvePolicy_t policy;
+    void *pBuffer {nullptr};
+
+    cuSparseHandleType(bool transpose_, bool is_lower){
+      cusparseStatus_t status;
+      status= cusparseCreate(&handle);
+      if (status != CUSPARSE_STATUS_SUCCESS) {
+        throw std::runtime_error ("cusparseCreate ERROR\n");
+      }
+      cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_HOST);
+
+      if (transpose_){
+        transpose = CUSPARSE_OPERATION_TRANSPOSE;
+      }
+      else {
+        transpose  = CUSPARSE_OPERATION_NON_TRANSPOSE;
+      }
+
+      status = cusparseCreateMatDescr(&descr);
+      if (status != CUSPARSE_STATUS_SUCCESS) {
+        throw std::runtime_error ("cusparseCreateMatDescr descr ERROR\n");
+      }
+      cusparseSetMatType(descr,CUSPARSE_MATRIX_TYPE_GENERAL);
+      cusparseSetMatIndexBase(descr,CUSPARSE_INDEX_BASE_ZERO);
+
+      if (is_lower)
+        cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_LOWER);
+      else
+        cusparseSetMatFillMode(descr, CUSPARSE_FILL_MODE_UPPER);
+
+      cusparseSetMatDiagType(descr, CUSPARSE_DIAG_TYPE_NON_UNIT);
+
+      policy = CUSPARSE_SOLVE_POLICY_USE_LEVEL;
+    }
+
+    ~cuSparseHandleType() {
+      if (pBuffer != nullptr) {
+        cudaFree(pBuffer);
+        pBuffer = nullptr;
+      }
+      cusparseDestroyMatDescr(descr);
+      cusparseDestroy(handle);
+    }
+  };
 
+  typedef cuSparseHandleType SPTRSVcuSparseHandleType;
+#endif
+
+
+#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+  bool cudagraphCreated; // Move this later
+  struct cudaGraphWrapperType {
+    cudaGraph_t cudagraph;
+    cudaGraphExec_t cudagraphinstance;
+    cudaStream_t stream;
+
+    //cudaGraphWrapperType() { }
+    //~cudaGraphWrapperType() { }
+  };
+
+  typedef cudaGraphWrapperType SPTRSVcudaGraphWrapperType;
+
+  void create_SPTRSVcudaGraphWrapperType() {
+    destroy_SPTRSVcudaGraphWrapperType();
+      sptrsvCudaGraph = new SPTRSVcudaGraphWrapperType;
+    cudaStreamCreate(&sptrsvCudaGraph->stream);
+  }
+
+  void destroy_SPTRSVcudaGraphWrapperType() {
+    if(sptrsvCudaGraph != nullptr) {
+      //cudaGraphExecDestroy(sptrsvCudaGraph->cudagraphinstance);
+      //cudaGraphDestroy(sptrsvCudaGraph->cudagraph);
+      cudaStreamDestroy(sptrsvCudaGraph->stream);
+      delete sptrsvCudaGraph;
+      sptrsvCudaGraph = nullptr;
+    }
+  }
+
+  SPTRSVcudaGraphWrapperType* get_sptrsvCudaGraph() {
+    return sptrsvCudaGraph;
+  }
+#endif
 
 private:
 
-  signed_nnz_lno_view_t level_list;
-  nnz_lno_view_t nodes_per_level;
-  nnz_lno_view_t nodes_grouped_by_level;
+#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+  SPTRSVcudaGraphWrapperType *sptrsvCudaGraph;
+#endif
 
   size_type nrows;
-  size_type nlevel;
 
   bool lower_tri;
 
-  bool symbolic_complete;
-
   SPTRSVAlgorithm algm;
 
+  // Symbolic: Level scheduling data
+  signed_nnz_lno_view_t level_list;
+  nnz_lno_view_t nodes_per_level;
+  host_nnz_lno_view_t hnodes_per_level; // NEW
+  nnz_lno_view_t nodes_grouped_by_level;
+  host_nnz_lno_view_t hnodes_grouped_by_level; // NEW
+  size_type nlevel;
+
   int team_size;
   int vector_size;
 
+  bool stored_diagonal;
+  nnz_lno_view_t diagonal_offsets;
+  nnz_scalar_view_t diagonal_values; // inserted by rowid
+
+  host_nnz_lno_view_t hdiagonal_offsets;
+  host_nnz_scalar_view_t hdiagonal_values; // inserted by rowid
+
+  // Symbolic: Single-block chain data
+  host_signed_nnz_lno_view_t h_chain_ptr;
+  size_type num_chain_entries;
+  signed_integral_t chain_threshold;
+
+  bool symbolic_complete;
+  bool require_symbolic_lvlsched_phase;
+  bool require_symbolic_chain_phase;
+
+  void set_if_algm_require_symb_lvlsched () {
+    if (algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP
+        || algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1
+        /*|| algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2*/
+        || algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN
+       )
+    {
+      require_symbolic_lvlsched_phase = true;
+    }
+    else {
+      require_symbolic_lvlsched_phase = false;
+    }
+  }
+
+  void set_if_algm_require_symb_chain () {
+    if (algm == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN
+       )
+    {
+      require_symbolic_chain_phase = true;
+    }
+    else {
+      require_symbolic_chain_phase = false;
+    }
+  }
+
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+  SPTRSVcuSparseHandleType *cuSPARSEHandle;
+#endif
 public:
 
-  SPTRSVHandle ( SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, bool symbolic_complete_ = false ) :
+  SPTRSVHandle(SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, bool symbolic_complete_ = false) :
+#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+    cudagraphCreated(false),
+    sptrsvCudaGraph(nullptr),
+#endif
+    nrows(nrows_),
+    lower_tri(lower_tri_),
+    algm(choice),
     level_list(),
     nodes_per_level(),
+    hnodes_per_level(),
     nodes_grouped_by_level(),
-    nrows(nrows_),
+    hnodes_grouped_by_level(),
     nlevel(0),
-    lower_tri( lower_tri_ ),
-    symbolic_complete( symbolic_complete_ ),
-    algm(choice),
     team_size(-1),
-    vector_size(-1)
-  {}
-
-#if 0
-  SPTRSVHandle ( SPTRSVAlgorithm choice, const size_type nrows_, bool lower_tri_, bool symbolic_complete_ = false ) :
-    level_list( Kokkos::ViewAllocateWithoutInitializing("level_list"), nrows),
-    nodes_per_level("nodes_per_level", nrows),
-    nodes_grouped_by_level("nodes_grouped_by_level", nrows),
-    nrows(nrows_),
-    nlevel(0),
-    lower_tri( lower_tri_ ),
-    symbolic_complete( symbolic_complete_ ),
-    algm(choice)
+    vector_size(-1),
+    stored_diagonal(false),
+    diagonal_offsets(),
+    diagonal_values(), // inserted by rowid
+    hdiagonal_offsets(),
+    hdiagonal_values(),
+    h_chain_ptr(),
+    num_chain_entries(0),
+    chain_threshold(-1),
+    symbolic_complete(symbolic_complete_),
+    require_symbolic_lvlsched_phase(false),
+    require_symbolic_chain_phase(false)
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    ,cuSPARSEHandle(nullptr)
+#endif
   {
-    // WithoutInitializing
-    Kokkos::deep_copy( level_list, signed_integral_t(-1) );
+    this->set_if_algm_require_symb_lvlsched();
+    this->set_if_algm_require_symb_chain();
   }
 
-/*
-  template <class rhslno_row_view_t_,
-          class rhslno_nnz_view_t_,
-          class rhsscalar_nnz_view_t_,
-          class rhsExecutionSpace,
-          class rhsMemorySpace>
-  SPTRSVHandle ( SPTRSVHandle< rhslno_row_view_t_, rhslno_nnz_view_t_, rhsscalar_nnz_view_t_, rhsExecutionSpace, rhsMemorySpace > & rhs ) {
-
-    this->level_list = rhs.level_list;
-    this->nodes_per_level = rhs.nodes_per_level;
-    this->nodes_grouped_by_level = rhs.nodes_grouped_by_level;
-    this->nrows = rhs.nrows;
-    this->nlevel = rhs.nlevel;
-    this->lower_tri = rhs.lower_tri;
-    this->symbolic_complete = rhs.symbolic_complete;
-    this->algm = rhs.algm;
-  }
+  // Requires nrows_ input
+  // Allocates all views
+  void new_init_handle(const size_type nrows_) {
+    //set_nrows(nrows_);
+    nrows = nrows_;
+    // Assumed that level scheduling occurs during symbolic phase for all algorithms, for now
+
+    // TODO: Set sizes differently/smaller, resize during symbolic to save space
+    if ( this->require_symbolic_lvlsched_phase == true )
+    {
+      set_num_levels(0);
+      level_list = signed_nnz_lno_view_t(Kokkos::ViewAllocateWithoutInitializing("level_list"), nrows_);
+      Kokkos::deep_copy( level_list, signed_integral_t(-1) );
+      nodes_per_level =  nnz_lno_view_t("nodes_per_level", nrows_);
+      hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level);
+      nodes_grouped_by_level = nnz_lno_view_t("nodes_grouped_by_level", nrows_);
+      hnodes_grouped_by_level = Kokkos::create_mirror_view(nodes_grouped_by_level);
+
+#if 0
+      std::cout << "  newinit_handle: level schedule allocs" << std::endl;
+      std::cout << "  ll.extent = " << level_list.extent(0) << std::endl;
+      std::cout << "  npl.extent = " << nodes_per_level.extent(0) << std::endl;
+      std::cout << "  hnpl.extent = " << hnodes_per_level.extent(0) << std::endl;
+      std::cout << "  ngbl.extent = " << nodes_grouped_by_level.extent(0) << std::endl;
+      std::cout << "  hngbl.extent = " << hnodes_grouped_by_level.extent(0) << std::endl;
+#endif
+    }
+
+    if (stored_diagonal) {
+      diagonal_offsets = nnz_lno_view_t(Kokkos::ViewAllocateWithoutInitializing("diagonal_offsets"), nrows_);
+      diagonal_values = nnz_scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("diagonal_values"), nrows_); // inserted by rowid
+      hdiagonal_offsets = Kokkos::create_mirror_view(diagonal_offsets);
+      hdiagonal_values = Kokkos::create_mirror_view(diagonal_values);
+    }
+
+    if (this->require_symbolic_chain_phase == true)
+    {
+      if (this->chain_threshold == -1) {
+        // Need default if chain_threshold not set
+        // 0 means every level, regardless of number of nodes, is launched within a kernel
+        if (team_size == -1) {
+          this->chain_threshold = 0; 
+          h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows);
+        }
+        else {
+          std::cout << "  Warning: chain_threshold was not set - will default to team_size = " << this->team_size << "  chain_threshold = " << this->chain_threshold << std::endl;
+          this->chain_threshold = this->team_size; 
+          h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows);
+        }
+      }
+      else {
+        if (this->team_size >= this->chain_threshold) {
+          h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows);
+        }
+        else if (this->team_size == -1 && chain_threshold > 0) {
+          std::cout << "  Warning: team_size was not set; chain_threshold = " << this->chain_threshold << std::endl;
+          std::cout << "  Automatically setting team_size to chain_threshold - if this exceeds the hardware limitations relaunch with reduced chain_threshold or set a valid team_size" << std::endl;
+          this->team_size = this->chain_threshold;
+          h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows);
+        }
+        else {
+          std::cout << "  EXPERIMENTAL: team_size less than chain size. team_size = " << this->team_size << "  chain_threshold = " << this->chain_threshold << std::endl;
+          h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows);
+        }
+      }
+    }
+    else {
+      h_chain_ptr = host_signed_nnz_lno_view_t();
+      this->chain_threshold = -1;
+    }
 
-  template <class rhslno_row_view_t_,
-          class rhslno_nnz_view_t_,
-          class rhsscalar_nnz_view_t_,
-          class rhsExecutionSpace,
-          class rhsMemorySpace>
-  SPTRSVHandle & operator= ( SPTRSVHandle< rhslno_row_view_t_, rhslno_nnz_view_t_, rhsscalar_nnz_view_t_, rhsExecutionSpace, rhsMemorySpace > & rhs ) {
-
-    this->level_list = rhs.level_list;
-    this->nodes_per_level = rhs.nodes_per_level;
-    this->nodes_grouped_by_level = rhs.nodes_grouped_by_level;
-    this->nrows = rhs.nrows;
-    this->nlevel = rhs.nlevel;
-    this->lower_tri = rhs.lower_tri;
-    this->symbolic_complete = rhs.symbolic_complete;
-    this->algm = rhs.algm;
-    return *this;
+#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+    create_SPTRSVcudaGraphWrapperType();
+#endif
+
+    set_num_chain_entries(0);
+    set_symbolic_incomplete();
   }
-*/
 
+  virtual ~SPTRSVHandle() {
+#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+    destroy_SPTRSVcudaGraphWrapperType();
 #endif
 
-  void reset_handle( const size_type nrows_ ) {
-    set_nrows(nrows_);
-    set_num_levels(0);
-    level_list = signed_nnz_lno_view_t( Kokkos::ViewAllocateWithoutInitializing("level_list"), nrows_),
-    Kokkos::deep_copy( level_list, signed_integral_t(-1) );
-    nodes_per_level =  nnz_lno_view_t("nodes_per_level", nrows_),
-    nodes_grouped_by_level = nnz_lno_view_t("nodes_grouped_by_level", nrows_),
-    reset_symbolic_complete();
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    this->destroy_cuSPARSE_Handle();
+#endif
+  };
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+  void create_cuSPARSE_Handle(bool transpose, bool is_lower){
+    this->destroy_cuSPARSE_Handle();
+    this->cuSPARSEHandle = new cuSparseHandleType(transpose, is_lower);
+  }
+  void destroy_cuSPARSE_Handle(){
+    if (this->cuSPARSEHandle != nullptr){
+      delete this->cuSPARSEHandle;
+      this->cuSPARSEHandle = nullptr;
+    }
   }
 
-  virtual ~SPTRSVHandle() {};
+  SPTRSVcuSparseHandleType *get_cuSparseHandle(){
+    return this->cuSPARSEHandle;
+  }
+#endif
+
+
+  bool algm_requires_symb_lvlsched() const { return require_symbolic_lvlsched_phase; } 
 
+  bool algm_requires_symb_chain() const { return require_symbolic_chain_phase; }
 
-  void set_algorithm(SPTRSVAlgorithm choice) { algm = choice; }
+  // Can change the algorithm to a "Compatible algorithms" - for ease in some testing cases
+  void set_algorithm(SPTRSVAlgorithm choice) { 
+    if (algm != choice) {
+      algm = choice; 
+    }
+  }
 
+  KOKKOS_INLINE_FUNCTION
   SPTRSVAlgorithm get_algorithm() { return algm; }
 
   KOKKOS_INLINE_FUNCTION
   signed_nnz_lno_view_t get_level_list() const { return level_list; }
 
+  inline
+  host_signed_nnz_lno_view_t get_host_level_list() const { 
+    auto hlevel_list = Kokkos::create_mirror_view(this->level_list);
+    Kokkos::deep_copy(hlevel_list, this->level_list);
+    return hlevel_list; 
+  }
+
+  void set_stored_diagonal(const bool stored_diagonal_) {
+    stored_diagonal = stored_diagonal_;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  nnz_lno_view_t get_diagonal_offsets() const { return diagonal_offsets; }
+
+  KOKKOS_INLINE_FUNCTION
+  nnz_scalar_view_t get_diagonal_values() const { return diagonal_values; }
+
+  KOKKOS_INLINE_FUNCTION
+  host_nnz_lno_view_t get_host_diagonal_offsets() const { return hdiagonal_offsets; }
+
+  KOKKOS_INLINE_FUNCTION
+  host_nnz_scalar_view_t get_host_diagonal_values() const { return hdiagonal_values; }
+
+  inline
+  host_signed_nnz_lno_view_t get_host_chain_ptr() const { return h_chain_ptr; }
+
   KOKKOS_INLINE_FUNCTION
   nnz_lno_view_t get_nodes_per_level() const { return nodes_per_level; }
 
+  inline
+  host_nnz_lno_view_t get_host_nodes_per_level() const { 
+    return hnodes_per_level; 
+  }
+
   KOKKOS_INLINE_FUNCTION
   nnz_lno_view_t get_nodes_grouped_by_level() const { return nodes_grouped_by_level; }
 
+  inline
+  host_nnz_lno_view_t get_host_nodes_grouped_by_level() const { return hnodes_grouped_by_level; }
+
   KOKKOS_INLINE_FUNCTION
   size_type get_nrows() const { return nrows; }
+  void set_nrows(const size_type nrows_) { this->nrows = nrows_; }
+
+
+  void reset_chain_threshold(const signed_integral_t threshold) {
+    if (threshold != this->chain_threshold || h_chain_ptr.span() == 0) {
+        this->chain_threshold = threshold;
+        if (this->team_size >= this->chain_threshold) {
+        //  h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows);
+        }
+        else if (this->team_size == -1 && chain_threshold > 0) {
+          //std::cout << "  Warning: team_size was not set  team_size = " << this->team_size << "  chain_threshold = " << this->chain_threshold << std::endl;
+          //std::cout << "  Automatically setting team_size to chain_threshold - if this exceeds the hardware limitation a runtime error will occur during kernel launch - reduce chain_threshold in that case" << std::endl;
+          this->team_size = this->chain_threshold;
+        //  h_chain_ptr = host_signed_nnz_lno_view_t("h_chain_ptr", this->nrows);
+        }
+        else {
+          std::cout << "  EXPERIMENTAL: team_size < chain_size: team_size = " << this->team_size << "  chain_threshold = " << this->chain_threshold << std::endl;
+        }
+    }
+  }
 
   KOKKOS_INLINE_FUNCTION
-  void set_nrows(const size_type nrows_) { this->nrows = nrows_; }
+  signed_integral_t get_chain_threshold () const { return this->chain_threshold; }
+
 
   bool is_lower_tri() const { return lower_tri; }
   bool is_upper_tri() const { return !lower_tri; }
 
   bool is_symbolic_complete() const { return symbolic_complete; }
 
+  bool is_stored_diagonal() const { return stored_diagonal; }
+
+  KOKKOS_INLINE_FUNCTION
   size_type get_num_levels() const { return nlevel; }
+
   void set_num_levels(size_type nlevels_) { this->nlevel = nlevels_; }
 
   void set_symbolic_complete() { this->symbolic_complete = true; }
-  void reset_symbolic_complete() { this->symbolic_complete = false; }
+  void set_symbolic_incomplete() { this->symbolic_complete = false; }
 
-  void set_team_size(const int ts) {this->team_size = ts;}
+  KOKKOS_INLINE_FUNCTION
   int get_team_size() const {return this->team_size;}
+  // Called by user at setup - should only set a value, no alloc
+  void set_team_size(const int ts) {this->team_size = ts;}
 
-  void set_vector_size(const int vs) {this->vector_size = vs;}
+  KOKKOS_INLINE_FUNCTION
   int get_vector_size() const {return this->vector_size;}
+  // Called by user at setup - should only set a value, no alloc
+  void set_vector_size(const int vs) {this->vector_size = vs;}
+
+  KOKKOS_INLINE_FUNCTION
+  int get_num_chain_entries() const {return this->num_chain_entries;}
+  void set_num_chain_entries(const int nce) {this->num_chain_entries = nce;}
 
   void print_algorithm() { 
     if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_RP )
@@ -239,20 +536,49 @@ class SPTRSVHandle {
 
     if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1 )
       std::cout << "SEQLVLSCHD_TP1" << std::endl;;
-
-    /*
+/*
     if ( algm == SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) {
       std::cout << "SEQLVLSCHED_TP2" << std::endl;;
       std::cout << "WARNING: With CUDA this is currently only reliable with int-int ordinal-offset pair" << std::endl;
     }
-    */
+*/
+    if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN )
+      std::cout << "SEQLVLSCHD_TP1CHAIN" << std::endl;;
+
+    if ( algm == SPTRSVAlgorithm::SPTRSV_CUSPARSE )
+      std::cout << "SPTRSV_CUSPARSE" << std::endl;;
+  }
+
+
+  std::string return_algorithm_string() { 
+    std::string ret_string;
+
+    if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_RP )
+      ret_string = "SEQLVLSCHD_RP";
+
+    if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1 )
+      ret_string = "SEQLVLSCHD_TP1";
+/*
+    if ( algm == SPTRSVAlgorithm::SEQLVLSCHED_TP2 )
+      ret_string = "SEQLVLSCHED_TP2";
+*/
+    if ( algm == SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN )
+      ret_string = "SEQLVLSCHD_TP1CHAIN";
+
+    if ( algm == SPTRSVAlgorithm::SPTRSV_CUSPARSE )
+      ret_string = "SPTRSV_CUSPARSE";
+
+    return ret_string;
   }
 
+
   inline SPTRSVAlgorithm StringToSPTRSVAlgorithm(std::string & name) {
-    if(name=="SPTRSV_DEFAULT")             return SPTRSVAlgorithm::SEQLVLSCHD_RP;
-    else if(name=="SPTRSV_RANGEPOLICY")    return SPTRSVAlgorithm::SEQLVLSCHD_RP;
-    else if(name=="SPTRSV_TEAMPOLICY1")    return SPTRSVAlgorithm::SEQLVLSCHD_TP1;
-    /*else if(name=="SPTRSV_TEAMPOLICY2")    return SPTRSVAlgorithm::SEQLVLSCHED_TP2;*/
+    if(name=="SPTRSV_DEFAULT")                return SPTRSVAlgorithm::SEQLVLSCHD_RP;
+    else if(name=="SPTRSV_RANGEPOLICY")       return SPTRSVAlgorithm::SEQLVLSCHD_RP;
+    else if(name=="SPTRSV_TEAMPOLICY1")       return SPTRSVAlgorithm::SEQLVLSCHD_TP1;
+    /*else if(name=="SPTRSV_TEAMPOLICY2")       return SPTRSVAlgorithm::SEQLVLSCHED_TP2;*/
+    else if(name=="SPTRSV_TEAMPOLICY1CHAIN")  return SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN;
+    else if(name=="SPTRSV_CUSPARSE")          return SPTRSVAlgorithm::SPTRSV_CUSPARSE;
     else
       throw std::runtime_error("Invalid SPTRSVAlgorithm name");
   }
diff --git a/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp
new file mode 100644
index 0000000000..ef92b493b5
--- /dev/null
+++ b/src/sparse/impl/KokkosSparse_sptrsv_cuSPARSE_impl.hpp
@@ -0,0 +1,296 @@
+/*
+//@HEADER
+// ************************************************************************
+//
+//               KokkosKernels 0.9: Linear Algebra and Graph Kernels
+//                 Copyright 2017 Sandia Corporation
+//
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact Siva Rajamanickam (srajama@sandia.gov)
+//
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef _KOKKOSSPTRSVCUSPARSE_HPP
+#define _KOKKOSSPTRSVCUSPARSE_HPP
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+#include "cusparse.h"
+#endif
+namespace KokkosSparse{
+namespace Impl{
+
+  template <typename KernelHandle,
+  typename ain_row_index_view_type,
+  typename ain_nonzero_index_view_type,
+  typename ain_values_scalar_view_type>
+  void sptrsvcuSPARSE_symbolic(
+      KernelHandle *sptrsv_handle,
+      typename KernelHandle::nnz_lno_t nrows,
+      ain_row_index_view_type row_map,
+      ain_nonzero_index_view_type entries,
+      ain_values_scalar_view_type values,
+      bool trans
+      )
+  {
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+  typedef typename KernelHandle::nnz_lno_t idx_type;
+  typedef typename KernelHandle::size_type  size_type;
+  typedef typename KernelHandle::scalar_t  scalar_type;
+
+  if (std::is_same<idx_type, int>::value) {
+    bool is_lower = sptrsv_handle->is_lower_tri();
+    sptrsv_handle->create_cuSPARSE_Handle(trans, is_lower);
+
+    typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle();
+
+    cusparseStatus_t status;
+    status = cusparseCreateCsrsv2Info(&(h->info));
+    if (CUSPARSE_STATUS_SUCCESS != status)
+      std::cout << "csrsv2info create status error name " << (status) << std::endl;
+  
+    // query how much memory used in csrsv2, and allocate the buffer
+    int nnz = entries.extent_int(0);
+    int pBufferSize;
+
+    const scalar_type* vals = values.data();
+    const size_type* rm  =  row_map.data();
+    const idx_type* ent =  entries.data(); 
+
+    if (std::is_same<scalar_type,double>::value) {
+    cusparseDcsrsv2_bufferSize(
+      h->handle, 
+      h->transpose, 
+      nrows, 
+      nnz, 
+      h->descr,
+      (double*)vals,
+      (int*)rm,
+      (int*)ent,
+      h->info, 
+      &pBufferSize);
+
+
+      // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes.
+      cudaError_t error_t;
+      error_t = cudaMalloc((void**)&(h->pBuffer), pBufferSize);
+
+      if (cudaSuccess != error_t)
+        std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(error_t) << std::endl;
+
+      status = cusparseDcsrsv2_analysis(
+        h->handle, 
+        h->transpose, 
+        nrows, 
+        nnz, 
+        h->descr, 
+        (double*)vals,
+        (int*)rm,
+        (int*)ent,
+        h->info, 
+        h->policy, 
+        h->pBuffer);
+
+      if (CUSPARSE_STATUS_SUCCESS != status)
+        std::cout << "analysis status error name " << (status) << std::endl;
+    }
+    else if (std::is_same<scalar_type,Kokkos::complex<double>>::value) {
+    cusparseZcsrsv2_bufferSize(
+      h->handle, 
+      h->transpose, 
+      nrows, 
+      nnz, 
+      h->descr,
+      (cuDoubleComplex*)vals,
+      (int*)rm,
+      (int*)ent,
+      h->info, 
+      &pBufferSize);
+
+      // pBuffer returned by cudaMalloc is automatically aligned to 128 bytes.
+      cudaError_t error_t;
+      error_t = cudaMalloc((void**)&(h->pBuffer), pBufferSize);
+
+      if (cudaSuccess != error_t)
+        std::cout << "cudmalloc pBuffer error_t error name " << cudaGetErrorString(error_t) << std::endl;
+
+      status = cusparseZcsrsv2_analysis(
+        h->handle, 
+        h->transpose, 
+        nrows, 
+        nnz, 
+        h->descr, 
+        (cuDoubleComplex*)vals,
+        (int*)rm,
+        (int*)ent,
+        h->info, 
+        h->policy, 
+        h->pBuffer);
+
+      if (CUSPARSE_STATUS_SUCCESS != status)
+        std::cout << "analysis status error name " << (status) << std::endl;
+    }
+    else {
+      throw std::runtime_error ("This CUSPARSE wrapper currently only supports double and complex<double>.\n");
+    }
+  }
+  else {
+    throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n");
+  }
+#else
+    (void)sptrsv_handle;
+    (void)nrows;
+    (void)row_map;
+    (void)entries;
+    (void)values;
+    (void)trans;
+    throw std::runtime_error ("CUSPARSE IS NOT DEFINED\n");
+    //return;
+#endif
+
+  }
+
+
+  template <typename KernelHandle,
+  typename ain_row_index_view_type,
+  typename ain_nonzero_index_view_type,
+  typename ain_values_scalar_view_type,
+  typename b_values_scalar_view_type,
+  typename x_values_scalar_view_type>
+  void sptrsvcuSPARSE_solve(
+      KernelHandle *sptrsv_handle,
+      typename KernelHandle::nnz_lno_t nrows,
+      ain_row_index_view_type row_map,
+      ain_nonzero_index_view_type entries,
+      ain_values_scalar_view_type values,
+      b_values_scalar_view_type rhs,
+      x_values_scalar_view_type lhs,
+      bool trans
+      )
+  {
+
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+  typedef typename KernelHandle::nnz_lno_t idx_type;
+  typedef typename KernelHandle::size_type  size_type;
+  typedef typename KernelHandle::scalar_t  scalar_type;
+
+  if (std::is_same<idx_type, int>::value) {
+    bool is_lower = sptrsv_handle->is_lower_tri();
+
+    cusparseStatus_t status;
+
+    typename KernelHandle::SPTRSVcuSparseHandleType *h = sptrsv_handle->get_cuSparseHandle();
+
+    int nnz = entries.extent_int(0);
+
+    const scalar_type* vals = values.data();
+    const size_type* rm  =  row_map.data();
+    const idx_type* ent =  entries.data(); 
+    const scalar_type* bv = rhs.data();
+    scalar_type* xv = lhs.data();
+
+    const scalar_type alpha = scalar_type(1);
+
+    if (std::is_same<scalar_type,double>::value) {
+
+      if (h->pBuffer == nullptr) { std::cout << "  pBuffer invalid" << std::endl; }
+
+      status = cusparseDcsrsv2_solve(
+        h->handle, 
+        h->transpose, 
+        nrows, 
+        nnz, 
+        &alpha, 
+        h->descr, 
+        (double*)vals,
+        (int*)rm,
+        (int*)ent,
+        h->info, 
+        (double*)bv,
+        (double*)xv,
+        h->policy, 
+        h->pBuffer);
+
+      if (CUSPARSE_STATUS_SUCCESS != status)
+        std::cout << "solve status error name " << (status) << std::endl;
+    }
+    else if (std::is_same<scalar_type,Kokkos::complex<double>>::value) {
+      cuDoubleComplex cualpha;
+      cualpha.x = 1.0;
+      cualpha.y = 0.0;
+      status = cusparseZcsrsv2_solve(
+        h->handle, 
+        h->transpose, 
+        nrows, 
+        nnz, 
+        &cualpha, 
+        h->descr, 
+        (cuDoubleComplex*)vals,
+        (int*)rm,
+        (int*)ent,
+        h->info, 
+        (cuDoubleComplex*)bv,
+        (cuDoubleComplex*)xv,
+        h->policy, 
+        h->pBuffer);
+
+      if (CUSPARSE_STATUS_SUCCESS != status)
+        std::cout << "solve status error name " << (status) << std::endl;
+    }
+    else {
+      throw std::runtime_error ("This CUSPARSE wrapper currently only supports double and complex<double>.\n");
+    }
+
+  }
+  else {
+    throw std::runtime_error ("CUSPARSE requires local ordinals to be integer.\n");
+  }
+#else
+    (void)sptrsv_handle;
+    (void)nrows;
+    (void)row_map;
+    (void)entries;
+    (void)values;
+    (void)rhs;
+    (void)lhs;
+    (void)trans;
+    throw std::runtime_error ("CUSPARSE IS NOT DEFINED\n");
+#endif
+
+  }
+
+}
+}
+
+#endif
diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
index 417acc223a..293a8d33ec 100644
--- a/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_impl.hpp
@@ -50,16 +50,301 @@
 #include <KokkosKernels_config.h>
 #include <Kokkos_ArithTraits.hpp>
 #include <KokkosSparse_sptrsv_handle.hpp>
+#include <KokkosSparse_spmv.hpp>
+#include <KokkosSparse_CrsMatrix.hpp>
 
-//#define LVL_OUTPUT_INFO
+#include <KokkosBatched_Trsv_Decl.hpp>
+#include <KokkosBatched_Trsv_Serial_Impl.hpp>
+
+//#define TRISOLVE_TIMERS
+//#define SERIAL_FOR_LOOP
+
+#define KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+
+//#define KOKKOSPSTRSV_SOLVE_IMPL_PROFILE 1
+#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE)
+#include "cuda_profiler_api.h"
+#endif
 
 namespace KokkosSparse {
 namespace Impl {
 namespace Experimental {
 
+#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH)
+  #define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+#endif
 
 struct UnsortedTag {};
 
+struct LargerCutoffTag {};
+
+struct UnsortedLargerCutoffTag {};
+
+template <class ViewType>
+void print_view1d_solve(const ViewType dv, size_t range = 0) {
+  auto v = Kokkos::create_mirror_view(dv);
+  Kokkos::deep_copy(v, dv);
+  std::cout << "Output for view " << v.label() << std::endl;
+  range = range == 0 ? dv.extent(0) : range;
+  for (size_t i = 0; i < range; ++i) {
+    std::cout << "v(" << i << ") = " << v(i) << " , ";
+  }
+  std::cout << std::endl;
+}
+
+// Needed for cudagraphs
+struct EmptyFunctor {
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const int) const {}
+};
+
+
+// This functor unifies the lower and upper implementations, the hope is the "is_lowertri" check does not add noticable time on larger problems
+template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
+struct TriLvlSchedTP1SolverFunctor
+{
+  typedef typename RowMapType::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type member_type;
+  typedef typename EntriesType::non_const_value_type lno_t;
+  typedef typename ValuesType::non_const_value_type scalar_t;
+
+  RowMapType row_map;
+  EntriesType entries;
+  ValuesType values;
+  LHSType lhs;
+  RHSType rhs;
+  NGBLType nodes_grouped_by_level;
+
+  const bool is_lowertri;
+
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
+  long dense_nrows;
+
+
+  TriLvlSchedTP1SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, const bool is_lowertri_, long node_count_, long dense_nrows_ = 0) :
+    row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), is_lowertri(is_lowertri_), node_count(node_count_), dense_nrows(dense_nrows_) {}
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const member_type & team ) const {
+        auto my_league = team.league_rank(); // map to rowid
+        auto rowid = nodes_grouped_by_level(my_league + node_count);
+        auto my_rank = team.team_rank();
+
+        auto soffset = row_map(rowid);
+        auto eoffset = row_map(rowid+1);
+        auto rhs_rowid = rhs(rowid);
+        scalar_t diff = scalar_t(0.0);
+
+      Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) {
+          auto colid = entries(ptr);
+
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff = tdiff - val*lhs(colid);
+          }
+      }, diff );
+
+        team.team_barrier();
+
+        // At end, finalize rowid == colid
+        // only one thread should do this; can also use Kokkos::single
+        if ( my_rank == 0 )
+        {
+        // ASSUMPTION: sorted diagonal value located at eoffset - 1
+          lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : (rhs_rowid+diff)/values(soffset);
+        }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const UnsortedTag&, const member_type & team) const {
+        auto my_league = team.league_rank(); // map to rowid
+        auto rowid = nodes_grouped_by_level(my_league + node_count);
+        auto my_rank = team.team_rank();
+
+        auto soffset = row_map(rowid);
+        auto eoffset = row_map(rowid+1);
+        auto rhs_rowid = rhs(rowid);
+        scalar_t diff = scalar_t(0.0);
+
+        auto diag = -1;
+
+        Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff = tdiff - val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }, diff );
+        team.team_barrier();
+
+        // At end, finalize rowid == colid
+        // only one thread should do this; can also use Kokkos::single
+        if ( my_rank == 0 )
+        {
+          lhs(rowid) = (rhs_rowid+diff)/values(diag);
+        }
+  }
+};
+
+
+template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
+struct TriLvlSchedTP1SolverFunctorDiagValues
+{
+  typedef typename RowMapType::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type member_type;
+  typedef typename EntriesType::non_const_value_type lno_t;
+  typedef typename ValuesType::non_const_value_type scalar_t;
+
+  RowMapType row_map;
+  EntriesType entries;
+  ValuesType values;
+  LHSType lhs;
+  RHSType rhs;
+  NGBLType nodes_grouped_by_level;
+  ValuesType diagonal_values; // inserted according to rowid
+
+  const bool is_lowertri;
+
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
+  long dense_nrows;
+
+
+  TriLvlSchedTP1SolverFunctorDiagValues( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, const ValuesType &diagonal_values_, const bool is_lowertri_, long node_count_, long dense_nrows_ = 0) :
+    row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), diagonal_values(diagonal_values_), is_lowertri(is_lowertri_), node_count(node_count_), dense_nrows(dense_nrows_) {}
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const member_type & team ) const {
+        auto my_league = team.league_rank(); // map to rowid
+        auto rowid = nodes_grouped_by_level(my_league + node_count);
+        auto my_rank = team.team_rank();
+
+        auto soffset = row_map(rowid);
+        auto eoffset = row_map(rowid+1);
+        auto rhs_rowid = rhs(rowid);
+        scalar_t diff = scalar_t(0.0);
+
+      Kokkos::parallel_reduce( Kokkos::TeamThreadRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff = tdiff - val*lhs(colid);
+          }
+      }, diff );
+
+        team.team_barrier();
+
+        // At end, finalize rowid == colid
+        // only one thread should do this; can also use Kokkos::single
+        if ( my_rank == 0 )
+        {
+          //lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : (rhs_rowid+diff)/values(soffset);
+          lhs(rowid) = (rhs_rowid+diff)/diagonal_values(rowid);
+        }
+  }
+
+};
+
+
+template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
+struct TriLvlSchedTP2SolverFunctor
+{
+  typedef typename RowMapType::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type member_type;
+  typedef typename EntriesType::non_const_value_type lno_t;
+  typedef typename ValuesType::non_const_value_type scalar_t;
+
+  RowMapType row_map;
+  EntriesType entries;
+  ValuesType values;
+  LHSType lhs;
+  RHSType rhs;
+  NGBLType nodes_grouped_by_level;
+
+  const bool is_lowertri;
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
+  long node_groups;
+  long dense_nrows;
+
+
+  TriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, const bool is_lowertri_, long node_count_, long node_groups_ = 0, long dense_nrows_ = 0) :
+    row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), is_lowertri(is_lowertri_), node_count(node_count_), node_groups(node_groups_), dense_nrows(dense_nrows_) {}
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const member_type & team) const {
+        auto my_league = team.league_rank(); // map to rowid
+
+        size_t nrows = row_map.extent(0) - 1;
+
+        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, node_groups ), [&] ( const long ng ) {
+          auto rowid = nodes_grouped_by_level(node_count + my_league*node_groups + ng);
+          if ( size_t(rowid) < nrows ) {
+
+            auto soffset = row_map(rowid);
+            auto eoffset = row_map(rowid+1);
+            auto rhs_rowid = rhs(rowid);
+            scalar_t diff = scalar_t(0.0);
+
+            Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) {
+              auto colid = entries(ptr);
+              auto val   = values(ptr);
+              if ( colid != rowid ) {
+                tdiff = tdiff - val*lhs(colid);
+              }
+            }, diff );
+
+            // ASSUMPTION: sorted diagonal value located at eoffset - 1
+            lhs(rowid) = is_lowertri ? (rhs_rowid+diff)/values(eoffset-1) : (rhs_rowid+diff)/values(soffset);
+          } // end if
+        }); // end TeamThreadRange
+
+        team.team_barrier();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const UnsortedTag&, const member_type & team) const {
+        auto my_league = team.league_rank(); // map to rowid
+
+        size_t nrows = row_map.extent(0) - 1;
+
+        Kokkos::parallel_for( Kokkos::TeamThreadRange( team, 0, node_groups ), [&] ( const long ng ) {
+          auto rowid = nodes_grouped_by_level(node_count + my_league*node_groups + ng);
+          if ( size_t(rowid) < nrows ) {
+            auto soffset = row_map(rowid);
+            auto eoffset = row_map(rowid+1);
+            auto rhs_rowid = rhs(rowid);
+            scalar_t diff = scalar_t(0.0);
+
+            auto diag = -1;
+            Kokkos::parallel_reduce( Kokkos::ThreadVectorRange( team, soffset, eoffset ), [&] ( const long ptr, scalar_t &tdiff ) {
+              auto colid = entries(ptr);
+              auto val   = values(ptr);
+              if ( colid != rowid ) {
+                tdiff = tdiff - val*lhs(colid);
+              }
+              else {
+                diag = ptr;
+              }
+            }, diff );
+
+            lhs(rowid) = (rhs_rowid+diff)/values(diag);
+          } // end if
+        }); // end TeamThreadRange
+
+        team.team_barrier();
+  }
+};
+
+
+// Lower vs Upper Multi-block Functors
 
 template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
 struct LowerTriLvlSchedRPSolverFunctor
@@ -120,6 +405,7 @@ struct LowerTriLvlSchedRPSolverFunctor
 };
 
 
+
 template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
 struct LowerTriLvlSchedTP1SolverFunctor
 {
@@ -136,7 +422,7 @@ struct LowerTriLvlSchedTP1SolverFunctor
   RHSType rhs;
   NGBLType nodes_grouped_by_level;
 
-  long node_count;
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
   long node_groups;
 
 
@@ -148,7 +434,7 @@ struct LowerTriLvlSchedTP1SolverFunctor
   void operator()( const member_type & team ) const {
         auto my_league = team.league_rank(); // map to rowid
         auto rowid = nodes_grouped_by_level(my_league + node_count);
-        auto my_team = team.team_rank();
+        auto my_rank = team.team_rank();
 
         auto soffset = row_map(rowid);
         auto eoffset = row_map(rowid+1);
@@ -167,7 +453,7 @@ struct LowerTriLvlSchedTP1SolverFunctor
 
         // At end, finalize rowid == colid
         // only one thread should do this; can also use Kokkos::single
-        if ( my_team == 0 )
+        if ( my_rank == 0 )
         {
         // ASSUMPTION: sorted diagonal value located at eoffset - 1
           lhs(rowid) = (rhs_rowid+diff)/values(eoffset-1);
@@ -178,7 +464,7 @@ struct LowerTriLvlSchedTP1SolverFunctor
   void operator()(const UnsortedTag&, const member_type & team) const {
         auto my_league = team.league_rank(); // map to rowid
         auto rowid = nodes_grouped_by_level(my_league + node_count);
-        auto my_team = team.team_rank();
+        auto my_rank = team.team_rank();
 
         auto soffset = row_map(rowid);
         auto eoffset = row_map(rowid+1);
@@ -197,19 +483,18 @@ struct LowerTriLvlSchedTP1SolverFunctor
             diag = ptr;
           }
         }, diff );
-
         team.team_barrier();
 
         // At end, finalize rowid == colid
         // only one thread should do this; can also use Kokkos::single
-        if ( my_team == 0 )
+        if ( my_rank == 0 )
         {
-        // ASSUMPTION: sorted diagonal value located at eoffset - 1
           lhs(rowid) = (rhs_rowid+diff)/values(diag);
         }
   }
 };
 
+
 // FIXME CUDA: This algorithm not working with all integral type combos
 // In any case, this serves as a skeleton for 3-level hierarchical parallelism for alg dev
 template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
@@ -228,11 +513,11 @@ struct LowerTriLvlSchedTP2SolverFunctor
   RHSType rhs;
   NGBLType nodes_grouped_by_level;
 
-  long node_count;
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
   long node_groups;
 
 
-  LowerTriLvlSchedTP2SolverFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) :
+  LowerTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) :
     row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), node_count(node_count_), node_groups(node_groups_) {}
 
 
@@ -359,6 +644,7 @@ struct UpperTriLvlSchedRPSolverFunctor
 
 };
 
+
 template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
 struct UpperTriLvlSchedTP1SolverFunctor
 {
@@ -375,7 +661,7 @@ struct UpperTriLvlSchedTP1SolverFunctor
   RHSType rhs;
   NGBLType nodes_grouped_by_level;
 
-  long node_count;
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
   long node_groups;
 
 
@@ -387,7 +673,7 @@ struct UpperTriLvlSchedTP1SolverFunctor
   void operator()(const member_type & team) const {
         auto my_league = team.league_rank(); // map to rowid
         auto rowid = nodes_grouped_by_level(my_league + node_count);
-        auto my_team = team.team_rank();
+        auto my_rank = team.team_rank();
 
         auto soffset = row_map(rowid);
         auto eoffset = row_map(rowid+1);
@@ -406,7 +692,7 @@ struct UpperTriLvlSchedTP1SolverFunctor
 
         // At end, finalize rowid == colid
         // only one thread should do this, also can use Kokkos::single
-        if ( my_team == 0 )
+        if ( my_rank == 0 )
         {
         // ASSUMPTION: sorted diagonal value located at start offset
           lhs(rowid) = (rhs_rowid+diff)/values(soffset);
@@ -417,7 +703,7 @@ struct UpperTriLvlSchedTP1SolverFunctor
   void operator()(const UnsortedTag&, const member_type & team) const {
         auto my_league = team.league_rank(); // map to rowid
         auto rowid = nodes_grouped_by_level(my_league + node_count);
-        auto my_team = team.team_rank();
+        auto my_rank = team.team_rank();
 
         auto soffset = row_map(rowid);
         auto eoffset = row_map(rowid+1);
@@ -440,13 +726,11 @@ struct UpperTriLvlSchedTP1SolverFunctor
 
         // At end, finalize rowid == colid
         // only one thread should do this, also can use Kokkos::single
-        if ( my_team == 0 )
+        if ( my_rank == 0 )
         {
-        // ASSUMPTION: sorted diagonal value located at start offset
           lhs(rowid) = (rhs_rowid+diff)/values(diag);
         }
   }
-
 };
 
 
@@ -468,16 +752,16 @@ struct UpperTriLvlSchedTP2SolverFunctor
   RHSType rhs;
   NGBLType nodes_grouped_by_level;
 
-  long node_count;
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
   long node_groups;
 
 
-  UpperTriLvlSchedTP2SolverFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0 ) :
+  UpperTriLvlSchedTP2SolverFunctor(const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, long node_count_, long node_groups_ = 0) :
     row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), node_count(node_count_), node_groups(node_groups_) {}
 
 
   KOKKOS_INLINE_FUNCTION
-  void operator()( const member_type & team ) const {
+  void operator()(const member_type & team) const {
         auto my_league = team.league_rank(); // map to rowid
 
         size_t nrows = row_map.extent(0) - 1;
@@ -533,7 +817,6 @@ struct UpperTriLvlSchedTP2SolverFunctor
               }
             }, diff );
 
-            // ASSUMPTION: sorted diagonal value located at start offset
             lhs(rowid) = (rhs_rowid+diff)/values(diag);
           } // end if
         }); // end TeamThreadRange
@@ -544,29 +827,1191 @@ struct UpperTriLvlSchedTP2SolverFunctor
 };
 
 
-template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType >
-void lower_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) {
-
-  typedef typename TriSolveHandle::execution_space execution_space;
-  typedef typename TriSolveHandle::size_type size_type;
-  typedef typename TriSolveHandle::nnz_lno_view_t NGBLType;
-
-  auto nlevels = thandle.get_num_levels();
-  // Keep this a host View, create device version and copy to back to host during scheduling
-  auto nodes_per_level = thandle.get_nodes_per_level();
-  auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level);
-  Kokkos::deep_copy(hnodes_per_level, nodes_per_level);  
+// --------------------------------
+// Single-block functors
+// --------------------------------
 
-  auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level();
+template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
+struct LowerTriLvlSchedTP1SingleBlockFunctor
+{
+  typedef typename RowMapType::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type member_type;
+  typedef typename EntriesType::non_const_value_type lno_t;
+  typedef typename ValuesType::non_const_value_type scalar_t;
 
-  size_type node_count = 0;
+  RowMapType row_map;
+  EntriesType entries;
+  ValuesType values;
+  LHSType lhs;
+  RHSType rhs;
+  NGBLType nodes_grouped_by_level;
+  NGBLType nodes_per_level;
+
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
+  long lvl_start;
+  long lvl_end;
+  long cutoff;
+  // team_size: each team can be assigned a row, if there are enough rows...
+
+
+  LowerTriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0 ) :
+    row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), nodes_per_level(nodes_per_level_), node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), cutoff(cutoff_) {}
+
+  // SingleBlock: Only one block (or league) executing; team_rank used to map thread to row
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_rank = team.team_rank();
+      diff = scalar_t(0.0);
+
+      if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+        }, diff);
+#endif
+        // ASSUMPTION: sorted diagonal value located at eoffset - 1
+        lhs(rowid) = (rhs_val+diff)/values(eoffset-1);
+      } // end if team.team_rank() < nodes_this_lvl
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end operator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const UnsortedTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_rank = team.team_rank();
+      diff = scalar_t(0.0);
+
+      if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        auto diag = -1;
+
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }, diff);
+#endif
+        lhs(rowid) = (rhs_val+diff)/values(diag);
+      } // end if team.team_rank() < nodes_this_lvl
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end operator
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const LargerCutoffTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_team_rank = team.team_rank();
+      // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc.
+      for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) {
+      diff = scalar_t(0.0);
+      if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+        },diff);
+#endif
+         // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri
+         lhs(rowid) = (rhs_val+diff)/values(eoffset-1);
+       } // end if team.team_rank() < nodes_this_lvl
+      } // end for my_rank loop
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end tagged operator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const UnsortedLargerCutoffTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_team_rank = team.team_rank();
+      // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc.
+      for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) {
+      diff = scalar_t(0.0);
+       if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        auto diag = -1;
+
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        },diff);
+#endif
+        lhs(rowid) = (rhs_val+diff)/values(diag);
+       } // end if team.team_rank() < nodes_this_lvl
+      } // end for my_rank loop
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end tagged operator
+
+};
+
+
+template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
+struct UpperTriLvlSchedTP1SingleBlockFunctor
+{
+  typedef typename RowMapType::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type member_type;
+  typedef typename EntriesType::non_const_value_type lno_t;
+  typedef typename ValuesType::non_const_value_type scalar_t;
+
+  RowMapType row_map;
+  EntriesType entries;
+  ValuesType values;
+  LHSType lhs;
+  RHSType rhs;
+  NGBLType nodes_grouped_by_level;
+  NGBLType nodes_per_level;
+
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
+  long lvl_start;
+  long lvl_end;
+  long cutoff;
+  // team_size: each team can be assigned a row, if there are enough rows...
+
+
+  UpperTriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, long cutoff_ = 0 ) :
+    row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), nodes_per_level(nodes_per_level_), node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), cutoff(cutoff_) {}
+
+  // SingleBlock: Only one block (or league) executing; team_rank used to map thread to row
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_rank = team.team_rank();
+      diff = scalar_t(0.0);
+
+      if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+        }, diff);
+#endif
+        // ASSUMPTION: sorted diagonal value located at soffset
+        lhs(rowid) = (rhs_val+diff)/values(soffset);
+      } // end if
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl each thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end operator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const UnsortedTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_rank = team.team_rank();
+      diff = scalar_t(0.0);
+
+      if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        auto diag = -1;
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        auto diag = -1;
+
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }, diff);
+#endif
+        lhs(rowid) = (rhs_val+diff)/values(diag);
+      } // end if
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl each thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end operator
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const LargerCutoffTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_team_rank = team.team_rank();
+      // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc.
+      for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) {
+      diff = scalar_t(0.0);
+       if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+        }, diff);
+#endif
+        // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri
+          lhs(rowid) = (rhs_val+diff)/values(soffset);
+       } // end if team.team_rank() < nodes_this_lvl
+      } // end for my_rank loop
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end tagged operator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const UnsortedLargerCutoffTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_team_rank = team.team_rank();
+      // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc.
+      for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) {
+       diff = scalar_t(0.0);
+       if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        auto diag = -1;
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        auto diag = -1;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }, diff);
+#endif
+        lhs(rowid) = (rhs_val+diff)/values(diag);
+       } // end if team.team_rank() < nodes_this_lvl
+      } // end for my_rank loop
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end tagged operator
+};
+
+
+template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
+struct TriLvlSchedTP1SingleBlockFunctor
+{
+  typedef typename RowMapType::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type member_type;
+  typedef typename EntriesType::non_const_value_type lno_t;
+  typedef typename ValuesType::non_const_value_type scalar_t;
+
+  RowMapType row_map;
+  EntriesType entries;
+  ValuesType values;
+  LHSType lhs;
+  RHSType rhs;
+  NGBLType nodes_grouped_by_level;
+  NGBLType nodes_per_level;
+
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
+  long lvl_start;
+  long lvl_end;
+  const bool is_lowertri;
+  const int dense_nrows;
+  const int  cutoff;
+  // team_size: each team can be assigned a row, if there are enough rows...
+
+
+  TriLvlSchedTP1SingleBlockFunctor( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, NGBLType &nodes_per_level_, long node_count_, long lvl_start_, long lvl_end_, const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0 ) :
+    row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), nodes_per_level(nodes_per_level_), node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), is_lowertri(is_lower_), dense_nrows(dense_nrows_), cutoff(cutoff_) {}
+
+  // SingleBlock: Only one block (or league) executing; team_rank used to map thread to row
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_rank = team.team_rank();
+      diff = scalar_t(0.0);
+
+      if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+        }, diff);
+#endif
+
+        // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri
+        if (is_lowertri)
+          lhs(rowid) = (rhs_val+diff)/values(eoffset-1);
+        else
+          lhs(rowid) = (rhs_val+diff)/values(soffset);
+      } // end if team.team_rank() < nodes_this_lvl
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end operator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const UnsortedTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_rank = team.team_rank();
+      diff = scalar_t(0.0);
+
+      if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        auto diag = -1;
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        auto diag = -1;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }, diff);
+#endif
+        lhs(rowid) = (rhs_val+diff)/values(diag);
+      } // end if team.team_rank() < nodes_this_lvl
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end operator
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const LargerCutoffTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_team_rank = team.team_rank();
+      // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc.
+      for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) {
+       diff = scalar_t(0.0);
+       if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+        }, diff);
+#endif
+
+        // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri
+        if (is_lowertri)
+          lhs(rowid) = (rhs_val+diff)/values(eoffset-1);
+        else
+          lhs(rowid) = (rhs_val+diff)/values(soffset);
+       } // end if team.team_rank() < nodes_this_lvl
+      } // end for my_rank loop
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end tagged operator
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const UnsortedLargerCutoffTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_team_rank = team.team_rank();
+      // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc.
+      for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) {
+       diff = scalar_t(0.0);
+       if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        auto diag = -1;
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }
+#else
+        auto trange = eoffset - soffset;
+        auto diag = -1;
+        Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+        {
+          auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+          else {
+            diag = ptr;
+          }
+        }, diff);
+#endif
+          lhs(rowid) = (rhs_val+diff)/values(diag);
+       } // end if team.team_rank() < nodes_this_lvl
+      } // end for my_rank loop
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end tagged operator
+
+};
+
+
+template <class RowMapType, class EntriesType, class ValuesType, class LHSType, class RHSType, class NGBLType>
+struct TriLvlSchedTP1SingleBlockFunctorDiagValues
+{
+  typedef typename RowMapType::execution_space execution_space;
+  typedef Kokkos::TeamPolicy<execution_space> policy_type;
+  typedef typename policy_type::member_type member_type;
+  typedef typename EntriesType::non_const_value_type lno_t;
+  typedef typename ValuesType::non_const_value_type scalar_t;
+
+  RowMapType row_map;
+  EntriesType entries;
+  ValuesType values;
+  LHSType lhs;
+  RHSType rhs;
+  NGBLType nodes_grouped_by_level;
+  NGBLType nodes_per_level;
+  ValuesType diagonal_values;
+
+  long node_count; // like "block" offset into ngbl, my_league is the "local" offset
+  long lvl_start;
+  long lvl_end;
+  const bool is_lowertri;
+  const int dense_nrows;
+  const int  cutoff;
+  // team_size: each team can be assigned a row, if there are enough rows...
+
+
+  TriLvlSchedTP1SingleBlockFunctorDiagValues( const RowMapType &row_map_, const EntriesType &entries_, const ValuesType &values_, LHSType &lhs_, const RHSType &rhs_, const NGBLType &nodes_grouped_by_level_, const NGBLType &nodes_per_level_, const ValuesType &diagonal_values_, long node_count_, const long lvl_start_, const long lvl_end_, const bool is_lower_, const int dense_nrows_ = 0, const int cutoff_ = 0 ) :
+    row_map(row_map_), entries(entries_), values(values_), lhs(lhs_), rhs(rhs_), nodes_grouped_by_level(nodes_grouped_by_level_), nodes_per_level(nodes_per_level_), diagonal_values(diagonal_values_), node_count(node_count_), lvl_start(lvl_start_), lvl_end(lvl_end_), is_lowertri(is_lower_), dense_nrows(dense_nrows_), cutoff(cutoff_) {}
+
+  // SingleBlock: Only one block (or league) executing; team_rank used to map thread to row
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_rank = team.team_rank();
+      diff = scalar_t(0.0);
+
+      if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+      auto trange = eoffset - soffset;
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+      {
+        auto ptr = soffset + loffset;
+        auto colid = entries(ptr);
+        auto val   = values(ptr);
+
+        if ( colid != rowid ) {
+          tdiff -= val*lhs(colid);
+        }
+      }, diff);
+#endif
+        // ASSUMPTION: sorted diagonal value located at eoffset - 1 for lower tri, soffset for upper tri
+        lhs(rowid) = (rhs_val+diff)/diagonal_values(rowid);
+      } // end if team.team_rank() < nodes_this_lvl
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end operator
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const LargerCutoffTag&, const member_type & team ) const {
+    long mut_node_count = node_count;
+    typename NGBLType::non_const_value_type rowid {0};
+    typename RowMapType::non_const_value_type soffset {0};
+    typename RowMapType::non_const_value_type eoffset {0};
+    typename RHSType::non_const_value_type rhs_val {0};
+    scalar_t diff = scalar_t(0.0);
+
+    for ( auto lvl = lvl_start; lvl < lvl_end; ++lvl ) {
+      auto nodes_this_lvl = nodes_per_level(lvl);
+      int my_team_rank = team.team_rank();
+      // If cutoff > team_size, then a thread will be responsible for multiple rows - this may be a helpful scenario depending on occupancy etc.
+      for (int my_rank = my_team_rank; my_rank < cutoff; my_rank+=team.team_size() ) {
+       diff = scalar_t(0.0);
+       if (my_rank < nodes_this_lvl) {
+        // THIS is where the mapping of threadid to rowid happens
+        rowid = nodes_grouped_by_level(my_rank + mut_node_count);
+        soffset = row_map(rowid);
+        eoffset = row_map(rowid+1);
+        rhs_val = rhs(rowid);
+
+#ifdef SERIAL_FOR_LOOP
+        for (auto ptr = soffset; ptr < eoffset; ++ptr) {
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            diff -= val*lhs(colid);
+          }
+        }
+#else
+      auto trange = eoffset - soffset;
+      Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team, trange), [&] (const int loffset, scalar_t& tdiff)
+      {
+        auto ptr = soffset + loffset;
+          auto colid = entries(ptr);
+          auto val   = values(ptr);
+          if ( colid != rowid ) {
+            tdiff -= val*lhs(colid);
+          }
+        }, diff);
+#endif
+        lhs(rowid) = (rhs_val+diff)/diagonal_values(rowid);
+       } // end if team.team_rank() < nodes_this_lvl
+      } // end for my_rank loop
+      {
+        // Update mut_node_count from nodes_per_level(lvl) each iteration of lvl per thread
+        mut_node_count += nodes_this_lvl;
+      }
+      team.team_barrier();
+    } // end for lvl
+  } // end tagged operator
+
+};
+
+
+#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+template <class SpaceType>
+struct ReturnTeamPolicyType;
+
+#ifdef KOKKOS_ENABLE_SERIAL
+template <>
+struct ReturnTeamPolicyType<Kokkos::Serial> {
+  using PolicyType = Kokkos::TeamPolicy<Kokkos::Serial>;
+
+  static inline
+  PolicyType get_policy(int nt, int ts) {
+    return PolicyType(nt,ts);
+  }
+
+  template <class ExecInstanceType>
+  static inline
+  PolicyType get_policy(int nt, int ts, ExecInstanceType ) {
+    return PolicyType(nt,ts);
+    //return PolicyType(ExecInstanceType(),nt,ts);
+  }
+};
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+template <>
+struct ReturnTeamPolicyType<Kokkos::OpenMP> {
+  using PolicyType = Kokkos::TeamPolicy<Kokkos::OpenMP>;
+
+  static inline
+  PolicyType get_policy(int nt, int ts) {
+    return PolicyType(nt,ts);
+  }
+
+  template <class ExecInstanceType>
+  static inline
+  PolicyType get_policy(int nt, int ts, ExecInstanceType ) {
+    return PolicyType(nt,ts);
+    //return PolicyType(ExecInstanceType(),nt,ts);
+  }
+};
+#endif
+#ifdef KOKKOS_ENABLE_CUDA
+template <>
+struct ReturnTeamPolicyType<Kokkos::Cuda> {
+  using PolicyType = Kokkos::TeamPolicy<Kokkos::Cuda>;
+
+  static inline
+  PolicyType get_policy(int nt, int ts) {
+    return PolicyType(nt,ts);
+  }
+
+  template <class ExecInstanceType>
+  static inline
+  PolicyType get_policy(int nt, int ts, ExecInstanceType stream) {
+    return PolicyType(stream,nt,ts);
+  }
+};
+#endif
+
+template <class SpaceType>
+struct ReturnRangePolicyType;
+
+#ifdef KOKKOS_ENABLE_SERIAL
+template <>
+struct ReturnRangePolicyType<Kokkos::Serial> {
+  using PolicyType = Kokkos::RangePolicy<Kokkos::Serial>;
+
+  static inline
+  PolicyType get_policy(int nt, int ts) {
+    return PolicyType(nt,ts);
+  }
+
+  template <class ExecInstanceType>
+  static inline
+  PolicyType get_policy(int nt, int ts, ExecInstanceType ) {
+    return PolicyType(nt,ts);
+    //return PolicyType(ExecInstanceType(),nt,ts);
+  }
+};
+#endif
+#ifdef KOKKOS_ENABLE_OPENMP
+template <>
+struct ReturnRangePolicyType<Kokkos::OpenMP> {
+  using PolicyType = Kokkos::RangePolicy<Kokkos::OpenMP>;
+
+  static inline
+  PolicyType get_policy(int nt, int ts) {
+    return PolicyType(nt,ts);
+  }
+
+  template <class ExecInstanceType>
+  static inline
+  PolicyType get_policy(int nt, int ts, ExecInstanceType ) {
+    return PolicyType(nt,ts);
+    //return PolicyType(ExecInstanceType(),nt,ts);
+  }
+};
+#endif
+#ifdef KOKKOS_ENABLE_CUDA
+template <>
+struct ReturnRangePolicyType<Kokkos::Cuda> {
+  using PolicyType = Kokkos::RangePolicy<Kokkos::Cuda>;
+
+  static inline
+  PolicyType get_policy(int nt, int ts) {
+    return PolicyType(nt,ts);
+  }
+
+  template <class ExecInstanceType>
+  static inline
+  PolicyType get_policy(int nt, int ts, ExecInstanceType stream) {
+    return PolicyType(stream,nt,ts);
+  }
+};
+#endif
+
+template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType >
+void lower_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) {
+
+    typedef typename TriSolveHandle::nnz_lno_view_t NGBLType;
+    typedef typename TriSolveHandle::execution_space execution_space;
+    typedef typename TriSolveHandle::size_type size_type;
+    typename TriSolveHandle::SPTRSVcudaGraphWrapperType* lcl_cudagraph = thandle.get_sptrsvCudaGraph();
+
+    auto nlevels = thandle.get_num_levels();
+
+    auto stream1 = lcl_cudagraph->stream;
+    Kokkos::Cuda cuda1(stream1);
+    auto graph = lcl_cudagraph->cudagraph;
+
+    Kokkos::parallel_for("Init", Kokkos::RangePolicy<execution_space>(0,1), EmptyFunctor());
+    Kokkos::Cuda().fence();
+    cudaStreamSynchronize(stream1);
+    //Kokkos::fence();
+
+    auto hnodes_per_level = thandle.get_host_nodes_per_level();
+    auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level();
+
+    size_type node_count = 0;
+
+    int team_size = thandle.get_team_size();
+    team_size = team_size == -1 ? 64 : team_size;
+
+    // Start capturing stream
+    if(thandle.cudagraphCreated == false) {
+    Kokkos::fence();
+    cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal);
+    {
+      for (int iter = 0; iter < nlevels; ++iter) {
+        size_type lvl_nodes = hnodes_per_level(iter);
+
+        using policy_type = ReturnTeamPolicyType<execution_space>;
+
+        Kokkos::parallel_for("parfor_l_team_cudagraph",  Kokkos::Experimental::require(ReturnTeamPolicyType<execution_space>::get_policy(lvl_nodes,team_size,cuda1), Kokkos::Experimental::WorkItemProperty::HintLightWeight), LowerTriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType>(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count));
+
+        node_count += hnodes_per_level(iter);
+      }
+    }
+    cudaStreamEndCapture(stream1, &graph);
+
+    // Create graphExec
+    cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, 0);
+      thandle.cudagraphCreated = true;
+    }
+    // Run graph
+    Kokkos::fence();
+    cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1);
+
+    cudaStreamSynchronize(stream1);
+    Kokkos::fence();
+} // end lower_tri_solve_cg
+
+
+template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType >
+void upper_tri_solve_cg( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) {
+
+    typedef typename TriSolveHandle::nnz_lno_view_t NGBLType;
+    typedef typename TriSolveHandle::execution_space execution_space;
+    typedef typename TriSolveHandle::size_type size_type;
+    typename TriSolveHandle::SPTRSVcudaGraphWrapperType* lcl_cudagraph = thandle.get_sptrsvCudaGraph();
+
+    auto nlevels = thandle.get_num_levels();
+
+    auto stream1 = lcl_cudagraph->stream;
+    Kokkos::Cuda cuda1(stream1);
+    auto graph = lcl_cudagraph->cudagraph;
+
+    Kokkos::parallel_for("Init", Kokkos::RangePolicy<execution_space>(0,1), EmptyFunctor());
+    Kokkos::Cuda().fence();
+    cudaStreamSynchronize(stream1);
+
+    auto hnodes_per_level = thandle.get_host_nodes_per_level();
+    auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level();
+
+    size_type node_count = 0;
+
+    int team_size = thandle.get_team_size();
+    team_size = team_size == -1 ? 64 : team_size;
+
+    // Start capturing stream
+    if(thandle.cudagraphCreated == false) {
+    Kokkos::fence();
+    cudaStreamBeginCapture(stream1, cudaStreamCaptureModeGlobal);
+    {
+      for (int iter = 0; iter < nlevels; ++iter) {
+        size_type lvl_nodes = hnodes_per_level(iter);
+
+        using policy_type = ReturnTeamPolicyType<execution_space>;
+
+        Kokkos::parallel_for("parfor_u_team_cudagraph",  Kokkos::Experimental::require(ReturnTeamPolicyType<execution_space>::get_policy(lvl_nodes,team_size,cuda1), Kokkos::Experimental::WorkItemProperty::HintLightWeight), UpperTriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType>(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count));
+
+        node_count += hnodes_per_level(iter);
+      }
+    }
+    cudaStreamEndCapture(stream1, &graph);
+
+    // Create graphExec
+    cudaGraphInstantiate(&(lcl_cudagraph->cudagraphinstance), graph, NULL, NULL, 0);
+      thandle.cudagraphCreated = true;
+    }
+    // Run graph
+    Kokkos::fence();
+    cudaGraphLaunch(lcl_cudagraph->cudagraphinstance, stream1);
+
+    cudaStreamSynchronize(stream1);
+    Kokkos::fence();
+} // end upper_tri_solve_cg
+
+#endif
+
+
+template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType >
+void lower_tri_solve(TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) {
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE)
+cudaProfilerStop();
+#endif
+
+  typedef typename TriSolveHandle::execution_space execution_space;
+  typedef typename TriSolveHandle::size_type size_type;
+  typedef typename TriSolveHandle::nnz_lno_view_t NGBLType;
+
+  auto nlevels = thandle.get_num_levels();
+  // Keep this a host View, create device version and copy to back to host during scheduling
+  // This requires making sure the host view in the handle is properly updated after the symbolic phase
+  auto nodes_per_level = thandle.get_nodes_per_level();
+  auto hnodes_per_level = thandle.get_host_nodes_per_level();
+  auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level();
+
+  size_type node_count = 0;
 
-  // This must stay serial; would be nice to try out Cuda's graph stuff to reduce kernel launch overhead
   for ( size_type lvl = 0; lvl < nlevels; ++lvl ) {
+   {
     size_type lvl_nodes = hnodes_per_level(lvl);
 
     if ( lvl_nodes != 0 ) {
 
+#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE)
+cudaProfilerStart();
+#endif
       if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP ) {
         Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::RangePolicy<execution_space>( node_count, node_count+lvl_nodes ), LowerTriLvlSchedRPSolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> (row_map, entries, values, lhs, rhs, nodes_grouped_by_level) );
       }
@@ -574,57 +2019,78 @@ void lower_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const
         typedef Kokkos::TeamPolicy<execution_space> policy_type;
         int team_size = thandle.get_team_size();
 
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+        TriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count);
+#else
         LowerTriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count);
+#endif
         if ( team_size == -1 )
           Kokkos::parallel_for("parfor_l_team", policy_type( lvl_nodes , Kokkos::AUTO ), tstf);
         else
           Kokkos::parallel_for("parfor_l_team", policy_type( lvl_nodes , team_size ), tstf);
       }
-      /*
       // TP2 algorithm has issues with some offset-ordinal combo to be addressed
+      /*
       else if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) {
         typedef Kokkos::TeamPolicy<execution_space> tvt_policy_type;
 
         int team_size = thandle.get_team_size();
         if ( team_size == -1 ) {
-          team_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 128;
+          team_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 64;
         }
         int vector_size = thandle.get_team_size();
         if ( vector_size == -1 ) {
           vector_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 4;
         }
 
-        // This impl: "chunk" lvl_nodes into node_groups; a league_rank is responsible for processing that many nodes
-        //       TeamThreadRange over number of node_groups
-        //       To avoid masking threads, 1 thread (team) per node in node_group
+        // This impl: "chunk" lvl_nodes into node_groups; a league_rank is responsible for processing team_size # nodes
+        //       TeamThreadRange over number nodes of node_groups
+        //       To avoid masking threads, 1 thread (team) per node in node_group (thread has full ownership of a node)
         //       ThreadVectorRange responsible for the actual solve computation
-        const int node_groups = team_size;
+        //const int node_groups = team_size;
+        const int node_groups = vector_size;
 
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+        TriLvlSchedTP2SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count, vector_size, 0);
+#else
         LowerTriLvlSchedTP2SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups);
+#endif
         Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size ), tstf);
       } // end elseif
       */
 
       node_count += lvl_nodes;
 
+#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE)
+cudaProfilerStop();
+#endif
     } // end if
+   } // scope for if-block
+
   } // end for lvl
 
 } // end lower_tri_solve
 
 
+
 template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType >
-void upper_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) {
+void upper_tri_solve(TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs) {
 
+#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE)
+cudaProfilerStop();
+#endif
   typedef typename TriSolveHandle::execution_space execution_space;
   typedef typename TriSolveHandle::size_type size_type;
   typedef typename TriSolveHandle::nnz_lno_view_t NGBLType;
 
+
   auto nlevels = thandle.get_num_levels();
   // Keep this a host View, create device version and copy to back to host during scheduling
+  // This requires making sure the host view in the handle is properly updated after the symbolic phase
   auto nodes_per_level = thandle.get_nodes_per_level();
-  auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level);
-  Kokkos::deep_copy(hnodes_per_level, nodes_per_level);
+  auto hnodes_per_level = thandle.get_host_nodes_per_level();
+  //auto hnodes_per_level = Kokkos::create_mirror_view(nodes_per_level);
+  //Kokkos::deep_copy(hnodes_per_level, nodes_per_level);
 
   auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level();
 
@@ -635,6 +2101,9 @@ void upper_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const
     size_type lvl_nodes = hnodes_per_level(lvl);
 
     if ( lvl_nodes != 0 ) {
+#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE)
+cudaProfilerStart();
+#endif
 
       if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP ) {
         Kokkos::parallel_for( "parfor_fixed_lvl", Kokkos::RangePolicy<execution_space>( node_count, node_count+lvl_nodes ), UpperTriLvlSchedRPSolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> (row_map, entries, values, lhs, rhs, nodes_grouped_by_level) );
@@ -644,20 +2113,24 @@ void upper_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const
 
         int team_size = thandle.get_team_size();
 
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+        TriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, false, node_count);
+#else
         UpperTriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count);
+#endif
         if ( team_size == -1 )
           Kokkos::parallel_for("parfor_u_team", policy_type( lvl_nodes , Kokkos::AUTO ), tstf);
         else
           Kokkos::parallel_for("parfor_u_team", policy_type( lvl_nodes , team_size ), tstf);
       }
-      /*
       // TP2 algorithm has issues with some offset-ordinal combo to be addressed
+      /*
       else if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 ) {
         typedef Kokkos::TeamPolicy<execution_space> tvt_policy_type;
 
         int team_size = thandle.get_team_size();
         if ( team_size == -1 ) {
-          team_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 128;
+          team_size = std::is_same< typename Kokkos::DefaultExecutionSpace::memory_space, Kokkos::HostSpace >::value ? 1 : 64;
         }
         int vector_size = thandle.get_team_size();
         if ( vector_size == -1 ) {
@@ -665,24 +2138,209 @@ void upper_tri_solve( TriSolveHandle & thandle, const RowMapType row_map, const
         }
 
         // This impl: "chunk" lvl_nodes into node_groups; a league_rank is responsible for processing that many nodes
-        //       TeamThreadRange over number of node_groups
-        //       To avoid masking threads, 1 thread (team) per node in node_group
+        //       TeamThreadRange over number nodes of node_groups
+        //       To avoid masking threads, 1 thread (team) per node in node_group (thread has full ownership of a node)
         //       ThreadVectorRange responsible for the actual solve computation
-        const int node_groups = team_size;
+        //const int node_groups = team_size;
+        const int node_groups = vector_size;
 
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+        TriLvlSchedTP2SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, false, node_count, vector_size, 0);
+#else
         UpperTriLvlSchedTP2SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count, node_groups);
+#endif
+
         Kokkos::parallel_for("parfor_u_team_vector", tvt_policy_type( (int)std::ceil((float)lvl_nodes/(float)node_groups) , team_size, vector_size ), tstf);
       } // end elseif
       */
 
       node_count += lvl_nodes;
 
+#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE)
+cudaProfilerStop();
+#endif
     } // end if
   } // end for lvl
 
 } // end upper_tri_solve
 
 
+template < class TriSolveHandle, class RowMapType, class EntriesType, class ValuesType, class RHSType, class LHSType >
+void tri_solve_chain(TriSolveHandle & thandle, const RowMapType row_map, const EntriesType entries, const ValuesType values, const RHSType & rhs, LHSType &lhs, const bool is_lowertri_) {
+
+#if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSPSTRSV_SOLVE_IMPL_PROFILE)
+cudaProfilerStop();
+#endif
+  typedef typename TriSolveHandle::execution_space execution_space;
+  typedef typename TriSolveHandle::size_type size_type;
+  typedef typename TriSolveHandle::nnz_lno_view_t NGBLType;
+
+  // Algorithm is checked before this function is called
+  auto h_chain_ptr = thandle.get_host_chain_ptr();
+  size_type num_chain_entries = thandle.get_num_chain_entries();
+
+  // Keep this a host View, create device version and copy to back to host during scheduling
+  // This requires making sure the host view in the handle is properly updated after the symbolic phase
+  auto nodes_per_level = thandle.get_nodes_per_level();
+  auto hnodes_per_level = thandle.get_host_nodes_per_level();
+
+  auto nodes_grouped_by_level = thandle.get_nodes_grouped_by_level();
+
+  const bool is_lowertri =  thandle.is_lower_tri();
+
+  size_type node_count = 0;
+
+// REFACTORED to cleanup; next, need debug and timer routines
+  using policy_type = Kokkos::TeamPolicy<execution_space>;
+  using large_cutoff_policy_type = Kokkos::TeamPolicy<LargerCutoffTag, execution_space>;
+/*
+  using TP1Functor = TriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType>;
+  using LTP1Functor = LowerTriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType>;
+  using UTP1Functor = UpperTriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType>;
+  using LSingleBlockFunctor = LowerTriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType>;
+  using USingleBlockFunctor = UpperTriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType>;
+*/
+  using SingleBlockFunctor = TriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType>;
+
+  int team_size = thandle.get_team_size();
+  int vector_size = thandle.get_vector_size() > 0 ? thandle.get_vector_size() : 1;
+
+  auto cutoff = thandle.get_chain_threshold();
+  int team_size_singleblock = team_size;
+
+  // Enumerate options
+  // ts -1,0 | cu 0 - select default ts == 1
+  // ts -1,0 | cu > 0 - select default ts; restriction: ts <= tsmax (auto)
+  // ts > 0 | cu 0 - set
+  // ts > 0 | cu > 0 - set
+  // Controls ts,cu > 0
+  // co > ts  - not all rows can be mapped to a thread - must call largercutoff impl
+  // co <= ts - okay, kernel must be careful not to access out-of-bounds; some threads idol
+  if (team_size_singleblock <= 0 && cutoff == 0) {
+    team_size_singleblock = 1;
+    // If cutoff == 0, no single-block calls will be made, team_size_singleblock is unimportant
+  }
+
+  // This is only necessary for Lower,UpperTri functor versions; else, is_lowertri can be passed as arg to the generic Tri functor...
+  if (is_lowertri) {
+
+    for ( size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink ) {
+      size_type schain = h_chain_ptr(chainlink);
+      size_type echain = h_chain_ptr(chainlink+1);
+
+      if ( echain - schain == 1 ) {
+
+        // if team_size is -1 (unset), get recommended size from Kokkos
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+        TriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, true, node_count);
+#else
+        LowerTriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count);
+#endif
+        if (team_size == - 1) {
+          team_size = policy_type(1, 1, vector_size).team_size_recommended(tstf, Kokkos::ParallelForTag());
+        }
+
+        size_type lvl_nodes = hnodes_per_level(schain); //lvl == echain????
+        Kokkos::parallel_for("parfor_l_team_chain1", policy_type(lvl_nodes , team_size, vector_size), tstf);
+        node_count += lvl_nodes;
+
+      }
+      else {
+        size_type lvl_nodes = 0;
+
+        for (size_type i = schain; i < echain; ++i) {
+          lvl_nodes += hnodes_per_level(i);
+        }
+
+        if (team_size_singleblock <= 0) {
+          team_size_singleblock = policy_type(1, 1, vector_size).team_size_recommended(SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri), Kokkos::ParallelForTag());
+        }
+
+        if (cutoff <= team_size_singleblock) {
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+          TriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, true);
+#else
+          LowerTriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain);
+#endif
+          Kokkos::parallel_for("parfor_l_team_chainmulti", policy_type(1, team_size_singleblock, vector_size), tstf);
+        }
+        else {
+          // team_size_singleblock < cutoff => kernel must allow for a block-stride internally
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+          TriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, true, 0, cutoff);
+#else
+          LowerTriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, cutoff);
+#endif
+          Kokkos::parallel_for("parfor_l_team_chainmulti_cutoff", large_cutoff_policy_type(1, team_size_singleblock, vector_size), tstf);
+        }
+        node_count += lvl_nodes;
+      }
+      Kokkos::fence(); // TODO - is this necessary? that is, can the parallel_for launch before the s/echain values have been updated?
+    }
+
+  }
+  else {
+
+    for ( size_type chainlink = 0; chainlink < num_chain_entries; ++chainlink ) {
+      size_type schain = h_chain_ptr(chainlink);
+      size_type echain = h_chain_ptr(chainlink+1);
+
+      if ( echain - schain == 1 ) {
+
+        // if team_size is -1 (unset), get recommended size from Kokkos
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+        TriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count);
+#else
+        UpperTriLvlSchedTP1SolverFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, node_count);
+#endif
+        if (team_size == - 1) {
+          team_size = policy_type(1, 1, vector_size).team_size_recommended(tstf, Kokkos::ParallelForTag());
+        }
+
+        // TODO To use cudagraph here, need to know how many non-unit chains there are, create a graph for each and launch accordingly
+        size_type lvl_nodes = hnodes_per_level(schain); //lvl == echain????
+        Kokkos::parallel_for("parfor_u_team_chain1", policy_type(lvl_nodes , team_size, vector_size), tstf);
+        node_count += lvl_nodes;
+
+      }
+      else {
+        size_type lvl_nodes = 0;
+
+        for (size_type i = schain; i < echain; ++i) {
+          lvl_nodes += hnodes_per_level(i);
+        }
+
+        if (team_size_singleblock <= 0) {
+          //team_size_singleblock = policy_type(1, 1, 1).team_size_recommended(SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, is_lowertri, node_count), Kokkos::ParallelForTag());
+          team_size_singleblock = policy_type(1, 1, vector_size).team_size_recommended(SingleBlockFunctor(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri), Kokkos::ParallelForTag());
+        }
+
+        if (cutoff <= team_size_singleblock) {
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+          TriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri);
+#else
+          UpperTriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain);
+#endif
+          Kokkos::parallel_for("parfor_u_team_chainmulti", policy_type(1, team_size_singleblock, vector_size), tstf);
+        }
+        else {
+          // team_size_singleblock < cutoff => kernel must allow for a block-stride internally
+#ifdef KOKKOSKERNELS_SPTRSV_TRILVLSCHED
+          TriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, is_lowertri, 0, cutoff);
+#else
+          UpperTriLvlSchedTP1SingleBlockFunctor<RowMapType, EntriesType, ValuesType, LHSType, RHSType, NGBLType> tstf(row_map, entries, values, lhs, rhs, nodes_grouped_by_level, nodes_per_level, node_count, schain, echain, cutoff);
+#endif
+          Kokkos::parallel_for("parfor_u_team_chainmulti_cutoff", large_cutoff_policy_type(1, team_size_singleblock, vector_size), tstf);
+        }
+        node_count += lvl_nodes;
+      }
+      Kokkos::fence(); // TODO - is this necessary? that is, can the parallel_for launch before the s/echain values have been updated?
+    }
+
+  }
+
+} // end tri_solve_chain
+
 } // namespace Experimental
 } // namespace Impl
 } // namespace KokkosSparse
diff --git a/src/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp b/src/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp
index 7375aa4afb..ce6f2bc599 100644
--- a/src/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp
+++ b/src/sparse/impl/KokkosSparse_sptrsv_solve_spec.hpp
@@ -102,6 +102,10 @@ struct sptrsv_solve_eti_spec_avail {
 namespace KokkosSparse {
 namespace Impl {
 
+#if defined(KOKKOS_ENABLE_CUDA) && 10000 < CUDA_VERSION && defined(KOKKOSKERNELS_ENABLE_EXP_CUDAGRAPH)
+  #define KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+#endif
+
 // Unification layer
 /// \brief Implementation of KokkosSparse::sptrsv_solve
 
@@ -155,19 +159,40 @@ struct SPTRSV_SOLVE<KernelHandle, RowMapType, EntriesType, ValuesType, BType, XT
                 XType x)
   {
     // Call specific algorithm type
+    using ExecSpace = typename RowMapType::memory_space::execution_space;
     auto sptrsv_handle = handle->get_sptrsv_handle();
 
     if ( sptrsv_handle->is_lower_tri() ) {
       if ( sptrsv_handle->is_symbolic_complete() == false ) {
         Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries);
       }
-      Experimental::lower_tri_solve( *sptrsv_handle, row_map, entries, values, b, x);
+      if ( sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN ) {
+        Experimental::tri_solve_chain( *sptrsv_handle, row_map, entries, values, b, x, true);
+      }
+      else {
+#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+        if ( std::is_same<ExecSpace, Kokkos::Cuda>::value)
+          Experimental::lower_tri_solve_cg( *sptrsv_handle, row_map, entries, values, b, x);
+        else
+#endif
+          Experimental::lower_tri_solve( *sptrsv_handle, row_map, entries, values, b, x);
+      }
     }
     else {
       if ( sptrsv_handle->is_symbolic_complete() == false ) {
         Experimental::upper_tri_symbolic(*sptrsv_handle, row_map, entries);
       }
-      Experimental::upper_tri_solve( *sptrsv_handle, row_map, entries, values, b, x);
+      if ( sptrsv_handle->get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN ) {
+        Experimental::tri_solve_chain( *sptrsv_handle, row_map, entries, values, b, x, false);
+      }
+      else {
+#ifdef KOKKOSKERNELS_SPTRSV_CUDAGRAPHSUPPORT
+        if ( std::is_same<ExecSpace, Kokkos::Cuda>::value)
+          Experimental::upper_tri_solve_cg( *sptrsv_handle, row_map, entries, values, b, x);
+        else
+#endif
+          Experimental::upper_tri_solve( *sptrsv_handle, row_map, entries, values, b, x);
+      }
     }
   }
 
diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp
index 2335f68dea..721b0377ac 100644
--- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp
+++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_impl.hpp
@@ -51,33 +51,138 @@
 #include <Kokkos_ArithTraits.hpp>
 #include <KokkosSparse_sptrsv_handle.hpp>
 
+//#define TRISOLVE_SYMB_TIMERS
 //#define LVL_OUTPUT_INFO
+//#define CHAIN_LVL_OUTPUT_INFO
+
+// TODO Pass values array and store diagonal entries - should this always be done or optional?
 
 namespace KokkosSparse {
 namespace Impl {
 namespace Experimental {
 
+template <class ViewType>
+void print_view1d_symbolic(const ViewType dv, size_t range = 0) {
+  auto v = Kokkos::create_mirror_view(dv);
+  Kokkos::deep_copy(v, dv);
+  std::cout << "Output for view " << v.label() << std::endl;
+  range = range == 0 ? dv.extent(0) : range;
+  for (size_t i = 0; i < range; ++i) {
+    std::cout << "v(" << i << ") = " << v(i) << " , ";
+  }
+  std::cout << std::endl;
+}
+
+
+// Usage:
+  // for c in [0, num_chain_entries)
+  //   s = h_chain_ptr(c); e = h_chain_ptr(c+1);
+  //   num_levels_in_current_chain = e - s;
+  //   if nlicc > 256
+  //     call current_alg
+  //   else
+  //     call single_block(s,e)
+
+template < class TriSolveHandle, class NPLViewType >
+void symbolic_chain_phase(TriSolveHandle &thandle, const NPLViewType &nodes_per_level) {
+
+#ifdef TRISOLVE_SYMB_TIMERS
+  Kokkos::Timer timer_sym_chain_total;
+#endif
+  typedef typename TriSolveHandle::size_type size_type;
 
-template < class TriSolveHandle, class RowMapType, class EntriesType >
-void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries) {
+  size_type nlevels = thandle.get_num_levels();
+
+  // Create the chain now
+  // FIXME Implementations will need to be templated on exec space it seems...
+ auto cutoff_threshold = thandle.get_chain_threshold();
+ if ( thandle.algm_requires_symb_chain() ) {
+  auto h_chain_ptr = thandle.get_host_chain_ptr();
+  h_chain_ptr(0) = 0;
+  size_type chainlinks_length = 0;
+  size_type num_chain_entries = 0;
+  int chain_state = 0;
+  const int cutoff = cutoff_threshold;
+  for ( size_type i = 0; i < nlevels; ++i ) {
+    auto cnpl = nodes_per_level(i);
+    if (cnpl <= cutoff) {
+      // this nlevels may be part of a chain passed to the "single_block" solver to reduce kernel launches
+      chainlinks_length += 1;
+    }
+    else {
+      // Too many levels to run on single block...
+      // If first lvl <= cutoff but next nlevels isn't, the two aren't separately updated and info is lost...
+      // if chainlinks_length > 0, take path so that chain-links updated, then current too large chain updated (i.e. 2 updates); if chainlinks_length == 0, then no previous chains and only one update required (npl too large for single-block
+      chain_state = chainlinks_length > 0 ? 2 : 1;
+    }
+
+    // if we hit final nlevels before a trigger to update the chain, than override it 
+    // in this case, there was not a larger value to miss cutoff and reset the update
+    if ( chain_state == 0 && i == nlevels-1 ) { chain_state = 1; }
+
+    if (chain_state == 1) {
+      num_chain_entries += 1;
+      if (chainlinks_length == 0) {
+        h_chain_ptr(num_chain_entries) = h_chain_ptr(num_chain_entries-1) + 1;
+      }
+      else {
+        h_chain_ptr(num_chain_entries) = h_chain_ptr(num_chain_entries-1) + chainlinks_length;
+      }
+      chainlinks_length = 0; //reset
+      chain_state = 0; //reset
+    }
+    // Two updates required - should only occur if chainlinks_length > 0
+    // We have found two things: a non-one length chain, and a subsequent one length chain
+    if (chain_state == 2) {
+      if (chainlinks_length == 0) { std::runtime_error("MAJOR LOGIC ERROR! TERMINATE!"); }
+
+      num_chain_entries += 1;
+      h_chain_ptr(num_chain_entries) = h_chain_ptr(num_chain_entries-1) + chainlinks_length;
+
+      num_chain_entries += 1;
+      h_chain_ptr(num_chain_entries) = h_chain_ptr(num_chain_entries-1) + 1;
+
+      chainlinks_length = 0; //reset
+      chain_state = 0; //reset
+    }
+  }
+  thandle.set_num_chain_entries(num_chain_entries);
+
+#ifdef CHAIN_LVL_OUTPUT_INFO
+  std::cout << "  num_chain_entries = " << thandle.get_num_chain_entries() << std::endl;
+  for ( size_type i = 0; i < num_chain_entries+1; ++i )
+  {
+    std::cout << "chain_ptr(" << i << "): " << h_chain_ptr(i) << std::endl;
+  }
+#endif
+ }
 
- if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP ||
-      thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1 )
-/*   || thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 )*/
+#ifdef TRISOLVE_SYMB_TIMERS
+ std::cout << "  Symbolic Chain Phase Total Time: " << timer_sym_chain_total.seconds() << std::endl;;
+#endif
+} // end symbolic_chain_phase
+
+
+template < class TriSolveHandle, class RowMapType, class EntriesType >
+void lower_tri_symbolic (TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries) {
+#ifdef TRISOLVE_SYMB_TIMERS
+  Kokkos::Timer timer_sym_lowertri_total;
+#endif
+ if ( thandle.algm_requires_symb_lvlsched() )
  {
-  // Scheduling currently compute on host - need host copy of all views
+  // Scheduling currently computes on host - need host copy of all views
 
   typedef typename TriSolveHandle::size_type size_type;
 
   typedef typename TriSolveHandle::nnz_lno_view_t  DeviceEntriesType;
-  typedef typename TriSolveHandle::nnz_lno_view_t::HostMirror HostEntriesType;
 
   typedef typename TriSolveHandle::signed_nnz_lno_view_t DeviceSignedEntriesType;
   typedef typename TriSolveHandle::signed_nnz_lno_view_t::HostMirror HostSignedEntriesType;
 
   typedef typename TriSolveHandle::signed_integral_t signed_integral_t;
 
-  size_type nrows = thandle.get_nrows();
+  // Necessary for partitioned persisting sparse matrix
+  size_type nrows = drow_map.extent(0)-1;
 
   auto row_map = Kokkos::create_mirror_view(drow_map);
   Kokkos::deep_copy(row_map, drow_map);
@@ -85,13 +190,13 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
   auto entries = Kokkos::create_mirror_view(dentries);
   Kokkos::deep_copy(entries, dentries);
   
+  // get device view - will deep_copy to it at end of this host routine
   DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level();
-  HostEntriesType nodes_per_level = Kokkos::create_mirror_view(dnodes_per_level);
-  Kokkos::deep_copy(nodes_per_level, dnodes_per_level);
+  auto nodes_per_level = thandle.get_host_nodes_per_level();
 
+  // get device view - will deep_copy to it at end of this host routine
   DeviceEntriesType dnodes_grouped_by_level = thandle.get_nodes_grouped_by_level();
-  HostEntriesType nodes_grouped_by_level = Kokkos::create_mirror_view(dnodes_grouped_by_level);
-  Kokkos::deep_copy(nodes_grouped_by_level, dnodes_grouped_by_level);
+  auto nodes_grouped_by_level = thandle.get_host_nodes_grouped_by_level();
 
   DeviceSignedEntriesType dlevel_list = thandle.get_level_list();
   HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list);
@@ -100,19 +205,21 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
   HostSignedEntriesType previous_level_list( Kokkos::ViewAllocateWithoutInitializing("previous_level_list"), nrows );
   Kokkos::deep_copy( previous_level_list, signed_integral_t(-1) );
 
+  const bool stored_diagonal = thandle.is_stored_diagonal();
+  // diagonal_offsets is uninitialized - deep_copy unnecessary at the beginning, only needed at the end
+  auto diagonal_offsets = thandle.get_diagonal_offsets();
+  auto hdiagonal_offsets = thandle.get_host_diagonal_offsets();
 
-  // node 0 is trivially independent in lower tri solve, start with it in level 0
   size_type level = 0;
   auto starting_node = 0;
-  level_list(starting_node) = 0;
-  size_type node_count = 1; //lower tri: starting with node 0 already in level 0
+  auto ending_node = nrows;
 
-  nodes_per_level(0) = 1;
-  nodes_grouped_by_level(0) = starting_node;
+  size_type node_count = 0;
 
   while (node_count < nrows) {
 
-    for ( size_type row = 1; row < nrows; ++row ) { // row 0 already included
+    for ( size_type row = starting_node; row < ending_node; ++row )
+    {
       if ( level_list(row) == -1 ) { // unmarked
         bool is_root = true;
         signed_integral_t ptrstart = row_map(row);
@@ -126,6 +233,14 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
               break;
             }
           }
+          else if ( col == row ) {
+            if (stored_diagonal)
+              hdiagonal_offsets(row) = offset;
+          }
+          else if ( col > row ) {
+            std::cout << "\nrow = " << row << "  col = " << col << "  offset = " << offset << std::endl;
+            std::runtime_error("SYMB ERROR: Lower tri with colid > rowid - SHOULD NOT HAPPEN!!!");
+          }
         } // end for offset , i.e. cols of this row
 
         if ( is_root == true ) {
@@ -146,9 +261,15 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
     level += 1;
   } // end while
 
-  thandle.set_symbolic_complete();
   thandle.set_num_levels(level);
 
+  // Create the chain now
+  if ( thandle.algm_requires_symb_chain() ) {
+    symbolic_chain_phase(thandle, nodes_per_level);
+  }
+
+  thandle.set_symbolic_complete();
+
   // Output check
 #ifdef LVL_OUTPUT_INFO
   std::cout << "  set symbolic complete: " << thandle.is_symbolic_complete() << std::endl;
@@ -165,33 +286,62 @@ void lower_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
   { std::cout << "i: " << i << "  nodes_grouped_by_level = " << nodes_grouped_by_level(i) << std::endl; }
 #endif
 
+  // Deep copy to device views
   Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level);
   Kokkos::deep_copy(dnodes_per_level, nodes_per_level);
   Kokkos::deep_copy(dlevel_list, level_list);
+  if (stored_diagonal)
+    Kokkos::deep_copy(diagonal_offsets, hdiagonal_offsets);
+
+  // Extra check:
+#ifdef LVL_OUTPUT_INFO
+  {
+  std::cout << "  End symb - extra checks" << std::endl;
+  std::cout << "  node_count = " << node_count << std::endl;
+  std::cout << "  nlevel = " << level << std::endl;
+  std::cout << "  npl.extent = " << nodes_per_level.extent(0) << std::endl;
+  long check_count = 0;
+  Kokkos::parallel_reduce("check_count host", Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, nodes_per_level.extent(0)),
+    KOKKOS_LAMBDA (const long i, long& update) {
+      update+=nodes_per_level(i);
+    }, check_count);
+  std::cout << "  host check_count= " << check_count << std::endl;
+
+  check_count = 0; // reset
+  Kokkos::parallel_reduce("check_count device", Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, dnodes_per_level.extent(0)),
+    KOKKOS_LAMBDA (const long i, long& update) {
+      update+=dnodes_per_level(i);
+    }, check_count);
+  std::cout << "  devicecheck_count= " << check_count << std::endl;
+  }
+#endif
  }
-} // end lowertri_level_sched
+
+#ifdef TRISOLVE_SYMB_TIMERS
+ std::cout << "  Symbolic (lower tri) Total Time: " << timer_sym_lowertri_total.seconds() << std::endl;;
+#endif
+} // end lower_tri_symbolic
 
 
 template < class TriSolveHandle, class RowMapType, class EntriesType >
 void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, const EntriesType dentries ) {
+#ifdef TRISOLVE_SYMB_TIMERS
+  Kokkos::Timer timer_sym_uppertri_total;
+#endif
 
- if ( thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_RP ||
-      thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1 )
-/*   || thandle.get_algorithm() == KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHED_TP2 )*/
+ if ( thandle.algm_requires_symb_lvlsched() )
  {
   // Scheduling currently compute on host - need host copy of all views
 
   typedef typename TriSolveHandle::size_type size_type;
-
   typedef typename TriSolveHandle::nnz_lno_view_t  DeviceEntriesType;
-  typedef typename TriSolveHandle::nnz_lno_view_t::HostMirror HostEntriesType;
-
   typedef typename TriSolveHandle::signed_nnz_lno_view_t DeviceSignedEntriesType;
   typedef typename TriSolveHandle::signed_nnz_lno_view_t::HostMirror HostSignedEntriesType;
-
   typedef typename TriSolveHandle::signed_integral_t signed_integral_t;
 
-  size_type nrows = thandle.get_nrows();
+//  size_type nrows = thandle.get_nrows();
+  // Necessary for partitioned persisting sparse matrix
+  size_type nrows = drow_map.extent(0)-1;
 
   auto row_map = Kokkos::create_mirror_view(drow_map);
   Kokkos::deep_copy(row_map, drow_map);
@@ -199,13 +349,13 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
   auto entries = Kokkos::create_mirror_view(dentries);
   Kokkos::deep_copy(entries, dentries);
   
+  // get device view - will deep_copy to it at end of this host routine
   DeviceEntriesType dnodes_per_level = thandle.get_nodes_per_level();
-  HostEntriesType nodes_per_level = Kokkos::create_mirror_view(dnodes_per_level);
-  Kokkos::deep_copy(nodes_per_level, dnodes_per_level);
+  auto nodes_per_level = thandle.get_host_nodes_per_level();
 
+  // get device view - will deep_copy to it at end of this host routine
   DeviceEntriesType dnodes_grouped_by_level = thandle.get_nodes_grouped_by_level();
-  HostEntriesType nodes_grouped_by_level = Kokkos::create_mirror_view(dnodes_grouped_by_level);
-  Kokkos::deep_copy(nodes_grouped_by_level, dnodes_grouped_by_level);
+  auto nodes_grouped_by_level = thandle.get_host_nodes_grouped_by_level();
 
   DeviceSignedEntriesType dlevel_list = thandle.get_level_list();
   HostSignedEntriesType level_list = Kokkos::create_mirror_view(dlevel_list);
@@ -214,19 +364,21 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
   HostSignedEntriesType previous_level_list( Kokkos::ViewAllocateWithoutInitializing("previous_level_list"), nrows);
   Kokkos::deep_copy( previous_level_list, signed_integral_t(-1) );
 
+  const bool stored_diagonal = thandle.is_stored_diagonal();
+  // diagonal_offsets is uninitialized - deep_copy unnecessary at the beginning, only needed at the end
+  auto diagonal_offsets = thandle.get_diagonal_offsets();
+  auto hdiagonal_offsets = thandle.get_host_diagonal_offsets();
 
-  // final row is trivially independent in upper tri solve, start with it in level 0
   size_type level = 0;
   auto starting_node = nrows - 1;
-  level_list(starting_node) = 0;
-  size_type node_count = 1; //upper tri: starting with node n already in level 0
+  auto ending_node = 0;
 
-  nodes_per_level(0) = 1;
-  nodes_grouped_by_level(0) = starting_node;
+  size_type node_count = 0;
 
   while (node_count < nrows) {
 
-    for ( signed_integral_t row = nrows-2; row >= 0; --row ) { // row 0 already included
+    for ( signed_integral_t row = starting_node; row >= ending_node; --row )
+    {
       if ( level_list(row) == -1 ) { // unmarked
         bool is_root = true;
         signed_integral_t ptrstart = row_map(row);
@@ -234,12 +386,17 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
 
         for (signed_integral_t offset = ptrend-1; offset >= ptrstart; --offset) {
           signed_integral_t col = entries(offset);
-          if ( previous_level_list(col) == -1 && col != row ) { // unmarked
+
+          if (previous_level_list(col) == -1 && col != row) { // unmarked
             if ( col > row ) {
               is_root = false;
               break;
             }
           }
+          else if ( col == row ) {
+            if (stored_diagonal)
+              hdiagonal_offsets(row) = offset;
+          }
         } // end for offset , i.e. cols of this row
 
         if ( is_root == true ) {
@@ -260,9 +417,15 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
     level += 1;
   } // end while
 
-  thandle.set_symbolic_complete();
   thandle.set_num_levels(level);
 
+  // Create the chain now
+  if ( thandle.algm_requires_symb_chain() ) {
+    symbolic_chain_phase(thandle, nodes_per_level);
+  }
+
+  thandle.set_symbolic_complete();
+
   // Output check
 #ifdef LVL_OUTPUT_INFO
   std::cout << "  set symbolic complete: " << thandle.is_symbolic_complete() << std::endl;
@@ -279,15 +442,54 @@ void upper_tri_symbolic ( TriSolveHandle &thandle, const RowMapType drow_map, co
   { std::cout << "i: " << i << "  nodes_grouped_by_level = " << nodes_grouped_by_level(i) << std::endl; }
 #endif
 
+  // Deep copy to device views
   Kokkos::deep_copy(dnodes_grouped_by_level, nodes_grouped_by_level);
   Kokkos::deep_copy(dnodes_per_level, nodes_per_level);
   Kokkos::deep_copy(dlevel_list, level_list);
+  if (stored_diagonal)
+    Kokkos::deep_copy(diagonal_offsets, hdiagonal_offsets);
+
+  // Extra check:
+#ifdef LVL_OUTPUT_INFO
+  {
+  std::cout << "  End symb - extra checks" << std::endl;
+  std::cout << "  node_count = " << node_count << std::endl;
+  std::cout << "  nlevel = " << level << std::endl;
+  std::cout << "  npl.extent = " << nodes_per_level.extent(0) << std::endl;
+  long check_count = 0;
+  Kokkos::parallel_reduce("check_count host", Kokkos::RangePolicy<Kokkos::DefaultHostExecutionSpace>(0, nodes_per_level.extent(0)),
+    KOKKOS_LAMBDA (const long i, long& update) {
+      update+=nodes_per_level(i);
+    }, check_count);
+  std::cout << "  host check_count= " << check_count << std::endl;
+
+  check_count = 0; // reset
+  Kokkos::parallel_reduce("check_count device", Kokkos::RangePolicy<Kokkos::DefaultExecutionSpace>(0, dnodes_per_level.extent(0)),
+    KOKKOS_LAMBDA (const long i, long& update) {
+      update+=dnodes_per_level(i);
+    }, check_count);
+  std::cout << "  devicecheck_count= " << check_count << std::endl;
+  }
+#endif
+
  }
-} // end uppertri_level_sched
+
+#ifdef TRISOLVE_SYMB_TIMERS
+ std::cout << "  Symbolic (upper tri) Total Time: " << timer_sym_uppertri_total.seconds() << std::endl;;
+#endif
+} // end upper_tri_symbolic
 
 
 } // namespace Experimental
 } // namespace Impl
 } // namespace KokkosSparse
 
+#ifdef LVL_OUTPUT_INFO
+#undef LVL_OUTPUT_INFO
+#endif
+
+#ifdef CHAIN_LVL_OUTPUT_INFO
+#undef CHAIN_LVL_OUTPUT_INFO
+#endif
+
 #endif
diff --git a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp
index f0103a2171..e960a404c1 100644
--- a/src/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp
+++ b/src/sparse/impl/KokkosSparse_sptrsv_symbolic_spec.hpp
@@ -124,6 +124,8 @@ struct SPTRSV_SYMBOLIC<KernelHandle, RowMapType, EntriesType, false, KOKKOSKERNE
                    const EntriesType entries)
   {
     auto sptrsv_handle = handle->get_sptrsv_handle();
+    auto nrows = row_map.extent(0)-1;
+    sptrsv_handle->new_init_handle(nrows);
 
     if ( sptrsv_handle->is_lower_tri() ) {
       Experimental::lower_tri_symbolic(*sptrsv_handle, row_map, entries);
diff --git a/unit_test/sparse/Test_Sparse_sptrsv.hpp b/unit_test/sparse/Test_Sparse_sptrsv.hpp
index 27b447cd2c..a5113db472 100644
--- a/unit_test/sparse/Test_Sparse_sptrsv.hpp
+++ b/unit_test/sparse/Test_Sparse_sptrsv.hpp
@@ -52,7 +52,7 @@
 #include "KokkosKernels_SparseUtils.hpp"
 #include "KokkosSparse_spmv.hpp"
 #include "KokkosSparse_CrsMatrix.hpp"
-#include<KokkosKernels_IOUtils.hpp>
+#include <KokkosKernels_IOUtils.hpp>
 
 #include "KokkosSparse_sptrsv.hpp"
 
@@ -344,7 +344,7 @@ void run_test_sptrsv() {
 
   typedef Kokkos::View< size_type*, device >  RowMapType;
   typedef Kokkos::View< lno_t*, device >      EntriesType;
-  typedef Kokkos::View< scalar_t*, device >     ValuesType;
+  typedef Kokkos::View< scalar_t*, device >   ValuesType;
 
   // Lower tri
   {
@@ -391,10 +391,6 @@ void run_test_sptrsv() {
     typedef KokkosKernels::Experimental::KokkosKernelsHandle <size_type, lno_t, scalar_t,
       typename device::execution_space, typename device::memory_space,typename device::memory_space > KernelHandle;
 
-    KernelHandle kh;
-    bool is_lower_tri = true;
-    kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
-    
     // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs
     ValuesType known_lhs("known_lhs", nrows);
     // Create known solution lhs set to all 1's
@@ -410,60 +406,107 @@ void run_test_sptrsv() {
     crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries);
     KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs);
 
-    sptrsv_symbolic( &kh, row_map, entries );
-    Kokkos::fence();
-
-    kh.get_sptrsv_handle()->print_algorithm();
-    sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
-    Kokkos::fence();
-
-    scalar_t sum = 0.0;
-    Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
-    if ( sum != lhs.extent(0) ) {
-      std::cout << "Lower Tri Solve FAILURE" << std::endl;
-    }
-    else {
-      std::cout << "Lower Tri Solve SUCCESS!" << std::endl;
+    {
+      KernelHandle kh;
+      bool is_lower_tri = true;
+      kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
+
+      sptrsv_symbolic( &kh, row_map, entries );
+      Kokkos::fence();
+
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      scalar_t sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Lower Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+
+      Kokkos::deep_copy(lhs, 0);
+      kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP);
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Lower Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+
+      //FIXME Issues with various integral type combos - algorithm currently unavailable and commented out until fixed
+      /*
+      Kokkos::deep_copy(lhs, 0);
+      kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2);
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Lower Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+      */
+
+      kh.destroy_sptrsv_handle();
     }
-    EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
-
-    Kokkos::deep_copy(lhs, 0);
-    kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP);
-    kh.get_sptrsv_handle()->print_algorithm();
-    sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
-    Kokkos::fence();
 
-    sum = 0.0;
-    Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
-    if ( sum != lhs.extent(0) ) {
-      std::cout << "Lower Tri Solve FAILURE" << std::endl;
-    }
-    else {
-      std::cout << "Lower Tri Solve SUCCESS!" << std::endl;
+    {
+      Kokkos::deep_copy(lhs, 0);
+      KernelHandle kh;
+      bool is_lower_tri = true;
+      kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, is_lower_tri);
+      auto chain_threshold = 1;
+      kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold);
+
+      sptrsv_symbolic( &kh, row_map, entries );
+      Kokkos::fence();
+
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      scalar_t sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Lower Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+
+      kh.destroy_sptrsv_handle();
     }
-    EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
-
-/*
-//FIXME Issues with various integral type combos - algorithm currently unavailable and commented out until fixed
-    Kokkos::deep_copy(lhs, 0);
-    kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2);
-    kh.get_sptrsv_handle()->print_algorithm();
-    sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
-    Kokkos::fence();
 
-    sum = 0.0;
-    Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
-    if ( sum != lhs.extent(0) ) {
-      std::cout << "Lower Tri Solve FAILURE" << std::endl;
-    }
-    else {
-      std::cout << "Lower Tri Solve SUCCESS!" << std::endl;
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    if (std::is_same<size_type,int>::value && std::is_same<lno_t,int>::value)
+    {
+      Kokkos::deep_copy(lhs, 0);
+      KernelHandle kh;
+      bool is_lower_tri = true;
+      kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, is_lower_tri);
+
+      sptrsv_symbolic(&kh, row_map, entries, values);
+      Kokkos::fence();
+
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      scalar_t sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Lower Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+
+      kh.destroy_sptrsv_handle();
     }
-    EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
-*/
-
-
-    kh.destroy_sptrsv_handle();
+#endif
   }
   // Upper tri
   {
@@ -510,10 +553,6 @@ void run_test_sptrsv() {
     typedef KokkosKernels::Experimental::KokkosKernelsHandle <size_type, lno_t, scalar_t,
       typename device::execution_space, typename device::memory_space,typename device::memory_space > KernelHandle;
 
-    KernelHandle kh;
-    bool is_lower_tri = false;
-    kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
-    
     // Create known_lhs, generate rhs, then solve for lhs to compare to known_lhs
     ValuesType known_lhs("known_lhs", nrows);
     // Create known solution lhs set to all 1's
@@ -529,60 +568,108 @@ void run_test_sptrsv() {
     crsMat_t triMtx("triMtx", nrows, nrows, nnz, values, row_map, entries);
     KokkosSparse::spmv( "N", ONE, triMtx, known_lhs, ZERO, rhs);
 
-    sptrsv_symbolic( &kh, row_map, entries );
-    Kokkos::fence();
-
-    kh.get_sptrsv_handle()->print_algorithm();
-    sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
-    Kokkos::fence();
-
-    scalar_t sum = 0.0;
-    Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
-    if ( sum != lhs.extent(0) ) {
-      std::cout << "Upper Tri Solve FAILURE" << std::endl;
+    {
+      KernelHandle kh;
+      bool is_lower_tri = false;
+      kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1, nrows, is_lower_tri);
+
+      sptrsv_symbolic( &kh, row_map, entries );
+      Kokkos::fence();
+
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      scalar_t sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Upper Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+
+      Kokkos::deep_copy(lhs, 0);
+      kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP);
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Upper Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+
+      //FIXME Issues with various integral type combos - algorithm currently unavailable and commented out until fixed
+      /*
+      Kokkos::deep_copy(lhs, 0);
+      kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2);
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Upper Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+      */
+
+
+      kh.destroy_sptrsv_handle();
     }
-    else {
-      std::cout << "Upper Tri Solve SUCCESS!" << std::endl;
-    }
-    EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
 
-    Kokkos::deep_copy(lhs, 0);
-    kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHD_RP);
-    kh.get_sptrsv_handle()->print_algorithm();
-    sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
-    Kokkos::fence();
-
-    sum = 0.0;
-    Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
-    if ( sum != lhs.extent(0) ) {
-      std::cout << "Upper Tri Solve FAILURE" << std::endl;
-    }
-    else {
-      std::cout << "Upper Tri Solve SUCCESS!" << std::endl;
+    {
+      Kokkos::deep_copy(lhs, 0);
+      KernelHandle kh;
+      bool is_lower_tri = false;
+      kh.create_sptrsv_handle(SPTRSVAlgorithm::SEQLVLSCHD_TP1CHAIN, nrows, is_lower_tri);
+      auto chain_threshold = 1;
+      kh.get_sptrsv_handle()->reset_chain_threshold(chain_threshold);
+
+      sptrsv_symbolic( &kh, row_map, entries );
+      Kokkos::fence();
+
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      scalar_t sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Upper Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+
+      kh.destroy_sptrsv_handle();
     }
-    EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
-
-/*
-//FIXME Issues with various integral type combos - algorithm currently unavailable and commented out until fixed
-    Kokkos::deep_copy(lhs, 0);
-    kh.get_sptrsv_handle()->set_algorithm(SPTRSVAlgorithm::SEQLVLSCHED_TP2);
-    kh.get_sptrsv_handle()->print_algorithm();
-    sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
-    Kokkos::fence();
 
-    sum = 0.0;
-    Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
-    if ( sum != lhs.extent(0) ) {
-      std::cout << "Upper Tri Solve FAILURE" << std::endl;
-    }
-    else {
-      std::cout << "Upper Tri Solve SUCCESS!" << std::endl;
+#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
+    if (std::is_same<size_type,int>::value && std::is_same<lno_t,int>::value)
+    {
+      Kokkos::deep_copy(lhs, 0);
+      KernelHandle kh;
+      bool is_lower_tri = false;
+      kh.create_sptrsv_handle(SPTRSVAlgorithm::SPTRSV_CUSPARSE, nrows, is_lower_tri);
+
+      sptrsv_symbolic(&kh, row_map, entries, values);
+      Kokkos::fence();
+
+      sptrsv_solve( &kh, row_map, entries, values, rhs, lhs );
+      Kokkos::fence();
+
+      scalar_t sum = 0.0;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<typename device::execution_space>(0, lhs.extent(0)), ReductionCheck<ValuesType, scalar_t, lno_t>(lhs), sum);
+      if ( sum != lhs.extent(0) ) {
+        std::cout << "Upper Tri Solve FAILURE" << std::endl;
+        kh.get_sptrsv_handle()->print_algorithm();
+      }
+      EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
+
+      kh.destroy_sptrsv_handle();
     }
-    EXPECT_TRUE( sum == scalar_t(lhs.extent(0)) );
-*/
-
-
-    kh.destroy_sptrsv_handle();
+#endif
   }
 
 }