From d1a6ef3ef935634b61011ad132a6a6f2924cf7a3 Mon Sep 17 00:00:00 2001 From: Thomas Herault Date: Wed, 6 Sep 2023 13:57:26 -0400 Subject: [PATCH 01/17] User needs to include any std:: serialization they use --- examples/potrf/testing_dlauum.cc | 1 + examples/potrf/testing_dpoinv.cc | 1 + examples/potrf/testing_dpotrf.cc | 1 + examples/potrf/testing_dtrtri.cc | 1 + 4 files changed, 4 insertions(+) diff --git a/examples/potrf/testing_dlauum.cc b/examples/potrf/testing_dlauum.cc index 51dc7f81a..717600094 100644 --- a/examples/potrf/testing_dlauum.cc +++ b/examples/potrf/testing_dlauum.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "lauum.h" #include "plgsy.h" diff --git a/examples/potrf/testing_dpoinv.cc b/examples/potrf/testing_dpoinv.cc index 550e6435d..a1e419cc8 100644 --- a/examples/potrf/testing_dpoinv.cc +++ b/examples/potrf/testing_dpoinv.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "plgsy.h" #include "pmw.h" diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 5cef0bd20..d5686042f 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -1,4 +1,5 @@ #include +#include #include "plgsy.h" #include "pmw.h" diff --git a/examples/potrf/testing_dtrtri.cc b/examples/potrf/testing_dtrtri.cc index 4a24dec4a..107fbf8c2 100644 --- a/examples/potrf/testing_dtrtri.cc +++ b/examples/potrf/testing_dtrtri.cc @@ -6,6 +6,7 @@ #endif // TTG_USE_PARSEC #include +#include #include "plgsy.h" #include "pmw.h" From 7576321f426a18f60d4c46a1a7f197202fea8935 Mon Sep 17 00:00:00 2001 From: Thomas Herault Date: Fri, 29 Sep 2023 12:12:05 -0400 Subject: [PATCH 02/17] Work in progress: adding CUDA version of POTRF task in POTRF example -- not compiling --- examples/matrixtile.h | 46 ++++++++++------------ examples/potrf/potrf.h | 86 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 104 insertions(+), 28 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index e23007803..b3e5a3b8e 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -6,12 +6,13 @@ #include -template -class MatrixTile { +template +class MatrixTile : public ttg::TTValue> { public: using metadata_t = typename std::tuple; - using pointer_t = typename std::shared_ptr; + using pointer_t = typename ttg::buffer; + using ttvalue_type = ttg::TTValue>; private: pointer_t _data; @@ -20,18 +21,18 @@ class MatrixTile { // (Re)allocate the tile memory void realloc() { // std::cout << "Reallocating new tile" << std::endl; - _data = std::shared_ptr(new T[_lda * _cols], [](T* p) { delete[] p; }); + _data = ttg::buffer(_data, _lda * _cols); } public: MatrixTile() {} - MatrixTile(int rows, int cols, int lda) : _rows(rows), _cols(cols), _lda(lda) { realloc(); } + MatrixTile(int rows, int cols, int lda) : ttvalue_type(), _rows(rows), _cols(cols), _lda(lda) { realloc(); } MatrixTile(const metadata_t& metadata) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {} - MatrixTile(int rows, int cols, pointer_t data, int lda) : _data(data), _rows(rows), _cols(cols), _lda(lda) {} + MatrixTile(int rows, int cols, pointer_t data, int lda) : ttvalue_type(), _data(data), _rows(rows), _cols(cols), _lda(lda) {} MatrixTile(const metadata_t& metadata, pointer_t data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {} @@ -40,38 +41,20 @@ class MatrixTile { * Constructor with outside memory. 
The tile will *not* delete this memory * upon destruction. */ - MatrixTile(int rows, int cols, T* data, int lda) : _data(data, [](T*) {}), _rows(rows), _cols(cols), _lda(lda) {} + MatrixTile(int rows, int cols, T* data, int lda) : ttvalue_type(), _data(data), _rows(rows), _cols(cols), _lda(lda) {} MatrixTile(const metadata_t& metadata, T* data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {} -#if 0 - /* Copy dtor and operator with a static_assert to catch unexpected copying */ - MatrixTile(const MatrixTile& other) { - static_assert("Oops, copy ctor called?!"); - } - - MatrixTile& operator=(const MatrixTile& other) { - static_assert("Oops, copy ctor called?!"); - } -#endif - MatrixTile(MatrixTile&& other) = default; MatrixTile& operator=(MatrixTile&& other) = default; -#if 0 - /* Defaulted copy ctor and op for shallow copies, see comment below */ - MatrixTile(const MatrixTile& other) = default; - - MatrixTile& operator=(const MatrixTile& other) = default; -#endif // 0 /* Deep copy ctor und op are not needed for PO since tiles will never be read * and written concurrently. Hence shallow copies are enough, will all * receiving tasks sharing tile data. Re-enable this once the PaRSEC backend * can handle data sharing without excessive copying */ -#if 1 - MatrixTile(const MatrixTile& other) : _rows(other._rows), _cols(other._cols), _lda(other._lda) { + MatrixTile(const MatrixTile& other) : ttvalue_type(), _rows(other._rows), _cols(other._cols), _lda(other._lda) { this->realloc(); std::copy_n(other.data(), _lda * _cols, this->data()); } @@ -84,7 +67,6 @@ class MatrixTile { std::copy_n(other.data(), _lda * _cols, this->data()); return *this; } -#endif // 1 void set_metadata(metadata_t meta) { _rows = std::get<0>(meta); @@ -144,6 +126,16 @@ class MatrixTile { } }; +#if defined(TTG_HAVE_CUDA) +using blk_t = DeviceTensor>, + btas::Handle::shared_ptr>>; +#else +// TODO: no hip pinned allocator in TA? 
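// A possible answer to the TODO above (sketch, not part of this patch): TiledArray
// only ships a CUDA pinned allocator, but a std::allocator-compatible HIP one could
// wrap hipHostMalloc/hipHostFree along these lines (`hip_pinned_allocator` is a
// hypothetical name):
//
//   template <typename T>
//   struct hip_pinned_allocator {
//     using value_type = T;
//     T* allocate(std::size_t n) {
//       void* p = nullptr;
//       if (hipSuccess != hipHostMalloc(&p, n * sizeof(T), hipHostMallocDefault))
//         throw std::bad_alloc();
//       return static_cast<T*>(p);   // page-locked host memory, fast H2D/D2H copies
//     }
//     void deallocate(T* p, std::size_t) { hipHostFree(p); }
//   };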
+using blk_t = DeviceTensor, btas::Handle::shared_ptr>>; +#endif + namespace ttg { template diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 6079bc6bc..e536c655a 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -19,7 +19,90 @@ namespace potrf { ttg::Edge>& input, ttg::Edge>& output_trsm, ttg::Edge>& output_result) { - using T = typename MatrixT::element_type; + using T = typename MatrixT::element_type; +#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + static int device_potrf_workspace_size(blk_t &A) { + int Lwork; + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf_bufferSize(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, A.extent(1), + nullptr, A.extent(0), + &Lwork); + return Lwork; + #elif defined(TTG_HAVE_HIPBLAS) + #error TBCoded + #else + return 0; + #endif + } + + static void device_potrf(blk_t &A, double *workspace, int Lwork, int *devInfo) { + int device = A.b.get_current_device(); + assert(device != 0); + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, A.extent(1), + A.b.device_ptr_on(device), A.extent(0), + workspace, Lwork, + devInfo); + #elif defined(TTG_HAVE_HIPBLAS) + hipsolverDpotrf(ttg::detail::hipblas_get_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.extent(1), + A.b.device_ptr_on(device), A.extent(0), + workspace, Lwork, + devInfo); + #endif + } + + auto f_dev = [=](const Key1& key, MatrixTile&& A, + std::tuple>, ttg::Out>>& out) -> ttg::device_task { + const auto K = key[0]; + + /* pull the matrix onto the device, as computing the workspace size might in theory depend on the data */ + //TODO: extend MatrixTile to be heterogeneous-aware. Look at spmm-cuda.cc 50-253 + // Need to include a ttg::buffer _data instead of a shared_ptr; + // Check pmw.h: when we generate the MatrixTile + // Also check pinned allocator at the end of DeviceTensor (250-253) + + int Lwork = device_potrf_workspace_size(A); + + // Instead of using scratch here, we should have hostWS and hostInfo globals and use to_device + // this would reduce the number of I/O operations to devices + double hostWS[Lwork]; + ttg::devicescratch devWS = ttg::make_scratch(hostWS, ttg::scope::Allocate); + int hostInfo = -1; + ttg::devicescratch devInfo = ttg::make_scratch(&hostInfo, ttg::scope::Allocate); + + /* the workspace and the devInfo must be device-level pointers */ + co_await ttg::to_device(A._data, devWS, devInfo); + + /* everything is on the device, call the POTRF */ + device_potrf(A, devWS, Lwork, devInfo); + + /* compute successors while the kernel is running */ + std::vector keylist; + keylist.reserve(A.rows() - K); + /* TODO: reverse order of arrays */ + for (int m = K + 1; m < A.rows(); ++m) { + /* send tile to trsm */ + keylist.push_back(Key2(m, K)); + } + + /* wait for the kernel to complete */ + co_await ttg::wait_kernel(devInfo); + + if( hostInfo == 0 ) { + co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), keylist), std::move(A), out)); + // Anything after this co_await is never executed + // co_return would look better, but co_return would destroy keylist before the runtime can handle it + } else { + // Well... Here we should interrupt the DAG of tasks, there is an error. Raise? 
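          // LAPACK/cuSOLVER convention: hostInfo > 0 means the leading minor of that
          // order is not positive definite; hostInfo < 0 flags an invalid argument.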
+ std::cerr << "Factorization is SUSPICIOUS (the matrix might not be diagonal dominant)" << std::endl; + } + } + return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", + {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); +#else /* defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) */ auto f = [=](const Key1& key, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) { const int K = key[0]; @@ -49,6 +132,7 @@ namespace potrf { }; return ttg::make_tt(f, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); +#endif } template From 4a23f42ba4465c0ffe7697f667f1e76a67a10174 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Sun, 1 Oct 2023 19:27:27 +0000 Subject: [PATCH 03/17] Further steps towards device-enabled POTRF Signed-off-by: Joseph Schuchart --- examples/matrixtile.h | 64 ++++++------ examples/potrf/potrf.h | 216 ++++++++++++++++++++++++++++++++++++++--- 2 files changed, 230 insertions(+), 50 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index b3e5a3b8e..929b6a07e 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -6,34 +6,38 @@ #include -template +template > class MatrixTile : public ttg::TTValue> { public: using metadata_t = typename std::tuple; - using pointer_t = typename ttg::buffer; + using buffer_t = typename ttg::buffer; using ttvalue_type = ttg::TTValue>; private: - pointer_t _data; + buffer_t b; int _rows = 0, _cols = 0, _lda = 0; // (Re)allocate the tile memory void realloc() { // std::cout << "Reallocating new tile" << std::endl; - _data = ttg::buffer(_data, _lda * _cols); + b.reset(_lda * _cols); } public: MatrixTile() {} - MatrixTile(int rows, int cols, int lda) : ttvalue_type(), _rows(rows), _cols(cols), _lda(lda) { realloc(); } + MatrixTile(int rows, int cols, int lda) + : ttvalue_type() + , b(lda*cols) + , _rows(rows) + , _cols(cols) + , _lda(lda) + { } MatrixTile(const metadata_t& metadata) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {} - MatrixTile(int rows, int cols, pointer_t data, int lda) : ttvalue_type(), _data(data), _rows(rows), _cols(cols), _lda(lda) {} - MatrixTile(const metadata_t& metadata, pointer_t data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {} @@ -41,7 +45,13 @@ class MatrixTile : public ttg::TTValue> { * Constructor with outside memory. The tile will *not* delete this memory * upon destruction. */ - MatrixTile(int rows, int cols, T* data, int lda) : ttvalue_type(), _data(data), _rows(rows), _cols(cols), _lda(lda) {} + MatrixTile(int rows, int cols, T* data, int lda) + : ttvalue_type() + , _data(data, lda*cols) + , _rows(rows) + , _cols(cols) + , _lda(lda) + { } MatrixTile(const metadata_t& metadata, T* data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {} @@ -54,8 +64,12 @@ class MatrixTile : public ttg::TTValue> { * and written concurrently. Hence shallow copies are enough, will all * receiving tasks sharing tile data. 
Re-enable this once the PaRSEC backend * can handle data sharing without excessive copying */ - MatrixTile(const MatrixTile& other) : ttvalue_type(), _rows(other._rows), _cols(other._cols), _lda(other._lda) { - this->realloc(); + MatrixTile(const MatrixTile& other) + : ttvalue_type() + , b(other._lda*other._cols) + , _rows(other._rows) + , _cols(other._cols) + , _lda(other._lda) { std::copy_n(other.data(), _lda * _cols, this->data()); } @@ -77,22 +91,9 @@ class MatrixTile : public ttg::TTValue> { metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; } // Accessing the raw data - T* data() { return _data.get(); } - - const T* data() const { return _data.get(); } - - /// @return shared_ptr to data - pointer_t data_shared() & { return _data; } + T* data() { return b.host_ptr(); } - /// @return shared_ptr to data - pointer_t data_shared() const& { return _data; } - - /// yields data and resets this object to a default-constucted state - pointer_t yield_data() && { - pointer_t result = _data; - *this = MatrixTile(); - return std::move(result); - } + const T* data() const { return b.host_ptr(); } size_t size() const { return _cols * _lda; } @@ -103,7 +104,8 @@ class MatrixTile : public ttg::TTValue> { int lda() const { return _lda; } auto& fill(T value) { - std::fill(_data.get(), _data.get() + size(), value); + std::fill(data().get(), data().get() + size(), value); + b.set_current_device(0); return *this; } @@ -126,16 +128,6 @@ class MatrixTile : public ttg::TTValue> { } }; -#if defined(TTG_HAVE_CUDA) -using blk_t = DeviceTensor>, - btas::Handle::shared_ptr>>; -#else -// TODO: no hip pinned allocator in TA? -using blk_t = DeviceTensor, btas::Handle::shared_ptr>>; -#endif - namespace ttg { template diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index e536c655a..e6966bb2b 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -6,6 +6,19 @@ #undef DEBUG_TILES_VALUES +#if defined(TTG_HAS_CUDART) +#define ES ttg::ExecutionSpace::CUDA +#define TASKRET -> ttg::device_task +#elif defined(TTG_HAS_HIP) +#define ES ttg::ExecutionSpace::HIP +#define TASKRET -> ttg::device_task +#else +#define ES ttg::ExecutionSpace::Host +#define TASKRET -> void +#endif + + + namespace potrf { /* FLOP macros taken from DPLASMA */ @@ -55,8 +68,19 @@ namespace potrf { } auto f_dev = [=](const Key1& key, MatrixTile&& A, - std::tuple>, ttg::Out>>& out) -> ttg::device_task { - const auto K = key[0]; + std::tuple>, ttg::Out>>& out) TASKRET { + const auto K = key[0]; + + /* compute successors before submitting the kernel running + * TODO: this is parsec specific since this code is still executing on the worker threads + */ + std::vector keylist; + keylist.reserve(A.rows() - K); + /* TODO: reverse order of arrays */ + for (int m = K + 1; m < A.rows(); ++m) { + /* send tile to trsm */ + keylist.push_back(Key2(m, K)); + } /* pull the matrix onto the device, as computing the workspace size might in theory depend on the data */ //TODO: extend MatrixTile to be heterogeneous-aware. 
Look at spmm-cuda.cc 50-253 @@ -79,28 +103,20 @@ namespace potrf { /* everything is on the device, call the POTRF */ device_potrf(A, devWS, Lwork, devInfo); - /* compute successors while the kernel is running */ - std::vector keylist; - keylist.reserve(A.rows() - K); - /* TODO: reverse order of arrays */ - for (int m = K + 1; m < A.rows(); ++m) { - /* send tile to trsm */ - keylist.push_back(Key2(m, K)); - } - /* wait for the kernel to complete */ co_await ttg::wait_kernel(devInfo); if( hostInfo == 0 ) { - co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), keylist), std::move(A), out)); + co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), std::move(keylist)), std::move(A), out)); // Anything after this co_await is never executed // co_return would look better, but co_return would destroy keylist before the runtime can handle it } else { // Well... Here we should interrupt the DAG of tasks, there is an error. Raise? - std::cerr << "Factorization is SUSPICIOUS (the matrix might not be diagonal dominant)" << std::endl; + std::cerr << "Factorization is SUSPICIOUS (the matrix might not be diagonally dominant)" << std::endl; + ttg::abort(); } } - return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", + return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); #else /* defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) */ auto f = [=](const Key1& key, MatrixTile&& tile_kk, @@ -145,6 +161,69 @@ namespace potrf { ttg::Edge>& output_col, // to GEMM ttg::Edge>& output_result) { using T = typename MatrixT::element_type; +#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + auto f = [=](const Key2& key, const MatrixTile& tile_kk, MatrixTile&& tile_mk, + std::tuple>, ttg::Out>, ttg::Out>, + ttg::Out>>& out) TASKRET { + const int M = key[0]; + const int K = key[1]; // the column equals the outer most look K (same as PO) + + auto mb = tile_mk.rows(); + auto nb = tile_mk.cols(); + + /* in trsm, tile_mk is mb x nb, and tile_kk needs to be lda x nb because side = Right */ + assert(nb == tile_kk.rows()); + + if (ttg::tracing()) ttg::print("TRSM(", key, ")"); + + /* populate successor keys while we're on the worker thread */ + std::vector keylist_row; + keylist_row.reserve(M - K); + std::vector keylist_col; + keylist_col.reserve(A.rows() - M - 1); + + /* send tile to syrk on diagonal */ + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to syrk(", Key2{K, M}, ")"); + + /* send the tile to all gemms across in row i */ + for (int n = K + 1; n < M; ++n) { + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to gemm( ", Key3{M, n, K}, ")"); + keylist_row.push_back(Key3(M, n, K)); + } + + /* send the tile to all gemms down in column i */ + for (int m = M + 1; m < A.rows(); ++m) { + if (ttg::tracing()) ttg::print("TRSM(", key, "): sending output to gemm( ", Key3{m, M, K}, ")"); + keylist_col.push_back(Key3(m, M, K)); + } + + + co_await ttg::to_device(tile_kk.b, tile_mk.b); + int device = tile_kk.b.get_current_device(); + double alpha = 1.0; +#if defined(TTG_HAVE_CUDA) + cublasDtrsm(ttg::detail::cublas_get_handle(), + CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, + CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, + mb, nb, &alpha, + tile_kk.b.device_ptr_on(device), tile_kk.lda(), + tile_mk.b.device_ptr_on(device), tile_mk.lda()); +#elif 
defined(TTG_HAVE_HIPBLAS) + hipblasDtrsm(ttg::detail:hipblas_get_handle(), + HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, + HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, + mb, nb, &alpha, + tile_kk.b.device_ptr_on(device), tile_kk.lda(), + tile_mk.b.device_ptr_on(device), tile_mk.lda()); +#endif + + co_await ttg::device::forward(ttg::device::broadcast<0, 1, 2, 3>(std::make_tuple(key, Key2(K, M), keylist_row, keylist_col), + std::move(tile_mk), out)); + }; + return ttg::make_tt(f, ttg::edges(input_kk, ttg::fuse(input_mk, input_disp)), + ttg::edges(output_result, output_diag, output_row, output_col), "TRSM", + {"tile_kk", "tile_mk/dispatcher"}, {"output_result", "tile_mk", "output_row", "output_col"}); +#else auto f = [=](const Key2& key, const MatrixTile& tile_kk, MatrixTile&& tile_mk, std::tuple>, ttg::Out>, ttg::Out>, ttg::Out>>& out) { @@ -195,6 +274,7 @@ namespace potrf { return ttg::make_tt(f, ttg::edges(input_kk, ttg::fuse(input_mk, input_disp)), ttg::edges(output_result, output_diag, output_row, output_col), "TRSM", {"tile_kk", "tile_mk/dispatcher"}, {"output_result", "tile_mk", "output_row", "output_col"}); +#endif } template @@ -205,6 +285,54 @@ namespace potrf { ttg::Edge>& output_potrf, // to POTRF ttg::Edge>& output_syrk) { using T = typename MatrixT::element_type; +#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + auto f = [=](const Key2& key, const MatrixTile& tile_mk, MatrixTile&& tile_kk, + std::tuple>, ttg::Out>>& out) TASKRET { + const int K = key[0]; + const int M = key[1]; + + /* tile_kk is mb x mb and tile_mk is mb x nb */ + assert(tile_kk.rows() == tile_kk.cols()); + assert(tile_mk.rows() == tile_kk.rows()); + + auto mb = tile_mk.rows(); + auto nb = tile_mk.cols(); + + if (ttg::tracing()) ttg::print("SYRK(", key, ")"); + + co_await ttg::to_device(tile_kk.b, tile_mk.b); + + double alpha = -1.0; + double beta = 1.0; +#if defined(TTG_HAVE_CUDA) + cublasDsyrk(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, + CUBLAS_OP_N, + mb, nb, &alpha, + tile_nk.b.device_ptr_on(device), tile_mk.lda(), &beta, + tile_kk.b.device_ptr_on(device), tile_kk.lda()); +#elif defined(TTG_HAVE_HIPBLAS) + hipblasDsyrk(ttg::detail:hipblas_get_handle(), + HIPBLAS_FILL_MODE_LOWER, + HIPBLAS_OP_N, + mb, nb, &alpha, + tile_kk.b.device_ptr_on(device), tile_kk.lda(), &beta, + tile_mk.b.device_ptr_on(device), tile_mk.lda()); +#endif + + if (M == K + 1) { + /* send the tile to potrf */ + if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to POTRF(", Key1{K + 1}, ")"); + co_await ttg::device::send<0>(Key1(K + 1), std::move(tile_kk), out); + } else { + /* send output to next syrk */ + if (ttg::tracing()) ttg::print("SYRK(", key, "): sending output to SYRK(", Key2{K + 1, M}, ")"); + co_await ttg::device::send<1>(Key2(K + 1, M), std::move(tile_kk), out); + } + }; + return ttg::make_tt(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk), + "SYRK", {"tile_mk", "tile_kk/dispatcher"}, {"output_potrf", "output_syrk"}); +#else auto f = [=](const Key2& key, const MatrixTile& tile_mk, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) { const int K = key[0]; @@ -242,6 +370,7 @@ namespace potrf { }; return ttg::make_tt(f, ttg::edges(input_mk, ttg::fuse(input_kk, input_disp)), ttg::edges(output_potrf, output_syrk), "SYRK", {"tile_mk", "tile_kk/dispatcher"}, {"output_potrf", "output_syrk"}); +#endif } template @@ -253,6 +382,64 @@ namespace potrf { ttg::Edge>& output_trsm, // to TRSM ttg::Edge>& output_gemm) { using T = typename MatrixT::element_type; +#if 
defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + auto f = [=](const Key3& key, const MatrixTile& tile_mk, const MatrixTile& tile_nk, MatrixTile&& tile_mn, + std::tuple>, ttg::Out>>& out) TASKRET { + const int M = key[0]; + const int N = key[1]; + const int K = key[2]; + assert(M != N && M > K && N > K); + + assert(tile_mk.cols() == tile_nk.cols()); + assert(tile_mk.rows() == tile_mn.rows()); + assert(tile_nk.rows() == tile_mn.cols()); + + if (ttg::tracing()) ttg::print("GEMM(", key, ")"); +#if defined(DEBUG_TILES_VALUES) + std::cout << "Before GEMM(" << key << "), A(" << M << ", " << K << ") is " << tile_mk << " and A(" << K << ", " + << N << ") is " << tile_nk << " and A(" << M << ", " << N << ") is " << tile_mn; +#endif + + co_await ttg::to_device(tile_mk.b, tile_nk.b, tile_mn.b); + + double alpha = -1.0; + double beta = 1.0; +#if defined(TTG_HAVE_CUDA) + cublasDgemm(ttg::detail:cublas_get_handle(), + CUBLAS_OP_N, CUBLAS_OP_T, + tile_mk.rows(), tile_nk.rows(), + tile_nk.cols(), &alpha, + tile_mk.data(), tile_mk.lda(), + tile_nk.data(), tile_nk.lda(), &beta, + tile_mn.data(), tile_mn.lda()); +#elif defined(TTG_HAVE_HIPBLAS) + hipblasDgemm(ttg::detail:hipblas_get_handle(), + HIPBLAS_OP_N, HIPBLAS_OP_T, + tile_mk.rows(), tile_nk.rows(), + tile_nk.cols(), &alpha, + tile_mk.data(), tile_mk.lda(), + tile_nk.data(), tile_nk.lda(), &beta, + tile_mn.data(), tile_mn.lda()); +#endif + +#if defined(DEBUG_TILES_VALUES) + std::cout << "After GEMM(" << key << "), A(" << M << ", " << N << ") is " << tile_mn << std::endl; +#endif + + if (N == K + 1) { + /* send the tile to trsm */ + if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to TRSM(", Key2{M, N}, ")"); + co_await ttg::device::send<0>(Key2(M, N), std::move(tile_mn), out); + } else { + /* send the tile to the next gemm */ + if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to GEMM(", Key3{M, N, K + 1}, ")"); + co_await ttg::device::send<1>(Key3(M, N, K + 1), std::move(tile_mn), out); + } + }; + return ttg::make_tt(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)), + ttg::edges(output_trsm, output_gemm), "GEMM", {"input_mk", "input_kn", "input_mn/dispatcher"}, + {"output_trsm", "outout_gemm"}); +#else auto f = [=](const Key3& key, const MatrixTile& tile_mk, const MatrixTile& tile_nk, MatrixTile&& tile_mn, std::tuple>, ttg::Out>>& out) { const int M = key[0]; @@ -291,6 +478,7 @@ namespace potrf { return ttg::make_tt(f, ttg::edges(input_mk, input_nk, ttg::fuse(input_disp, input_mn)), ttg::edges(output_trsm, output_gemm), "GEMM", {"input_mk", "input_kn", "input_mn/dispatcher"}, {"output_trsm", "outout_gemm"}); +#endif } template From 57faac424de2f510d828196d131ec03a82a5f04d Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 2 Oct 2023 02:16:45 +0000 Subject: [PATCH 04/17] Add allocator to ttg::buffer Signed-off-by: Joseph Schuchart --- examples/matrixtile.h | 25 +++++++------- ttg/ttg/buffer.h | 8 ++--- ttg/ttg/parsec/buffer.h | 72 ++++++++++++++++++++++------------------- ttg/ttg/parsec/fwd.h | 2 +- 4 files changed, 55 insertions(+), 52 deletions(-) diff --git a/examples/matrixtile.h b/examples/matrixtile.h index 929b6a07e..693ae3d1e 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -6,13 +6,15 @@ #include -template > -class MatrixTile : public ttg::TTValue> { +#include + +template > +class MatrixTile : public ttg::TTValue> { public: using metadata_t = typename std::tuple; - using buffer_t = typename ttg::buffer; - using ttvalue_type = ttg::TTValue>; + using buffer_t = typename 
ttg::buffer; + using ttvalue_type = ttg::TTValue>; private: buffer_t b; @@ -38,7 +40,7 @@ class MatrixTile : public ttg::TTValue> { MatrixTile(const metadata_t& metadata) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::get<2>(metadata)) {} - MatrixTile(const metadata_t& metadata, pointer_t data) + MatrixTile(const metadata_t& metadata, T* data) : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), std::forward(data), std::get<2>(metadata)) {} /** @@ -47,24 +49,21 @@ class MatrixTile : public ttg::TTValue> { */ MatrixTile(int rows, int cols, T* data, int lda) : ttvalue_type() - , _data(data, lda*cols) + , b(data, lda*cols) , _rows(rows) , _cols(cols) , _lda(lda) { } - MatrixTile(const metadata_t& metadata, T* data) - : MatrixTile(std::get<0>(metadata), std::get<1>(metadata), data, std::get<2>(metadata)) {} - - MatrixTile(MatrixTile&& other) = default; + MatrixTile(MatrixTile&& other) = default; - MatrixTile& operator=(MatrixTile&& other) = default; + MatrixTile& operator=(MatrixTile&& other) = default; /* Deep copy ctor und op are not needed for PO since tiles will never be read * and written concurrently. Hence shallow copies are enough, will all * receiving tasks sharing tile data. Re-enable this once the PaRSEC backend * can handle data sharing without excessive copying */ - MatrixTile(const MatrixTile& other) + MatrixTile(const MatrixTile& other) : ttvalue_type() , b(other._lda*other._cols) , _rows(other._rows) @@ -73,7 +72,7 @@ class MatrixTile : public ttg::TTValue> { std::copy_n(other.data(), _lda * _cols, this->data()); } - MatrixTile& operator=(const MatrixTile& other) { + MatrixTile& operator=(const MatrixTile& other) { this->_rows = other._rows; this->_cols = other._cols; this->_lda = other._lda; diff --git a/ttg/ttg/buffer.h b/ttg/ttg/buffer.h index 1868b7e0c..6d252cb09 100644 --- a/ttg/ttg/buffer.h +++ b/ttg/ttg/buffer.h @@ -7,16 +7,16 @@ namespace ttg { -template -using buffer = TTG_IMPL_NS::buffer; +template> +using buffer = TTG_IMPL_NS::buffer; namespace detail { template struct is_buffer : std::false_type { }; - template - struct is_buffer> : std::true_type + template + struct is_buffer> : std::true_type { }; template diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 0e9c58cf9..00c7c996e 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -34,33 +34,43 @@ namespace detail { * in order for TTG to properly facilitate ownership * tracking of the containing object. 
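 *
 * Typical use from within a device task (sketch, relying on the accessors added
 * elsewhere in this series):
 *   ttg::buffer<double> buf(1024);      // host-side allocation, registered with PaRSEC
 *   co_await ttg::to_device(buf);       // ensure a device copy is available
 *   double *dev = buf.device_ptr_on(ttg::device::current_device());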
*/
-template <typename T>
-struct buffer : public detail::ttg_parsec_data_wrapper_t {
+template <typename T, typename Allocator>
+struct buffer : public detail::ttg_parsec_data_wrapper_t
+              , private Allocator {

   using element_type = std::decay_t<T>;

+  using allocator_traits = std::allocator_traits<Allocator>;
+  using allocator_type = typename allocator_traits::allocator_type;
+
   static_assert(std::is_trivially_copyable_v<element_type>,
                 "Only trivially copyable types are supported for devices.");
   static_assert(std::is_default_constructible_v<element_type>,
                 "Only default constructible types are supported for devices.");

 private:
-  using delete_fn_t = std::add_pointer_t<void(element_type*)>;
+  using delete_fn_t = std::function<void(element_type*)>;

   using host_data_ptr = std::unique_ptr<element_type[], delete_fn_t>;
   host_data_ptr m_host_data;
   std::size_t m_count = 0;

-  static void delete_owned(element_type *ptr) {
-    delete[] ptr;
-  }
-
   static void delete_non_owned(element_type *ptr) {
     // nothing to be done, we don't own the memory
   }

   friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::buffer<T, Allocator>&);

+  allocator_type& get_allocator_reference() { return static_cast<allocator_type&>(*this); }
+
+  element_type* allocate(std::size_t n) {
+    return allocator_traits::allocate(get_allocator_reference(), n);
+  }
+
+  void deallocate(element_type *ptr, std::size_t n) {
+    allocator_traits::deallocate(get_allocator_reference(), ptr, n);
+  }
+
 public:

   /* The device ID of the CPU. */

   buffer() : buffer(nullptr, 0)
   { }

-  buffer(std::size_t count)
+  buffer(std::size_t n)
   : ttg_parsec_data_wrapper_t()
-  , m_host_data(new element_type[count](), &delete_owned)
-  , m_count(count)
+  , allocator_type()
+  , m_host_data(allocate(n), [this, n](element_type* ptr){ deallocate(ptr, n); })
+  , m_count(n)
   {
     //std::cout << "buffer " << this << " ctor count "
     //          << count << "(" << m_host_data.get() << ") ttg_copy "
     //          << m_ttg_copy
     //          << " parsec_data " << m_data.get() << std::endl;
-    this->reset_parsec_data(m_host_data.get(), count*sizeof(element_type));
+    this->reset_parsec_data(m_host_data.get(), n*sizeof(element_type));
   }

   /* Constructing a buffer using application-managed memory.
    * The memory pointed to by ptr must be accessible during
    * the life-time of the buffer.
*/ - buffer(element_type* ptr, std::size_t count = 1) + buffer(element_type* ptr, std::size_t n = 1) : ttg_parsec_data_wrapper_t() + , allocator_type() , m_host_data(ptr, &delete_non_owned) - , m_count(count) + , m_count(n) { - this->reset_parsec_data(m_host_data.get(), count*sizeof(element_type)); - //std::cout << "buffer " << this << " ctor ptr " << ptr << " count " << count - // << " ttg_copy " << m_ttg_copy - // << " parsec_data " << m_data.get() << std::endl; + this->reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); } virtual ~buffer() { @@ -102,18 +111,11 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { /* allow moving device buffers */ buffer(buffer&& db) : ttg_parsec_data_wrapper_t(std::move(db)) + , allocator_type(std::move(db)) , m_host_data(std::move(db.m_host_data)) , m_count(db.m_count) { - //std::cout << "buffer " << this << " other " << &db << " mv ctor ttg_copy " << m_ttg_copy << std::endl; db.m_count = 0; - - //std::cout << "buffer::move-ctor from " << &db << " ttg-copy " << db.m_ttg_copy - // << " to " << this << " ttg-copy " << m_ttg_copy - // << " parsec-data " << m_data.get() - // << std::endl; - -//#endif // 0 } /* explicitly disable copying of buffers @@ -124,6 +126,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { /* allow moving device buffers */ buffer& operator=(buffer&& db) { ttg_parsec_data_wrapper_t::operator=(std::move(db)); + allocator_type::operator=(std::move(db)); m_host_data = std::move(db.m_host_data); m_count = db.m_count; db.m_count = 0; @@ -202,6 +205,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { void allocate_on(int device_id) { /* TODO: need exposed PaRSEC memory allocator */ + throw std::runtime_error("not implemented yet"); } /* TODO: can we do this automatically? @@ -246,26 +250,26 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { } /* Reallocate the buffer with count elements */ - void reset(std::size_t count) { + void reset(std::size_t n) { /* TODO: can we resize if count is smaller than m_count? */ /* drop the current data and reallocate */ reset(); - if (count == 0) { + if (n == 0) { m_host_data = host_data_ptr(nullptr, &delete_non_owned); } else { - m_host_data = host_data_ptr(new element_type[count], &delete_owned); + m_host_data = host_data_ptr(allocate(n), [this, n](element_type* ptr){ deallocate(ptr, n); }); } - reset_parsec_data(m_host_data.get(), count*sizeof(element_type)); + reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); //std::cout << "buffer::reset(" << count << ") ptr " << m_host_data.get() // << " ttg_copy " << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; - m_count = count; + m_count = n; } /* Reset the buffer to use the ptr to count elements */ - void reset(T* ptr, std::size_t count = 1) { + void reset(T* ptr, std::size_t n = 1) { /* TODO: can we resize if count is smaller than m_count? 
*/ - if (count == m_count) { + if (n == m_count) { return; } @@ -274,9 +278,9 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t { m_count = 0; } else { m_host_data = host_data_ptr(ptr, &delete_non_owned); - m_count = count; + m_count = n; } - reset_parsec_data(m_host_data.get(), count*sizeof(element_type)); + reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); //std::cout << "buffer::reset(" << ptr << ", " << count << ") ptr " << m_host_data.get() // << " ttg_copy " << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; diff --git a/ttg/ttg/parsec/fwd.h b/ttg/ttg/parsec/fwd.h index f184548d4..098f51220 100644 --- a/ttg/ttg/parsec/fwd.h +++ b/ttg/ttg/parsec/fwd.h @@ -16,7 +16,7 @@ namespace ttg_parsec { template struct ptr; - template + template> struct buffer; template struct devicescratch; From 194e93da4ab653476c133603a7602a2798cf6aef Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 2 Oct 2023 02:19:35 +0000 Subject: [PATCH 05/17] Add first full draft of device-enabled POTRF Signed-off-by: Joseph Schuchart --- examples/CMakeLists.txt | 10 +++++++++- examples/potrf/potrf.h | 6 ++---- examples/potrf/testing_dpotrf.cc | 8 +++++++- ttg/ttg.h | 1 + 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 24b6f5897..b599d97f6 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -14,13 +14,21 @@ if (TARGET tiledarray) if (TARGET CUDA::cublas) add_ttg_executable(bspmm-cuda spmm/spmm_cuda.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost CUDA::cublas - COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_HAVE_CUDA=1 + COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 + RUNTIMES "parsec") + + add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc + LINK_LIBRARIES lapackpp tiledarray CUDA::cublas + COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") elseif (TARGET roc::hipblas) add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost roc::hipblas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") + add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc + LINK_LIBRARIES lapackpp tiledarray roc::hipblas + RUNTIMES "parsec") endif() add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec") diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index e6966bb2b..039a0065b 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "lapack.hh" #include "pmw.h" @@ -34,6 +35,7 @@ namespace potrf { ttg::Edge>& output_result) { using T = typename MatrixT::element_type; #if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) + std::cout << "Creating CUDA POTRF task " << std::endl; static int device_potrf_workspace_size(blk_t &A) { int Lwork; #if defined(TTG_HAVE_CUDA) @@ -422,10 +424,6 @@ namespace potrf { tile_mn.data(), tile_mn.lda()); #endif -#if defined(DEBUG_TILES_VALUES) - std::cout << "After GEMM(" << key << "), A(" << M << ", " << N << ") is " << tile_mn << std::endl; -#endif - if (N == K + 1) { /* send the tile to trsm */ if (ttg::tracing()) ttg::print("GEMM(", key, "): sending output to TRSM(", Key2{M, N}, ")"); diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index d5686042f..76055b4d4 100644 --- a/examples/potrf/testing_dpotrf.cc +++ 
b/examples/potrf/testing_dpotrf.cc @@ -56,7 +56,12 @@ int main(int argc, char **argv) bool check = !cmdOptionExists(argv+1, argv+argc, "-x"); bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w"); - ttg::initialize(argc, argv, nthreads); + // TODO: need to filter out our arguments to make parsec happy + ttg::initialize(1, argv, nthreads); + + // initialize MADNESS so that TA allocators can be created + madness::ParsecRuntime::initialize_with_existing_context(ttg::default_execution_context().impl().context()); + madness::initialize(argc, argv, /* nthread = */ 1, /* quiet = */ true); auto world = ttg::default_execution_context(); if(nullptr != prof_filename) { @@ -207,6 +212,7 @@ int main(int argc, char **argv) world.profile_off(); + madness::finalize(); ttg::finalize(); return ret; } diff --git a/ttg/ttg.h b/ttg/ttg.h index 09ccfaf90..c84af7854 100644 --- a/ttg/ttg.h +++ b/ttg/ttg.h @@ -1,6 +1,7 @@ #ifndef TTG_H_INCLUDED #define TTG_H_INCLUDED +#include "ttg/config.h" #include "ttg/fwd.h" #include "ttg/runtimes.h" From e41a4b1354c92cd63ff544cdbdd0c2e6036ed742 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 2 Oct 2023 02:44:45 +0000 Subject: [PATCH 06/17] POTRF: Use the right precompiler guard for CUDA and HIP Signed-off-by: Joseph Schuchart --- examples/potrf/potrf.h | 82 +++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 039a0065b..e6230371a 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -7,10 +7,10 @@ #undef DEBUG_TILES_VALUES -#if defined(TTG_HAS_CUDART) +#if defined(TTG_HAVE_CUDART) #define ES ttg::ExecutionSpace::CUDA #define TASKRET -> ttg::device_task -#elif defined(TTG_HAS_HIP) +#elif defined(TTG_HAVE_HIP) #define ES ttg::ExecutionSpace::HIP #define TASKRET -> ttg::device_task #else @@ -27,6 +27,41 @@ namespace potrf { inline double FADDS_POTRF(double __n) { return (__n * (((1. / 6.) * __n) * __n - (1. 
/ 6.))); } inline double FLOPS_DPOTRF(double __n) { return FMULS_POTRF(__n) + FADDS_POTRF(__n); } +#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) + static int device_potrf_workspace_size(MatrixTile &A) { + int Lwork; + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf_bufferSize(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, A.extent(1), + nullptr, A.extent(0), + &Lwork); + return Lwork; + #elif defined(TTG_HAVE_HIPBLAS) + #error TBCoded + #else + return 0; + #endif + } + + static void device_potrf(MatrixTile &A, double *workspace, int Lwork, int *devInfo) { + int device = A.b.get_current_device(); + assert(device != 0); + #if defined(TTG_HAVE_CUDA) + cusolverDnDpotrf(ttg::detail::cublas_get_handle(), + CUBLAS_FILL_MODE_LOWER, A.extent(1), + A.b.device_ptr_on(device), A.extent(0), + workspace, Lwork, + devInfo); + #elif defined(TTG_HAVE_HIPBLAS) + hipsolverDpotrf(ttg::detail::hipblas_get_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.extent(1), + A.b.device_ptr_on(device), A.extent(0), + workspace, Lwork, + devInfo); + #endif + } +#endif // defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) + template auto make_potrf(MatrixT& A, ttg::Edge>& input_disp, // from the dispatcher @@ -34,41 +69,8 @@ namespace potrf { ttg::Edge>& output_trsm, ttg::Edge>& output_result) { using T = typename MatrixT::element_type; -#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) +#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) std::cout << "Creating CUDA POTRF task " << std::endl; - static int device_potrf_workspace_size(blk_t &A) { - int Lwork; - #if defined(TTG_HAVE_CUDA) - cusolverDnDpotrf_bufferSize(ttg::detail::cublas_get_handle(), - CUBLAS_FILL_MODE_LOWER, A.extent(1), - nullptr, A.extent(0), - &Lwork); - return Lwork; - #elif defined(TTG_HAVE_HIPBLAS) - #error TBCoded - #else - return 0; - #endif - } - - static void device_potrf(blk_t &A, double *workspace, int Lwork, int *devInfo) { - int device = A.b.get_current_device(); - assert(device != 0); - #if defined(TTG_HAVE_CUDA) - cusolverDnDpotrf(ttg::detail::cublas_get_handle(), - CUBLAS_FILL_MODE_LOWER, A.extent(1), - A.b.device_ptr_on(device), A.extent(0), - workspace, Lwork, - devInfo); - #elif defined(TTG_HAVE_HIPBLAS) - hipsolverDpotrf(ttg::detail::hipblas_get_handle(), - HIPSOLVER_FILL_MODE_LOWER, A.extent(1), - A.b.device_ptr_on(device), A.extent(0), - workspace, Lwork, - devInfo); - #endif - } - auto f_dev = [=](const Key1& key, MatrixTile&& A, std::tuple>, ttg::Out>>& out) TASKRET { const auto K = key[0]; @@ -120,7 +122,7 @@ namespace potrf { } return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); -#else /* defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) */ +#else /* defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) */ auto f = [=](const Key1& key, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) { const int K = key[0]; @@ -163,7 +165,7 @@ namespace potrf { ttg::Edge>& output_col, // to GEMM ttg::Edge>& output_result) { using T = typename MatrixT::element_type; -#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) +#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) auto f = [=](const Key2& key, const MatrixTile& tile_kk, MatrixTile&& tile_mk, std::tuple>, ttg::Out>, ttg::Out>, ttg::Out>>& out) TASKRET { @@ -287,7 +289,7 @@ namespace potrf { ttg::Edge>& output_potrf, // to POTRF ttg::Edge>& output_syrk) { using T = typename MatrixT::element_type; -#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) +#if 
defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) auto f = [=](const Key2& key, const MatrixTile& tile_mk, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) TASKRET { const int K = key[0]; @@ -384,7 +386,7 @@ namespace potrf { ttg::Edge>& output_trsm, // to TRSM ttg::Edge>& output_gemm) { using T = typename MatrixT::element_type; -#if defined(TTG_HAS_CUDART) || defined(TTG_HAS_HIP) +#if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) auto f = [=](const Key3& key, const MatrixTile& tile_mk, const MatrixTile& tile_nk, MatrixTile&& tile_mn, std::tuple>, ttg::Out>>& out) TASKRET { const int M = key[0]; From 18577f96d8a75472fb8e5e23753577cf53392449 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Mon, 2 Oct 2023 22:59:48 +0000 Subject: [PATCH 07/17] POTRF: make it run, still fails factorization Notable changes: - add ttg::device::current_device and ttg::device::current_stream to query current streams - Consistently start counting devices at 0 - The host has ID -2 - Fixes in the broadcast Signed-off-by: Joseph Schuchart --- examples/CMakeLists.txt | 18 +++-- examples/devblas_helper.h | 158 ++++++++++++++++++++++++++++++++++++++ examples/matrixtile.h | 25 ++++-- examples/potrf/potrf.h | 121 +++++++++++++++++------------ ttg/CMakeLists.txt | 1 + ttg/ttg/device/device.h | 61 +++++++++++++++ ttg/ttg/device/task.h | 38 +++++---- ttg/ttg/devicescratch.h | 2 +- ttg/ttg/parsec/buffer.h | 41 +++++----- ttg/ttg/parsec/ttg.h | 30 +++++++- 10 files changed, 388 insertions(+), 107 deletions(-) create mode 100644 examples/devblas_helper.h create mode 100644 ttg/ttg/device/device.h diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b599d97f6..4d8896444 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -17,18 +17,22 @@ if (TARGET tiledarray) COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") - add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc - LINK_LIBRARIES lapackpp tiledarray CUDA::cublas - COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 - RUNTIMES "parsec") + if (TARGET CUDA::cusolver) + add_ttg_executable(testing_dpotrf_cuda potrf/testing_dpotrf.cc + LINK_LIBRARIES lapackpp tiledarray CUDA::cublas CUDA::cusolver + COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2;TTG_ENABLE_CUDA=1 + RUNTIMES "parsec") + endif(TARGET CUDA::cusolver) elseif (TARGET roc::hipblas) add_ttg_executable(bspmm-hip spmm/spmm_cuda.cc LINK_LIBRARIES tiledarray TiledArray_Eigen BTAS Boost::boost roc::hipblas COMPILE_DEFINITIONS BLOCK_SPARSE_GEMM=1;BTAS_TARGET_MAX_INDEX_RANK=2 RUNTIMES "parsec") - add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc - LINK_LIBRARIES lapackpp tiledarray roc::hipblas - RUNTIMES "parsec") + if (TARGET roc::hipsolver) + add_ttg_executable(testing_dpotrf_hip potrf/testing_dpotrf.cc + LINK_LIBRARIES lapackpp tiledarray roc::hipblas roc::hipsolver + RUNTIMES "parsec") + endif(TARGET roc::hipsolver) endif() add_ttg_executable(chain-ttg-dev task-benchmarks/chain-ttg-dev.cc LINK_LIBRARIES tiledarray RUNTIMES "parsec") diff --git a/examples/devblas_helper.h b/examples/devblas_helper.h new file mode 100644 index 000000000..704a2bc85 --- /dev/null +++ b/examples/devblas_helper.h @@ -0,0 +1,158 @@ +#include "ttg/config.h" + +#include +#include +#include +#include + +#ifdef TTG_HAVE_CUDART + +#include +#include +#include + +namespace detail { + template + struct device_stream_t { + int device; + Stream stream; + device_stream_t(int device, Stream stream) + : device(device) + 
, stream(stream)
+      { }
+
+      bool operator<(const device_stream_t& ds) const {
+        // strict weak ordering: compare the device first, then the stream
+        bool result = (device < ds.device)
+                   || (device == ds.device
+                       && reinterpret_cast<uintptr_t>(stream) < reinterpret_cast<uintptr_t>(ds.stream));
+        std::cout << *this << " < " << ds << ": " << result << std::endl;
+        return result;
+      }
+
+      bool operator==(const device_stream_t& ds) const {
+        bool result = ((device == ds.device) && (stream == ds.stream));
+        std::cout << *this << " == " << ds << ": " << result << std::endl;
+        return result;
+      }
+  };
+} // namespace detail
+
+namespace std {
+  template <typename Stream>
+  std::ostream& operator<<(std::ostream& os, const ::detail::device_stream_t<Stream>& ds) {
+    os << "[" << ds.device << ", " << ds.stream << "]";
+    return os;
+  }
+
+} //namespace std
+
+/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread
+/// \return the cuBLAS handle for the current thread
+inline const cublasHandle_t& cublas_handle() {
+  using map_type = std::map<std::pair<int, cudaStream_t>, cublasHandle_t>;
+  static thread_local map_type handles;
+
+  int device = ttg::device::current_device();
+  cudaStream_t stream = ttg::device::current_stream();
+
+  map_type::iterator it;
+  if ((it = handles.find({device, stream})) == handles.end()){
+    cublasHandle_t handle;
+    auto status = cublasCreate_v2(&handle);
+    if (CUBLAS_STATUS_SUCCESS != status) {
+      std::cerr << "cublasCreate_v2 failed: " << status << std::endl;
+      throw std::runtime_error("cublasCreate_v2 failed");
+    }
+    status = cublasSetStream_v2(handle, ttg::device::current_stream());
+    if (CUBLAS_STATUS_SUCCESS != status) {
+      std::cerr << "cublasSetStream_v2 failed: " << status << std::endl;
+      throw std::runtime_error("cublasSetStream_v2 failed");
+    }
+    auto [iterator, success] = handles.insert({{device, stream}, handle});
+    it = iterator;
+  }
+
+  return it->second;
+}
+
+inline const cusolverDnHandle_t& cusolver_handle() {
+
+  //using map_type = std::map<detail::device_stream_t<cudaStream_t>, cusolverDnHandle_t>;
+  using map_type = std::map<std::pair<int, cudaStream_t>, cusolverDnHandle_t>;
+  static thread_local map_type handles;
+
+  int device = ttg::device::current_device();
+  cudaStream_t stream = ttg::device::current_stream();
+
+  map_type::iterator it;
+  if ((it = handles.find({device, stream})) == handles.end()){
+    cusolverDnHandle_t handle;
+    auto status = cusolverDnCreate(&handle);
+    if (CUSOLVER_STATUS_SUCCESS != status) {
+      std::cerr << "cusolverDnCreate failed: " << status << std::endl;
+      throw std::runtime_error("cusolverDnCreate failed");
+    }
+    status = cusolverDnSetStream(handle, stream);
+    if (CUSOLVER_STATUS_SUCCESS != status) {
+      std::cerr << "cusolverDnSetStream failed: " << status << std::endl;
+      throw std::runtime_error("cusolverDnSetStream failed");
+    }
+
+    std::cout << "Creating cusolver handle " << handle << " for device " << device << " stream " << stream << std::endl;
+    auto [iterator, success] = handles.insert({{device, stream}, handle});
+    it = iterator;
+  } else {
+    std::cout << "Found cusolver handle " << it->second << " for device " << device << " stream " << stream << std::endl;
+  }
+
+  return it->second;
+}
+#endif // TTG_HAVE_CUDART
+
+#ifdef TTG_HAVE_HIPBLAS
+
+#include
+#include
+#include
+
+/// \brief Returns the hipBLAS handle to be used for launching hipBLAS kernels from the current thread
+/// \return the hipBLAS handle for the current thread
+const hipblasHandle_t& hipblas_handle() {
+  static thread_local std::map<int, hipblasHandle_t> handles;
+  int device = ttg::device::current_device();
+  std::map<int, hipblasHandle_t>::iterator it;
+  if ((it = handles.find(device)) == handles.end()){
+    hipblasHandle_t handle;
+    auto status = hipblasCreate(&handle);
+    if
(HIPBLAS_STATUS_SUCCESS != status) { + throw std::runtime_error("hipblasCreate failed"); + } + auto [iterator, success] = handles.insert({device, handle}); + it = iterator; + } + hipblasStatus_t status = hipblasSetStream(it->second, ttg::device::current_stream()); + if (HIPBLAS_STATUS_SUCCESS != status) { + throw std::runtime_error("hipblasSetStream failed"); + } + return it->second; +} + +/// \brief Returns the hipsolver handle to be used for launching rocBLAS kernels from the current thread +/// \return the hipsolver handle for the current thread +const hipsolverDnHandle_t& hipsolver_handle() { + static thread_local std::map handles; + int device = ttg::device::current_device(); + std::map::iterator it; + if ((it = handles.find(device)) == handles.end()){ + hipsolverDnHandle_t handle; + auto status = hipsolverDnCreate(&handle); + if (HIPSOLVER_STATUS_SUCCESS != status) { + throw std::runtime_error("hipsolverCreate failed"); + } + auto [iterator, success] = handles.insert({device, handle}); + it = iterator; + } + hipsolverStatus_t status = hipsolverDnSetStream(it->second, ttg::device::current_stream()); + if (HIPSOLVER_STATUS_SUCCESS != status) { + throw std::runtime_error("hipsolverSetStream failed"); + } + return it->second; +} +#endif // TTG_HAVE_HIPBLAS diff --git a/examples/matrixtile.h b/examples/matrixtile.h index 693ae3d1e..ba2fcf9cd 100644 --- a/examples/matrixtile.h +++ b/examples/matrixtile.h @@ -17,13 +17,13 @@ class MatrixTile : public ttg::TTValue> { using ttvalue_type = ttg::TTValue>; private: - buffer_t b; + buffer_t _buffer; int _rows = 0, _cols = 0, _lda = 0; // (Re)allocate the tile memory void realloc() { // std::cout << "Reallocating new tile" << std::endl; - b.reset(_lda * _cols); + _buffer.reset(_lda * _cols); } public: @@ -31,7 +31,7 @@ class MatrixTile : public ttg::TTValue> { MatrixTile(int rows, int cols, int lda) : ttvalue_type() - , b(lda*cols) + , _buffer(lda*cols) , _rows(rows) , _cols(cols) , _lda(lda) @@ -49,7 +49,7 @@ class MatrixTile : public ttg::TTValue> { */ MatrixTile(int rows, int cols, T* data, int lda) : ttvalue_type() - , b(data, lda*cols) + , _buffer(data, lda*cols) , _rows(rows) , _cols(cols) , _lda(lda) @@ -65,7 +65,7 @@ class MatrixTile : public ttg::TTValue> { * can handle data sharing without excessive copying */ MatrixTile(const MatrixTile& other) : ttvalue_type() - , b(other._lda*other._cols) + , _buffer(other._lda*other._cols) , _rows(other._rows) , _cols(other._cols) , _lda(other._lda) { @@ -85,14 +85,15 @@ class MatrixTile : public ttg::TTValue> { _rows = std::get<0>(meta); _cols = std::get<1>(meta); _lda = std::get<2>(meta); + this->realloc(); } metadata_t get_metadata(void) const { return metadata_t{_rows, _cols, _lda}; } // Accessing the raw data - T* data() { return b.host_ptr(); } + T* data() { return _buffer.host_ptr(); } - const T* data() const { return b.host_ptr(); } + const T* data() const { return _buffer.host_ptr(); } size_t size() const { return _cols * _lda; } @@ -102,9 +103,17 @@ class MatrixTile : public ttg::TTValue> { int lda() const { return _lda; } + buffer_t& buffer() { + return _buffer; + } + + const buffer_t& buffer() const { + return _buffer; + } + auto& fill(T value) { std::fill(data().get(), data().get() + size(), value); - b.set_current_device(0); + _buffer.set_current_device(0); return *this; } diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index e6230371a..50a57a766 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -4,22 +4,24 @@ #include #include "lapack.hh" #include "pmw.h" 
+#include "../devblas_helper.h" #undef DEBUG_TILES_VALUES #if defined(TTG_HAVE_CUDART) #define ES ttg::ExecutionSpace::CUDA #define TASKRET -> ttg::device_task +#include #elif defined(TTG_HAVE_HIP) #define ES ttg::ExecutionSpace::HIP #define TASKRET -> ttg::device_task +#include +#include #else #define ES ttg::ExecutionSpace::Host #define TASKRET -> void #endif - - namespace potrf { /* FLOP macros taken from DPLASMA */ @@ -31,9 +33,9 @@ namespace potrf { static int device_potrf_workspace_size(MatrixTile &A) { int Lwork; #if defined(TTG_HAVE_CUDA) - cusolverDnDpotrf_bufferSize(ttg::detail::cublas_get_handle(), - CUBLAS_FILL_MODE_LOWER, A.extent(1), - nullptr, A.extent(0), + cusolverDnDpotrf_bufferSize(cusolver_handle(), + CUBLAS_FILL_MODE_LOWER, A.cols(), + nullptr, A.lda(), &Lwork); return Lwork; #elif defined(TTG_HAVE_HIPBLAS) @@ -44,18 +46,21 @@ namespace potrf { } static void device_potrf(MatrixTile &A, double *workspace, int Lwork, int *devInfo) { - int device = A.b.get_current_device(); - assert(device != 0); + int device = ttg::device::current_device(); + assert(device >= 0); #if defined(TTG_HAVE_CUDA) - cusolverDnDpotrf(ttg::detail::cublas_get_handle(), - CUBLAS_FILL_MODE_LOWER, A.extent(1), - A.b.device_ptr_on(device), A.extent(0), + //std::cout << "POTRF A " << A.buffer().device_ptr_on(device) << " device " << device << " cols " << A.cols() << " lda " << A.lda() << " Lwork " << Lwork << " WS " << workspace << " devInfo " << devInfo << std::endl; + auto handle = cusolver_handle(); + std::cout << "POTRF handle " << handle << " device " << device << " stream " << ttg::device::current_stream() << std::endl; + cusolverDnDpotrf(handle, + CUBLAS_FILL_MODE_LOWER, A.cols(), + A.buffer().device_ptr_on(device), A.lda(), workspace, Lwork, devInfo); #elif defined(TTG_HAVE_HIPBLAS) - hipsolverDpotrf(ttg::detail::hipblas_get_handle(), - HIPSOLVER_FILL_MODE_LOWER, A.extent(1), - A.b.device_ptr_on(device), A.extent(0), + hipsolverDpotrf(hipsolver_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.cols(), + A.buffer().device_ptr_on(device), A.lda(), workspace, Lwork, devInfo); #endif @@ -69,9 +74,11 @@ namespace potrf { ttg::Edge>& output_trsm, ttg::Edge>& output_result) { using T = typename MatrixT::element_type; + auto iallocator = std::make_shared>(); #if defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) - std::cout << "Creating CUDA POTRF task " << std::endl; - auto f_dev = [=](const Key1& key, MatrixTile&& A, + //std::cout << "Creating CUDA POTRF task " << std::endl; + auto f_dev = [=, iallocator = std::move(iallocator)] + (const Key1& key, MatrixTile&& tile_kk, std::tuple>, ttg::Out>>& out) TASKRET { const auto K = key[0]; @@ -81,8 +88,10 @@ namespace potrf { std::vector keylist; keylist.reserve(A.rows() - K); /* TODO: reverse order of arrays */ + //std::cout << "POTRF K " << K << " A.rows " << A.rows() << std::endl; for (int m = K + 1; m < A.rows(); ++m) { /* send tile to trsm */ + //std::cout << "POTRF successor " << Key2(m, K) << std::endl; keylist.push_back(Key2(m, K)); } @@ -92,34 +101,43 @@ namespace potrf { // Check pmw.h: when we generate the MatrixTile // Also check pinned allocator at the end of DeviceTensor (250-253) - int Lwork = device_potrf_workspace_size(A); + int Lwork = device_potrf_workspace_size(tile_kk); // Instead of using scratch here, we should have hostWS and hostInfo globals and use to_device // this would reduce the number of I/O operations to devices - double hostWS[Lwork]; - ttg::devicescratch devWS = ttg::make_scratch(hostWS, ttg::scope::Allocate); - int hostInfo = -1; - 
ttg::devicescratch devInfo = ttg::make_scratch(&hostInfo, ttg::scope::Allocate); + double *hostWS = new double[Lwork]; + ttg::devicescratch devWS = ttg::make_scratch(hostWS, ttg::scope::Allocate, Lwork); + int *hostInfo = iallocator->allocate(1); + ttg::devicescratch devInfo = ttg::make_scratch(hostInfo, ttg::scope::Allocate); + + *hostInfo = -32; /* the workspace and the devInfo must be device-level pointers */ - co_await ttg::to_device(A._data, devWS, devInfo); + co_await ttg::to_device(tile_kk.buffer(), devWS, devInfo); + + //std::cout << "devWS host ptr " << hostWS << " device ptr " << devWS.device_ptr() << " size " << devWS.size() + // << " devInfo host ptr " << hostInfo << " device ptr " << devInfo.device_ptr() << "size " << devInfo.size() << std::endl; /* everything is on the device, call the POTRF */ - device_potrf(A, devWS, Lwork, devInfo); + device_potrf(tile_kk, devWS.device_ptr(), Lwork, devInfo.device_ptr()); /* wait for the kernel to complete */ co_await ttg::wait_kernel(devInfo); - if( hostInfo == 0 ) { - co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), std::move(keylist)), std::move(A), out)); + delete[] hostWS; + int info = *hostInfo; + assert(info == 0); + iallocator->deallocate(hostInfo, 1); + if( info == 0 ) { + co_await ttg::device::forward(ttg::device::broadcast<0, 1>(std::make_tuple(Key2(K, K), std::move(keylist)), std::move(tile_kk), out)); // Anything after this co_await is never executed // co_return would look better, but co_return would destroy keylist before the runtime can handle it } else { // Well... Here we should interrupt the DAG of tasks, there is an error. Raise? - std::cerr << "Factorization is SUSPICIOUS (the matrix might not be diagonally dominant)" << std::endl; + std::cerr << "Factorization of tile " << K << " failed: " << info << std::endl; ttg::abort(); } - } + }; return ttg::make_tt(f_dev, ttg::edges(ttg::fuse(input, input_disp)), ttg::edges(output_result, output_trsm), "POTRF", {"tile_kk/dispatcher"}, {"output_result", "output_trsm"}); #else /* defined(TTG_HAVE_CUDART) || defined(TTG_HAVE_HIP) */ @@ -202,23 +220,23 @@ namespace potrf { } - co_await ttg::to_device(tile_kk.b, tile_mk.b); - int device = tile_kk.b.get_current_device(); + co_await ttg::to_device(tile_kk.buffer(), tile_mk.buffer()); + int device = ttg::device::current_device(); double alpha = 1.0; #if defined(TTG_HAVE_CUDA) - cublasDtrsm(ttg::detail::cublas_get_handle(), + cublasDtrsm(cublas_handle(), CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_T, CUBLAS_DIAG_NON_UNIT, mb, nb, &alpha, - tile_kk.b.device_ptr_on(device), tile_kk.lda(), - tile_mk.b.device_ptr_on(device), tile_mk.lda()); + tile_kk.buffer().device_ptr_on(device), tile_kk.lda(), + tile_mk.buffer().device_ptr_on(device), tile_mk.lda()); #elif defined(TTG_HAVE_HIPBLAS) - hipblasDtrsm(ttg::detail:hipblas_get_handle(), + hipblasDtrsm(hipblas_handle(), HIPBLAS_SIDE_RIGHT, HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_T, HIPBLAS_DIAG_NON_UNIT, mb, nb, &alpha, - tile_kk.b.device_ptr_on(device), tile_kk.lda(), - tile_mk.b.device_ptr_on(device), tile_mk.lda()); + tile_kk.buffer().device_ptr_on(device), tile_kk.lda(), + tile_mk.buffer().device_ptr_on(device), tile_mk.lda()); #endif co_await ttg::device::forward(ttg::device::broadcast<0, 1, 2, 3>(std::make_tuple(key, Key2(K, M), keylist_row, keylist_col), @@ -304,24 +322,26 @@ namespace potrf { if (ttg::tracing()) ttg::print("SYRK(", key, ")"); - co_await ttg::to_device(tile_kk.b, tile_mk.b); + co_await ttg::to_device(tile_kk.buffer(), 
tile_mk.buffer()); + + int device = ttg::device::current_device(); double alpha = -1.0; double beta = 1.0; #if defined(TTG_HAVE_CUDA) - cublasDsyrk(ttg::detail::cublas_get_handle(), + cublasDsyrk(cublas_handle(), CUBLAS_FILL_MODE_LOWER, CUBLAS_OP_N, mb, nb, &alpha, - tile_nk.b.device_ptr_on(device), tile_mk.lda(), &beta, - tile_kk.b.device_ptr_on(device), tile_kk.lda()); + tile_mk.buffer().device_ptr_on(device), tile_mk.lda(), &beta, + tile_kk.buffer().device_ptr_on(device), tile_kk.lda()); #elif defined(TTG_HAVE_HIPBLAS) - hipblasDsyrk(ttg::detail:hipblas_get_handle(), + hipblasDsyrk(hipblas_handle(), HIPBLAS_FILL_MODE_LOWER, HIPBLAS_OP_N, mb, nb, &alpha, - tile_kk.b.device_ptr_on(device), tile_kk.lda(), &beta, - tile_mk.b.device_ptr_on(device), tile_mk.lda()); + tile_mk.buffer().device_ptr_on(device), tile_mk.lda(), &beta, + tile_kk.buffer().device_ptr_on(device), tile_kk.lda()); #endif if (M == K + 1) { @@ -404,26 +424,27 @@ namespace potrf { << N << ") is " << tile_nk << " and A(" << M << ", " << N << ") is " << tile_mn; #endif - co_await ttg::to_device(tile_mk.b, tile_nk.b, tile_mn.b); + co_await ttg::to_device(tile_mk.buffer(), tile_nk.buffer(), tile_mn.buffer()); + int device = ttg::device::current_device(); double alpha = -1.0; double beta = 1.0; #if defined(TTG_HAVE_CUDA) - cublasDgemm(ttg::detail:cublas_get_handle(), + cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_T, tile_mk.rows(), tile_nk.rows(), tile_nk.cols(), &alpha, - tile_mk.data(), tile_mk.lda(), - tile_nk.data(), tile_nk.lda(), &beta, - tile_mn.data(), tile_mn.lda()); + tile_mk.buffer().device_ptr_on(device), tile_mk.lda(), + tile_nk.buffer().device_ptr_on(device), tile_nk.lda(), &beta, + tile_mn.buffer().device_ptr_on(device), tile_mn.lda()); #elif defined(TTG_HAVE_HIPBLAS) - hipblasDgemm(ttg::detail:hipblas_get_handle(), + hipblasDgemm(hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_T, tile_mk.rows(), tile_nk.rows(), tile_nk.cols(), &alpha, - tile_mk.data(), tile_mk.lda(), - tile_nk.data(), tile_nk.lda(), &beta, - tile_mn.data(), tile_mn.lda()); + tile_mk.buffer().device_ptr_on(device), tile_mk.lda(), + tile_nk.buffer().device_ptr_on(device), tile_nk.lda(), &beta, + tile_mn.buffer().device_ptr_on(device), tile_mn.lda()); #endif if (N == K + 1) { diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt index a60f4f833..476118524 100644 --- a/ttg/CMakeLists.txt +++ b/ttg/CMakeLists.txt @@ -69,6 +69,7 @@ set(ttg-impl-headers ) set(ttg_device_headers ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/cublas_helper.h + ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/device.h ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/task.h ) set(ttg-headers diff --git a/ttg/ttg/device/device.h b/ttg/ttg/device/device.h new file mode 100644 index 000000000..17f8364f1 --- /dev/null +++ b/ttg/ttg/device/device.h @@ -0,0 +1,61 @@ + +#include + +#if defined(TTG_HAVE_CUDA) +#include + +namespace ttg::device { + namespace detail { + inline thread_local int current_device_ts = 0; + inline thread_local cudaStream_t current_stream_ts = 0; // default stream + + void reset_current() { + current_device_ts = 0; + current_stream_ts = 0; + } + + void set_current(int device, cudaStream_t stream) { + current_device_ts = device; + current_stream_ts = stream; + } + } // namespace detail + + int current_device() { + return detail::current_device_ts; + } + + cudaStream_t current_stream() { + return detail::current_stream_ts; + } +} // namespace ttg + +#elif defined(TTG_HAVE_HIP) + +#include + +namespace ttg::device { + namespace detail { + inline thread_local int current_device_ts = 0; + 
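+    // mirrors the CUDA branch above: the PaRSEC backend calls
+    // detail::set_current() right before resuming a device task, so that
+    // current_device() and current_stream() report where the task is running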
inline thread_local hipStream_t current_stream_ts = 0; // default stream + + void reset_current() { + current_device_ts = 0; + current_stream_ts = 0; + } + + void set_current(int device, hipStream_t stream) { + current_device_ts = device; + current_stream_ts = stream; + } + } // namespace detail + + int current_device() { + return detail::current_device_ts; + } + + hipStream_t current_stream() { + return detail::current_stream_ts; + } +} // namespace ttg + +#endif // defined(TTG_HAVE_HIP) diff --git a/ttg/ttg/device/task.h b/ttg/ttg/device/task.h index b275da195..c51990d99 100644 --- a/ttg/ttg/device/task.h +++ b/ttg/ttg/device/task.h @@ -134,7 +134,8 @@ namespace ttg { } void unhandled_exception() { - + std::cerr << "Send coroutine caught an unhandled exception!" << std::endl; + throw; // fwd } }; @@ -284,9 +285,9 @@ namespace ttg { typename... out_keysT, typename... out_valuesT> inline void prepare_broadcast(const std::tuple &keylists, valueT &&value, std::tuple...> &t) { - std::get(t)->prepare_send(std::get(keylists), std::forward(value)); + std::get(t).prepare_send(std::get(keylists), std::forward(value)); if constexpr (sizeof...(Is) > 0) { - detail::prepare_broadcast(keylists, std::forward(value), t); + prepare_broadcast(keylists, std::forward(value), t); } } @@ -299,7 +300,7 @@ namespace ttg { auto *terminal_ptr = ttg::detail::get_out_terminal(I, "ttg::device::broadcast(keylists, value)"); terminal_ptr->prepare_send(std::get(keylists), value); if constexpr (sizeof...(Is) > 0) { - detail::prepare_broadcast(keylists, std::forward(value)); + prepare_broadcast(keylists, std::forward(value)); } } @@ -307,7 +308,7 @@ namespace ttg { typename... out_keysT, typename... out_valuesT> inline void broadcast(const std::tuple &keylists, valueT &&value, std::tuple...> &t) { - std::get(t)->broadcast(std::get(keylists), std::forward(value)); + std::get(t).broadcast(std::get(keylists), std::forward(value)); if constexpr (sizeof...(Is) > 0) { detail::broadcast(keylists, std::forward(value), t); } @@ -322,7 +323,7 @@ namespace ttg { auto *terminal_ptr = ttg::detail::get_out_terminal(I, "ttg::device::broadcast(keylists, value)"); terminal_ptr->broadcast(std::get(keylists), value); if constexpr (sizeof...(Is) > 0) { - detail::broadcast(keylists, std::forward(value)); + ttg::device::detail::broadcast(keylists, std::forward(value)); } } @@ -340,12 +341,12 @@ namespace ttg { // treat as tuple prepare_broadcast<0, I, Is...>(kl, std::forward>(value), t); co_await ttg::Void{}; // we'll come back once the task is done - broadcast<0, I, Is...>(kl, std::forward>(value), t); + ttg::device::detail::broadcast<0, I, Is...>(kl, std::forward>(value), t); } else if constexpr (!ttg::meta::is_tuple_v) { // create a tie to the captured keylist prepare_broadcast<0, I, Is...>(std::tie(kl), std::forward>(value), t); co_await ttg::Void{}; // we'll come back once the task is done - broadcast<0, I, Is...>(std::tie(kl), std::forward>(value), t); + ttg::device::detail::broadcast<0, I, Is...>(std::tie(kl), std::forward>(value), t); } } @@ -363,12 +364,12 @@ namespace ttg { "Size of keylist tuple must match the number of output terminals"); prepare_broadcast<0, I, Is...>(kl, std::forward>(value)); co_await ttg::Void{}; // we'll come back once the task is done - broadcast<0, I, Is...>(kl, std::forward>(value)); + ttg::device::detail::broadcast<0, I, Is...>(kl, std::forward>(value)); } else if constexpr (!ttg::meta::is_tuple_v) { // create a tie to the captured keylist prepare_broadcast<0, I, Is...>(std::tie(kl), std::forward>(value)); 
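        // Note the two-phase protocol: prepare_broadcast() above registers the
        // value with the output terminals before the coroutine suspends, and the
        // broadcast() below performs the actual sends once the runtime resumes
        // the coroutine after the device kernel has completed.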
co_await ttg::Void{}; // we'll come back once the task is done - broadcast<0, I, Is...>(std::tie(kl), std::forward>(value)); + ttg::device::detail::broadcast<0, I, Is...>(std::tie(kl), std::forward>(value)); } } } // namespace detail @@ -379,12 +380,12 @@ namespace ttg { [[nodiscard]] inline detail::send_t broadcast(rangeT &&keylist, valueT &&value, - const std::tuple...> &t) { + std::tuple...> &t) { ttg::detail::value_copy_handler copy_handler; return detail::send_t{ - broadcast_coro<0, I, Is...>(std::forward(keylist), - copy_handler(std::forward(value)), - t, std::move(copy_handler))}; + detail::broadcast_coro(std::forward(keylist), + copy_handler(std::forward(value)), + t, std::move(copy_handler))}; } /* overload with implicit terminals and keylist passed by const reference */ @@ -392,8 +393,8 @@ namespace ttg { ttg::Runtime Runtime = ttg::ttg_runtime> inline detail::send_t broadcast(rangeT &&keylist, valueT &&value) { ttg::detail::value_copy_handler copy_handler; - return detail::send_t{broadcast_coro<0, i>(std::tie(keylist), copy_handler(std::forward(value)), - std::move(copy_handler))}; + return detail::send_t{broadcast_coro(std::tie(keylist), copy_handler(std::forward(value)), + std::move(copy_handler))}; } template @@ -577,7 +578,10 @@ namespace ttg { device_task get_return_object() { return device_task{device_task_handle_type::from_promise(*this)}; } - void unhandled_exception() { } + void unhandled_exception() { + std::cerr << "Task coroutine caught an unhandled exception!" << std::endl; + throw; // fwd + } //using iterator = std::vector::iterator; diff --git a/ttg/ttg/devicescratch.h b/ttg/ttg/devicescratch.h index 37510436f..729361fa2 100644 --- a/ttg/ttg/devicescratch.h +++ b/ttg/ttg/devicescratch.h @@ -11,7 +11,7 @@ using devicescratch = TTG_IMPL_NS::devicescratch; template auto make_scratch(T* val, ttg::scope scope, std::size_t count = 1) { - return devicescratch(val, scope, 1); + return devicescratch(val, scope, count); } namespace detail { diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index 00c7c996e..aac5de428 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -19,8 +19,8 @@ namespace ttg_parsec { namespace detail { // fwd decl - template - parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db); + template + parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db); } // namespace detail /** @@ -59,7 +59,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t // nothing to be done, we don't own the memory } - friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::buffer&); + friend parsec_data_t* detail::get_parsec_data(const ttg_parsec::buffer&); allocator_type& get_allocator_reference() { return static_cast(*this); } @@ -74,7 +74,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t public: /* The device ID of the CPU. */ - static constexpr int cpu_device = 0; + static constexpr int cpu_device = -2; buffer() : buffer(nullptr, 0) { } @@ -151,15 +151,16 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t /* make sure it's a valid device */ assert(parsec_nb_devices > device_id); /* make sure it's a valid copy */ - assert(m_data->device_copies[device_id] != nullptr); - m_data->owner_device = device_id; + assert(m_data->device_copies[device_id+2] != nullptr); + m_data->owner_device = device_id+2; } /* get the current device ID, i.e., the last updated - * device buffer. */ + * device buffer. A value of -2 designates the host + * as the current device. 
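+   * (PaRSEC numbers devices internally as 0: host, 1: recursive device,
+   * 2: first accelerator; the id returned here is shifted by 2, so that
+   * id 0 is the first accelerator.)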
*/ int get_current_device() const { assert(is_valid()); - return m_data->owner_device; + return m_data->owner_device - 2; // 0: host, 1: recursive, 2: first device } /* get the current device pointer */ @@ -179,7 +180,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t */ element_type* device_ptr_on(int device_id) { assert(is_valid()); - return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id + 2)); } /* get the device pointer at the given device @@ -187,20 +188,20 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t */ const element_type* device_ptr_on(int device_id) const { assert(is_valid()); - return static_cast(parsec_data_get_ptr(m_data.get(), device_id)); + return static_cast(parsec_data_get_ptr(m_data.get(), device_id + 2)); // GPUs start at 2 } element_type* host_ptr() { - return device_ptr_on(cpu_device); + return static_cast(parsec_data_get_ptr(m_data.get(), 0)); } const element_type* host_ptr() const { - return device_ptr_on(cpu_device); + return static_cast(parsec_data_get_ptr(m_data.get(), 0)); } bool is_valid_on(int device_id) const { assert(is_valid()); - return (parsec_data_get_ptr(m_data.get(), device_id) != nullptr); + return (parsec_data_get_ptr(m_data.get(), device_id+2) != nullptr); } void allocate_on(int device_id) { @@ -222,7 +223,7 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t /* Unpin the memory on all devices we currently track. */ void unpin() { if (!is_valid()) return; - for (int i = 1; i < parsec_nb_devices; ++i) { + for (int i = 0; i < parsec_nb_devices-2; ++i) { unpin_on(i); } } @@ -362,20 +363,20 @@ template struct is_buffer : std::false_type { }; -template -struct is_buffer> : std::true_type +template +struct is_buffer> : std::true_type { }; -template -struct is_buffer> : std::true_type +template +struct is_buffer> : std::true_type { }; template constexpr static const bool is_buffer_v = is_buffer::value; namespace detail { - template - parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db) { + template + parsec_data_t* get_parsec_data(const ttg_parsec::buffer& db) { return const_cast(db.m_data.get()); } } // namespace detail diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 288545dbb..2bca31533 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -113,6 +113,8 @@ #include "ttg/device/cublas_helper.h" #include "ttg/parsec/parsec-ext.h" +#include "ttg/device/device.h" + #undef TTG_PARSEC_DEBUG_TRACK_DATA_COPIES /* PaRSEC function declarations */ @@ -1323,15 +1325,35 @@ namespace ttg_parsec { dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL); #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDART) - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; - ttg::detail::cublas_set_kernel_stream(cuda_stream->cuda_stream); + { + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; + ttg::detail::cublas_set_kernel_stream(cuda_stream->cuda_stream); + } #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT +#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) + { + parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; + int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU + ttg::device::detail::set_current(device, cuda_stream->cuda_stream); + } +#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) + #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIPBLAS) - 
parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; - ttg::detail::hipblas_set_kernel_stream(hip_stream->hip_stream); + { + parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; + ttg::detail::hipblas_set_kernel_stream(hip_stream->hip_stream); + } #endif // PARSEC_HAVE_DEV_HIP_SUPPORT +#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) + { + parsec_cuda_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; + int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU + ttg::device::detail::set_current(device, hip_stream->hip_stream); + } +#endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) + /* Here we call back into the coroutine again after the transfers have completed */ static_op(&task->parsec_task); From b60577c62430b2b41e44b1898f4f3ab733430d57 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Tue, 3 Oct 2023 17:51:01 +0000 Subject: [PATCH 08/17] Fix release of data copies in buffer This fixes a segfault in multi-gpu operations. We should not destroy the full data_t but only remove the host-side copy and release our reference on the data. The data will be destroyed once all gpu copies have been removed. Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/devicefunc.h | 4 ++-- ttg/ttg/parsec/ttg_data_copy.h | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index d250e060e..703b42fb4 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -117,7 +117,7 @@ namespace ttg_parsec { /* enqueue the transfer into the compute stream to come back once the compute and transfer are complete */ #if defined(TTG_HAVE_CUDART) && defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) - std::cout << "cudaMemcpyAsync of " << data->nb_elts << "B" << std::endl; + //std::cout << "cudaMemcpyAsync of " << data->nb_elts << "B from " << data->device_copies[data->owner_device]->device_private << " device " << (int)data->owner_device << " to " << data->device_copies[0]->device_private << std::endl; parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)stream; cudaMemcpyAsync(data->device_copies[0]->device_private, data->device_copies[data->owner_device]->device_private, @@ -176,4 +176,4 @@ namespace ttg_parsec { } // namespace ttg_parsec -#endif // TTG_PARSEC_DEVICEFUNC_H \ No newline at end of file +#endif // TTG_PARSEC_DEVICEFUNC_H diff --git a/ttg/ttg/parsec/ttg_data_copy.h b/ttg/ttg/parsec/ttg_data_copy.h index 27814ecf5..9d21cc2d1 100644 --- a/ttg/ttg/parsec/ttg_data_copy.h +++ b/ttg/ttg/parsec/ttg_data_copy.h @@ -62,9 +62,16 @@ namespace ttg_parsec { status = cudaHostUnregister(data->device_copies[0]->device_private); assert(cudaSuccess == status); data->device_copies[0]->flags ^= TTG_PARSEC_DATA_FLAG_REGISTERED; - } + } #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT - parsec_data_destroy(data); + //std::fprintf(stderr, "parsec_data_destroy %p device_copy[0] %p\n", data, data->device_copies[0]); + //parsec_data_destroy(data); + assert(data->device_copies[0] != nullptr); + auto copy = data->device_copies[0]; + parsec_data_copy_detach(data, data->device_copies[0], 0); + PARSEC_OBJ_RELEASE(copy); + PARSEC_OBJ_RELEASE(data); + } static void delete_null_parsec_data(parsec_data_t *) { From a2f4bbeed5f17cc433354ed88e7385e8fad3dd3b Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 4 Oct 2023 01:08:13 +0000 Subject: [PATCH 09/17] Enable POTRF on hip Signed-off-by: Joseph 
Schuchart --- CMakeLists.txt | 5 +++++ examples/devblas_helper.h | 6 +++--- examples/potrf/potrf.h | 12 ++++++++---- examples/spmm/spmm_cuda.cc | 6 ++++-- ttg/ttg/device/cublas_helper.cpp | 2 +- ttg/ttg/parsec/devicefunc.h | 5 +++++ ttg/ttg/parsec/ttg.h | 4 ++++ 7 files changed, 30 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06169f4e8..b6b9c39fe 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,6 +120,11 @@ if (TTG_ENABLE_HIP) if (TARGET roc::hipblas) set(TTG_HAVE_HIPBLAS True CACHE BOOL "TTG detected support for hipBLAS") endif() + + find_package(hipsolver) + if (TARGET roc::hipsolver) + set(TTG_HAVE_HIPSOLVER True CACHE BOOL "TTG detected support for hipSolver") + endif() add_compile_definitions(${TTG_HIP_PLATFORM}) endif(TTG_ENABLE_HIP) diff --git a/examples/devblas_helper.h b/examples/devblas_helper.h index 704a2bc85..10c420111 100644 --- a/examples/devblas_helper.h +++ b/examples/devblas_helper.h @@ -108,9 +108,9 @@ inline const cusolverDnHandle_t& cusolver_handle() { #ifdef TTG_HAVE_HIPBLAS -#include -#include -#include +#include +#include +#include /// \brief Returns the rocBLAS handle to be used for launching rocBLAS kernels from the current thread /// \return the rocBLAS handle for the current thread diff --git a/examples/potrf/potrf.h b/examples/potrf/potrf.h index 50a57a766..24bdd44d2 100644 --- a/examples/potrf/potrf.h +++ b/examples/potrf/potrf.h @@ -15,8 +15,8 @@ #elif defined(TTG_HAVE_HIP) #define ES ttg::ExecutionSpace::HIP #define TASKRET -> ttg::device_task -#include -#include +#include +#include #else #define ES ttg::ExecutionSpace::Host #define TASKRET -> void @@ -39,7 +39,11 @@ namespace potrf { &Lwork); return Lwork; #elif defined(TTG_HAVE_HIPBLAS) - #error TBCoded + hipsolverDnDpotrf_bufferSize(hipsolver_handle(), + HIPSOLVER_FILL_MODE_LOWER, A.cols(), + nullptr, A.lda(), + &Lwork); + return Lwork; #else return 0; #endif @@ -48,7 +52,7 @@ namespace potrf { static void device_potrf(MatrixTile &A, double *workspace, int Lwork, int *devInfo) { int device = ttg::device::current_device(); assert(device >= 0); - #if defined(TTG_HAVE_CUDA) +#if defined(TTG_HAVE_CUDA) //std::cout << "POTRF A " << A.buffer().device_ptr_on(device) << " device " << device << " cols " << A.cols() << " lda " << A.lda() << " Lwork " << Lwork << " WS " << workspace << " devInfo " << devInfo << std::endl; auto handle = cusolver_handle(); std::cout << "POTRF handle " << handle << " device " << device << " stream " << ttg::device::current_stream() << std::endl; diff --git a/examples/spmm/spmm_cuda.cc b/examples/spmm/spmm_cuda.cc index 05a6599b9..3f6ef0dec 100644 --- a/examples/spmm/spmm_cuda.cc +++ b/examples/spmm/spmm_cuda.cc @@ -37,6 +37,8 @@ #include "ttg.h" +#include "../devblas_helper.h" + using namespace ttg; #include "ttg/util/future.h" @@ -273,13 +275,13 @@ static void device_gemm(blk_t &C, const blk_t &A, const blk_t &B) { int device = C.b.get_current_device(); assert(device != 0); #if defined(TTG_HAVE_CUDA) - cublasDgemm(ttg::detail::cublas_get_handle(), + cublasDgemm(cublas_handle(), CUBLAS_OP_N, CUBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1), &alpha, A.b.device_ptr_on(device), A.extent(0), B.b.device_ptr_on(device), B.extent(0), &beta, C.b.current_device_ptr(), C.extent(0)); #elif defined(TTG_HAVE_HIPBLAS) - hipblasDgemm(ttg::detail::hipblas_get_handle(), + hipblasDgemm(hipblas_handle(), HIPBLAS_OP_N, HIPBLAS_OP_N, C.extent(0), C.extent(1), A.extent(1), &alpha, A.b.device_ptr_on(device), A.extent(0), diff --git 
a/ttg/ttg/device/cublas_helper.cpp b/ttg/ttg/device/cublas_helper.cpp index 24e34f52c..768411282 100644 --- a/ttg/ttg/device/cublas_helper.cpp +++ b/ttg/ttg/device/cublas_helper.cpp @@ -10,7 +10,7 @@ #endif // TTG_HAVE_CUDART #ifdef TTG_HAVE_HIPBLAS -#include +#include #endif // TTG_HAVE_HIPBLAS namespace ttg::detail { diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index 703b42fb4..8686588f4 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -122,6 +122,11 @@ namespace ttg_parsec { cudaMemcpyAsync(data->device_copies[0]->device_private, data->device_copies[data->owner_device]->device_private, data->nb_elts, cudaMemcpyDeviceToHost, cuda_stream->cuda_stream); +#elif defined(TTG_HAVE_HIP) && defined(PARSEC_HAVE_DEV_HIP_SUPPORT) + parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)stream; + hipMemcpyAsync(data->device_copies[0]->device_private, + data->device_copies[data->owner_device]->device_private, + data->nb_elts, hipMemcpyDeviceToHost, hip_stream->hip_stream); #else static_assert(DeviceAvail, "No device implementation detected!"); #endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 2bca31533..422e9f1ef 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1324,12 +1324,14 @@ namespace ttg_parsec { assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER || dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL); +#if 0 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDART) { parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; ttg::detail::cublas_set_kernel_stream(cuda_stream->cuda_stream); } #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT +#endif //0 #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) { @@ -1339,12 +1341,14 @@ namespace ttg_parsec { } #endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) +#if 0 #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIPBLAS) { parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; ttg::detail::hipblas_set_kernel_stream(hip_stream->hip_stream); } #endif // PARSEC_HAVE_DEV_HIP_SUPPORT +#endif // 0 #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) { From 965433580334b87d46f7e197fe3f9c47a5929181 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 4 Oct 2023 02:41:55 +0000 Subject: [PATCH 10/17] POTRF: add -n parameters to control number of iterations Signed-off-by: Joseph Schuchart --- examples/potrf/testing_dpotrf.cc | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 76055b4d4..402f50dd9 100644 --- a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -36,6 +36,7 @@ int main(int argc, char **argv) const char* prof_filename = nullptr; char *opt = nullptr; int ret = EXIT_SUCCESS; + int niter = 3; if( (opt = getCmdOption(argv+1, argv+argc, "-N")) != nullptr ) { N = M = atoi(opt); @@ -53,6 +54,10 @@ int main(int argc, char **argv) prof_filename = opt; } + if( (opt = getCmdOption(argv+1, argv+argc, "-n")) != nullptr) { + niter = atoi(opt); + } + bool check = !cmdOptionExists(argv+1, argv+argc, "-x"); bool cow_hint = !cmdOptionExists(argv+1, argv+argc, "-w"); @@ -128,16 +133,23 @@ int main(int argc, char **argv) std::cout << "==== end dot ====\n"; beg = std::chrono::high_resolution_clock::now(); } - init_tt->invoke(); - ttg::execute(world); - 
ttg::fence(world); - if (world.rank() == 0) { - end = std::chrono::high_resolution_clock::now(); - auto elapsed = (std::chrono::duration_cast(end - beg).count()); - end = std::chrono::high_resolution_clock::now(); - std::cout << "TTG Execution Time (milliseconds) : " - << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; + for (int i = 0; i < niter; ++i) { + if (world.rank() == 0) { + beg = std::chrono::high_resolution_clock::now(); + } + + init_tt->invoke(); + ttg::execute(world); + ttg::fence(world); + + if (world.rank() == 0) { + end = std::chrono::high_resolution_clock::now(); + auto elapsed = (std::chrono::duration_cast(end - beg).count()); + end = std::chrono::high_resolution_clock::now(); + std::cout << "TTG Execution Time (milliseconds) : " + << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; + } } world.dag_off(); From 410dc23120fdf7dbdb9147a0957f39e04464c78c Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Wed, 4 Oct 2023 02:43:11 +0000 Subject: [PATCH 11/17] Minor fixes to compile on Frontier --- examples/devblas_helper.h | 46 +++++++------------------------------ ttg/ttg/parsec/devicefunc.h | 2 ++ ttg/ttg/parsec/ttg.h | 2 +- 3 files changed, 11 insertions(+), 39 deletions(-) diff --git a/examples/devblas_helper.h b/examples/devblas_helper.h index 10c420111..349025209 100644 --- a/examples/devblas_helper.h +++ b/examples/devblas_helper.h @@ -11,42 +11,10 @@ #include #include -namespace detail { - template - struct device_stream_t { - int device; - Stream stream; - device_stream_t(int device, Stream stream) - : device(device) - , stream(stream) - { } - - bool operator<(const device_stream_t& ds) const { - bool result = ((device < ds.device) && (reinterpret_cast(stream) < reinterpret_cast(ds.stream))); - std::cout << *this << " < " << ds << ": " << result << std::endl; - return result; - } - - bool operator==(const device_stream_t& ds) const { - bool result = ((device == ds.device) && (stream == ds.stream)); - std::cout << *this << " == " << ds << ": " << result << std::endl; - return result; - } - }; -} // namespace detail - -namespace std { -template - std::ostream& operator<<(std::ostream& os, const ::detail::device_stream_t& ds) { - os << "[" << ds.device << ", " << ds.stream << "]"; - return os; - } - -} //namespace std - /// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread /// \return the cuBLAS handle for the current thread -inline const cublasHandle_t& cublas_handle() { +template +inline const cublasHandle_t& cublas_handle(T _ = 0) { using map_type = std::map, cublasHandle_t>; static thread_local map_type handles; @@ -72,9 +40,9 @@ inline const cublasHandle_t& cublas_handle() { return it->second; } -inline const cusolverDnHandle_t& cusolver_handle() { +template +inline const cusolverDnHandle_t& cusolver_handle(T _ = 0) { - //using map_type = std::map, cusolverDnHandle_t>; using map_type = std::map, cusolverDnHandle_t>; static thread_local map_type handles; @@ -114,7 +82,8 @@ inline const cusolverDnHandle_t& cusolver_handle() { /// \brief Returns the rocBLAS handle to be used for launching rocBLAS kernels from the current thread /// \return the rocBLAS handle for the current thread -const hipblasHandle_t& hipblas_handle() { +template +const hipblasHandle_t& hipblas_handle(T _ = 0) { static thread_local std::map handles; int device = 
ttg::device::current_device(); std::map::iterator it; @@ -136,7 +105,8 @@ const hipblasHandle_t& hipblas_handle() { /// \brief Returns the hipsolver handle to be used for launching rocBLAS kernels from the current thread /// \return the hipsolver handle for the current thread -const hipsolverDnHandle_t& hipsolver_handle() { +template +const hipsolverDnHandle_t& hipsolver_handle(T _ = 0) { static thread_local std::map handles; int device = ttg::device::current_device(); std::map::iterator it; diff --git a/ttg/ttg/parsec/devicefunc.h b/ttg/ttg/parsec/devicefunc.h index 8686588f4..325750e04 100644 --- a/ttg/ttg/parsec/devicefunc.h +++ b/ttg/ttg/parsec/devicefunc.h @@ -11,6 +11,8 @@ #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) #include +#elif defined(PARSEC_HAVE_DEV_HIP_SUPPORT) +#include #endif // PARSEC_HAVE_DEV_CUDA_SUPPORT namespace ttg_parsec { diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index 422e9f1ef..ea2063d56 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -1352,7 +1352,7 @@ namespace ttg_parsec { #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) { - parsec_cuda_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; + parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; int device = gpu_device->super.device_index - 2; // 0: host, 1: recursive, 2: first GPU ttg::device::detail::set_current(device, hip_stream->hip_stream); } From 5b81ac4dc380af7473c33fecbbd6f2ef64f614f5 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 5 Oct 2023 15:12:41 +0000 Subject: [PATCH 12/17] buffer: cannot capture this pointer for deleter and move, use raw pointer instead Signed-off-by: Joseph Schuchart --- ttg/ttg/parsec/buffer.h | 60 ++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/ttg/ttg/parsec/buffer.h b/ttg/ttg/parsec/buffer.h index aac5de428..0309e7883 100644 --- a/ttg/ttg/parsec/buffer.h +++ b/ttg/ttg/parsec/buffer.h @@ -51,9 +51,10 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t private: using delete_fn_t = std::function; - using host_data_ptr = std::unique_ptr; - host_data_ptr m_host_data; + using host_data_ptr = std::add_pointer_t; + host_data_ptr m_host_data = nullptr; std::size_t m_count = 0; + bool m_owned= false; static void delete_non_owned(element_type *ptr) { // nothing to be done, we don't own the memory @@ -67,8 +68,8 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t return allocator_traits::allocate(get_allocator_reference(), n); } - void deallocate(element_type *ptr, std::size_t n) { - allocator_traits::deallocate(get_allocator_reference(), ptr, n); + void deallocate() { + allocator_traits::deallocate(get_allocator_reference(), m_host_data, m_count); } public: @@ -82,14 +83,15 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t buffer(std::size_t n) : ttg_parsec_data_wrapper_t() , allocator_type() - , m_host_data(allocate(n), [this, n](element_type* ptr){ deallocate(ptr, n); }) + , m_host_data(allocate(n)) , m_count(n) + , m_owned(true) { //std::cout << "buffer " << this << " ctor count " // << count << "(" << m_host_data.get() << ") ttg_copy " // << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; - this->reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); + this->reset_parsec_data(m_host_data, n*sizeof(element_type)); } /* Constructing a buffer using application-managed memory. 
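The rationale in the commit message above, as a minimal standalone sketch (the `holder` type and its members are illustrative, not part of the TTG sources): a smart-pointer deleter that captures `this` still refers to the moved-from object after a move, so the moved-to object carries a dangling `this`.

#include <cstddef>
#include <functional>
#include <memory>

struct holder {
  std::size_t n;
  // BAD: the deleter captures `this`. After `holder b = std::move(a);`
  // the deleter moved into b still points at a, which may be destroyed
  // (or reused) before the data is freed.
  std::unique_ptr<double[], std::function<void(double*)>> data;
  explicit holder(std::size_t count)
      : n(count),
        data(new double[count],
             [this](double* p) { delete[] p; /* stands in for this->deallocate(p, n) */ }) {}
};

The rewrite above therefore stores a raw pointer plus an m_owned flag, and calls deallocate() explicitly from the destructor, the move operations, and reset().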
@@ -98,13 +100,18 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t buffer(element_type* ptr, std::size_t n = 1) : ttg_parsec_data_wrapper_t() , allocator_type() - , m_host_data(ptr, &delete_non_owned) + , m_host_data(ptr) , m_count(n) + , m_owned(false) { - this->reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); + this->reset_parsec_data(m_host_data, n*sizeof(element_type)); } virtual ~buffer() { + if (m_owned) { + deallocate(); + m_owned = false; + } unpin(); // make sure the copies are not pinned } @@ -112,10 +119,13 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t buffer(buffer&& db) : ttg_parsec_data_wrapper_t(std::move(db)) , allocator_type(std::move(db)) - , m_host_data(std::move(db.m_host_data)) + , m_host_data(db.m_host_data) , m_count(db.m_count) + , m_owned(db.m_owned) { + db.m_host_data = nullptr; db.m_count = 0; + db.m_owned = false; } /* explicitly disable copying of buffers @@ -127,9 +137,9 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t buffer& operator=(buffer&& db) { ttg_parsec_data_wrapper_t::operator=(std::move(db)); allocator_type::operator=(std::move(db)); - m_host_data = std::move(db.m_host_data); - m_count = db.m_count; - db.m_count = 0; + std::swap(m_host_data, db.m_host_data); + std::swap(m_count, db.m_count); + std::swap(m_owned, db.m_owned); //std::cout << "buffer " << this << " other " << &db << " mv op ttg_copy " << m_ttg_copy << std::endl; //std::cout << "buffer::move-assign from " << &db << " ttg-copy " << db.m_ttg_copy // << " to " << this << " ttg-copy " << m_ttg_copy @@ -255,12 +265,20 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t /* TODO: can we resize if count is smaller than m_count? */ /* drop the current data and reallocate */ reset(); + + if (m_owned) { + deallocate(); + m_owned = false; + } + if (n == 0) { - m_host_data = host_data_ptr(nullptr, &delete_non_owned); + m_host_data = nullptr; + m_owned = false; } else { - m_host_data = host_data_ptr(allocate(n), [this, n](element_type* ptr){ deallocate(ptr, n); }); + m_host_data = allocate(n); + m_owned = true; } - reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); + reset_parsec_data(m_host_data, n*sizeof(element_type)); //std::cout << "buffer::reset(" << count << ") ptr " << m_host_data.get() // << " ttg_copy " << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; @@ -274,14 +292,20 @@ struct buffer : public detail::ttg_parsec_data_wrapper_t return; } + if (m_owned) { + deallocate(); + } + if (nullptr == ptr) { - m_host_data = host_data_ptr(nullptr, &delete_non_owned); + m_host_data = nullptr; m_count = 0; + m_owned = false; } else { - m_host_data = host_data_ptr(ptr, &delete_non_owned); + m_host_data = ptr; m_count = n; + m_owned = false; } - reset_parsec_data(m_host_data.get(), n*sizeof(element_type)); + reset_parsec_data(m_host_data, n*sizeof(element_type)); //std::cout << "buffer::reset(" << ptr << ", " << count << ") ptr " << m_host_data.get() // << " ttg_copy " << m_ttg_copy // << " parsec_data " << m_data.get() << std::endl; From 403b06b575cbddb6de08c1880cf5acb823d8afca Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 5 Oct 2023 15:15:04 +0000 Subject: [PATCH 13/17] POTRF: construct whole TTG each iteration Signed-off-by: Joseph Schuchart --- examples/potrf/testing_dpotrf.cc | 68 ++++++++++++++++---------------- 1 file changed, 35 insertions(+), 33 deletions(-) diff --git a/examples/potrf/testing_dpotrf.cc b/examples/potrf/testing_dpotrf.cc index 402f50dd9..4ea3b9167 100644 --- 
a/examples/potrf/testing_dpotrf.cc +++ b/examples/potrf/testing_dpotrf.cc @@ -96,45 +96,46 @@ int main(int argc, char **argv) parsec_data_collection_set_key((parsec_data_collection_t*)&dcA, (char*)"Matrix A"); if(!check) { - ttg::Edge startup("startup"); - ttg::Edge> topotrf("To POTRF"); - ttg::Edge> result("To result"); - - //Matrix* A = new Matrix(n_rows, n_cols, NB, NB); - MatrixT A{&dcA}; - /* TODO: initialize the matrix */ - /* This works only with the parsec backend! */ - int random_seed = 3872; - - auto init_tt = ttg::make_tt([&](std::tuple>& out) { - for(int i = 0; i < A.rows(); i++) { - for(int j = 0; j <= i && j < A.cols(); j++) { - if(A.is_local(i, j)) { - if(ttg::tracing()) ttg::print("init(", Key2{i, j}, ")"); - ttg::sendk<0>(Key2{i, j}, out); + for (int i = 0; i < niter; ++i) { + parsec_devices_release_memory(); + ttg::Edge startup("startup"); + ttg::Edge> topotrf("To POTRF"); + ttg::Edge> result("To result"); + + //Matrix* A = new Matrix(n_rows, n_cols, NB, NB); + MatrixT A{&dcA}; + /* TODO: initialize the matrix */ + /* This works only with the parsec backend! */ + int random_seed = 3872; + + auto init_tt = ttg::make_tt([&](std::tuple>& out) { + for(int i = 0; i < A.rows(); i++) { + for(int j = 0; j <= i && j < A.cols(); j++) { + if(A.is_local(i, j)) { + if(ttg::tracing()) ttg::print("init(", Key2{i, j}, ")"); + ttg::sendk<0>(Key2{i, j}, out); + } } } - } - }, ttg::edges(), ttg::edges(startup), "Startup Trigger", {}, {"startup"}); - init_tt->set_keymap([&]() {return world.rank();}); + }, ttg::edges(), ttg::edges(startup), "Startup Trigger", {}, {"startup"}); + init_tt->set_keymap([&]() {return world.rank();}); - auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint); - auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint); - auto result_ttg = make_result_ttg(A, result, cow_hint); + auto plgsy_ttg = make_plgsy_ttg(A, N, random_seed, startup, topotrf, cow_hint); + auto potrf_ttg = potrf::make_potrf_ttg(A, topotrf, result, cow_hint); + auto result_ttg = make_result_ttg(A, result, cow_hint); - auto connected = make_graph_executable(init_tt.get()); - assert(connected); - TTGUNUSED(connected); - std::cout << "Graph is connected: " << connected << std::endl; + auto connected = make_graph_executable(init_tt.get()); + assert(connected); + TTGUNUSED(connected); + std::cout << "Graph is connected: " << connected << std::endl; - if (world.rank() == 0) { - std::cout << "==== begin dot ====\n"; - std::cout << ttg::Dot()(init_tt.get()) << std::endl; - std::cout << "==== end dot ====\n"; - beg = std::chrono::high_resolution_clock::now(); - } + if (world.rank() == 0) { + std::cout << "==== begin dot ====\n"; + std::cout << ttg::Dot()(init_tt.get()) << std::endl; + std::cout << "==== end dot ====\n"; + beg = std::chrono::high_resolution_clock::now(); + } - for (int i = 0; i < niter; ++i) { if (world.rank() == 0) { beg = std::chrono::high_resolution_clock::now(); } @@ -150,6 +151,7 @@ int main(int argc, char **argv) std::cout << "TTG Execution Time (milliseconds) : " << elapsed / 1E3 << " : Flops " << (potrf::FLOPS_DPOTRF(N)) << " " << (potrf::FLOPS_DPOTRF(N)/1e9)/(elapsed/1e6) << " GF/s" << std::endl; } + parsec_devices_reset_load(ttg::default_execution_context().impl().context()); } world.dag_off(); From 1998655dfaafb564ec089f26fa2e31b38266d646 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Thu, 5 Oct 2023 15:19:22 +0000 Subject: [PATCH 14/17] Add TTG_ENABLE_ASAN CMake option Signed-off-by: Joseph Schuchart --- CMakeLists.txt | 6 ++++++ 1 file 
changed, 6 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b6b9c39fe..3a65a6335 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(TTG_PARSEC_USE_BOOST_SERIALIZATION "Whether to select Boost serialization option(TTG_ENABLE_CUDA "Whether to TTG will look for CUDA" OFF) option(TTG_ENABLE_HIP "Whether to TTG will look for HIP" OFF) option(TTG_EXAMPLES "Whether to build examples" OFF) +option(TTG_ENABLE_ASAN "Whether to enable address sanitizer" OFF) option(TTG_FETCH_BOOST "Whether to fetch+build Boost, if missing" OFF) option(TTG_IGNORE_BUNDLED_EXTERNALS "Whether to skip installation and use of bundled external depenedencies (Boost.CallableTraits)" OFF) @@ -68,6 +69,11 @@ if (FORCE_COLORED_OUTPUT) endif () endif (FORCE_COLORED_OUTPUT) +if (TTG_ENABLE_ASAN) + add_compile_options(-fsanitize=address) + add_link_options(-fsanitize=address) +endif (TTG_ENABLE_ASAN) + set(TTG_HIP_PLATFORM "__HIP_PLATFORM_AMD__" CACHE STRING "Which platform to use when compiling HIP-related code (default: __HIP_PLATFORM_AMD__)") ########################## #### prerequisites From 548b65bd4664a0ff7faecc951eb9ed0c512be150 Mon Sep 17 00:00:00 2001 From: Joseph Schuchart Date: Fri, 6 Oct 2023 14:33:17 +0000 Subject: [PATCH 15/17] Chain benchmark: Disable wait_kernel co_yield Signed-off-by: Joseph Schuchart --- examples/task-benchmarks/chain-ttg-dev.cc | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/examples/task-benchmarks/chain-ttg-dev.cc b/examples/task-benchmarks/chain-ttg-dev.cc index 5ccf1297d..ac96e16b3 100644 --- a/examples/task-benchmarks/chain-ttg-dev.cc +++ b/examples/task-benchmarks/chain-ttg-dev.cc @@ -11,7 +11,7 @@ #error "Either CUDA OR HIP is required to build this test!" #endif // 0 -#define NUM_TASKS 10000 +#define NUM_TASKS 100000 using namespace ttg; @@ -56,7 +56,7 @@ auto make_ttg<1>(bool do_move) { auto next = make_tt([=](const int &key, auto&& value) -> ttg::device_task { //++task_counter; co_await ttg::to_device(value.b); - co_await ttg::wait_kernel(); // empty kernel + //co_await ttg::wait_kernel(); // empty kernel if (key < NUM_TASKS) { if (do_move) { co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(value))); @@ -83,7 +83,7 @@ auto make_ttg<2>(bool do_move) { auto next = make_tt([=](const int &key, A&& v1, A&& v2) -> ttg::device_task { co_await ttg::to_device(v1.b, v2.b); - co_await ttg::wait_kernel(); // empty kernel + //co_await ttg::wait_kernel(); // empty kernel if (key < NUM_TASKS) { if (do_move) { co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), @@ -114,7 +114,7 @@ auto make_ttg<4>(bool do_move) { auto next = make_tt([=](const int &key, A&& v1, A&& v2, A&& v3, A&& v4) -> ttg::device_task { co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b); - co_await ttg::wait_kernel(); // empty kernel + //co_await ttg::wait_kernel(); // empty kernel if (key < NUM_TASKS) { if (do_move) { co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), @@ -155,7 +155,7 @@ auto make_ttg<8>(bool do_move) { auto next = make_tt([=](const int &key, auto&& v1, auto&& v2, auto&& v3, auto&& v4, auto&& v5, auto&& v6, auto&& v7, auto&& v8) -> ttg::device_task { co_await ttg::to_device(v1.b, v2.b, v3.b, v4.b, v5.b, v6.b, v7.b, v8.b); - co_await ttg::wait_kernel(); // empty kernel + //co_await ttg::wait_kernel(); // empty kernel if (key < NUM_TASKS) { if (do_move) { co_await ttg::device::forward(ttg::device::send<0>(key+1, std::move(v1)), @@ -193,7 +193,7 @@ auto make_ttg<0>(bool do_move) 
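 // zero-flow variant: tasks carry only the key, no buffers move on or off the device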
 {
   auto next = make_tt([](const int& key) -> ttg::device_task {
     co_await ttg::to_device();
-    co_await ttg::wait_kernel();
+    //co_await ttg::wait_kernel();
     if (key < NUM_TASKS) {
       co_await ttg::device::forward(ttg::device::sendk<0>(key+1));
     }
@@ -211,6 +211,11 @@ void run_bench(bool do_move)
   assert(connected);
   std::cout << "Graph " << num_flows << " is connected.\n";
 
+  if (ttg::default_execution_context().rank() == 0) init->invoke();
+
+  ttg_execute(ttg_default_execution_context());
+  ttg_fence(ttg_default_execution_context());
+
   auto t0 = now();
   if (ttg::default_execution_context().rank() == 0) init->invoke();
@@ -218,8 +223,8 @@ void run_bench(bool do_move)
   ttg_fence(ttg_default_execution_context());
   auto t1 = now();
 
-  std::cout << "# of tasks = " << task_counter.load() << std::endl;
-  std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1) << std::endl;
+  std::cout << "# of tasks = " << NUM_TASKS << std::endl;
+  std::cout << "time elapsed (microseconds) = " << duration_in_mus(t0, t1) << ", avg " << duration_in_mus(t0, t1) / (double)NUM_TASKS << std::endl;
 }
 
 int main(int argc, char* argv[]) {
@@ -242,7 +247,7 @@ int main(int argc, char* argv[]) {
     case 2: run_bench<2>(do_move); break;
     case 4: run_bench<4>(do_move); break;
     case 8: run_bench<8>(do_move); break;
-    default: std::cout << "Unsupported number of flows: " << num_flows << std::endl;
+    default: std::cout << "Unsupported number of flows: " << NUM_TASKS << std::endl;
   }
 
   ttg_finalize();

From fdf6e2e1218ea81661606da390df38d6b36e459d Mon Sep 17 00:00:00 2001
From: Joseph Schuchart
Date: Fri, 6 Oct 2023 14:33:53 +0000
Subject: [PATCH 16/17] PaRSEC: Increase max AM size to 1MB

---
 ttg/ttg/parsec/ttg.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h
index ea2063d56..c942940b1 100644
--- a/ttg/ttg/parsec/ttg.h
+++ b/ttg/ttg/parsec/ttg.h
@@ -240,7 +240,7 @@ namespace ttg_parsec {
     int parsec_ttg_profile_backend_allocate_datacopy, parsec_ttg_profile_backend_free_datacopy;
 #endif
 
-    static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 4 * 1024;
+    static constexpr const int PARSEC_TTG_MAX_AM_SIZE = 1 * 1024*1024;
 
     WorldImpl(int *argc, char **argv[], int ncores, parsec_context_t *c = nullptr)
         : WorldImplBase(query_comm_size(), query_comm_rank())
         , ctx(c)

From 9d21225a774c461470cecac424f3006000bcac34 Mon Sep 17 00:00:00 2001
From: Joseph Schuchart
Date: Fri, 6 Oct 2023 14:39:58 +0000
Subject: [PATCH 17/17] Remove TTG cublas_helper, superseded by devblas helper
 in examples

Signed-off-by: Joseph Schuchart
---
 ttg/CMakeLists.txt               | 15 +-----
 ttg/ttg/device/cublas_helper.cpp | 81 --------------------------------
 ttg/ttg/device/cublas_helper.h   | 35 --------------
 ttg/ttg/parsec/ttg.h             | 19 --------
 4 files changed, 1 insertion(+), 149 deletions(-)
 delete mode 100644 ttg/ttg/device/cublas_helper.cpp
 delete mode 100644 ttg/ttg/device/cublas_helper.h

diff --git a/ttg/CMakeLists.txt b/ttg/CMakeLists.txt
index 476118524..427a3ffee 100644
--- a/ttg/CMakeLists.txt
+++ b/ttg/CMakeLists.txt
@@ -68,7 +68,6 @@ set(ttg-impl-headers
     ${CMAKE_CURRENT_BINARY_DIR}/ttg/config.h
     )
 set(ttg_device_headers
-    ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/cublas_helper.h
     ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/device.h
     ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/task.h
   )
@@ -117,23 +116,11 @@ if (TARGET std::coroutine)
     )
 endif ()
 if (TTG_HAVE_CUDA)
-  list(APPEND ttg-sources
-      ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/cublas_helper.cpp
-    )
-  list(APPEND ttg-deps CUDA::cudart CUDA::cublas)
+  list(APPEND ttg-deps
CUDA::cudart) endif (TTG_HAVE_CUDA) if (TTG_HAVE_HIPBLAS) - # add the helper file if not added already - if (NOT TTG_HAVE_CUDA) - list(APPEND ttg-sources - ${CMAKE_CURRENT_SOURCE_DIR}/ttg/device/cublas_helper.cpp - ) - endif (NOT TTG_HAVE_CUDA) - list(APPEND ttg-deps roc::hipblas) - find_package(hip REQUIRED) list(APPEND ttg-deps hip::host) - endif (TTG_HAVE_HIPBLAS) set(ttg-public-headers ${ttg-headers};${ttg-impl-headers};${ttg-base-headers};${ttg-util-headers};${ttg_device_headers}) diff --git a/ttg/ttg/device/cublas_helper.cpp b/ttg/ttg/device/cublas_helper.cpp deleted file mode 100644 index 768411282..000000000 --- a/ttg/ttg/device/cublas_helper.cpp +++ /dev/null @@ -1,81 +0,0 @@ -#include "ttg/config.h" -#include "ttg/device/cublas_helper.h" - -#include -#include -#include -#include -#ifdef TTG_HAVE_CUDART -#include -#endif // TTG_HAVE_CUDART - -#ifdef TTG_HAVE_HIPBLAS -#include -#endif // TTG_HAVE_HIPBLAS - -namespace ttg::detail { - -#ifdef TTG_HAVE_CUDART -/// \brief Returns the cuBLAS handle to be used for launching cuBLAS kernels from the current thread -/// \return the cuBLAS handle for the current thread -const cublasHandle_t& cublas_get_handle() { - static thread_local std::map handles; - int device; - if (cudaSuccess != cudaGetDevice(&device)){ - throw std::runtime_error("cudaGetDevice failed"); - } - std::map::iterator it; - if ((it = handles.find(device)) == handles.end()){ - cublasHandle_t handle; - auto status = cublasCreate_v2(&handle); - if (CUBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("cublasCreate_v2 failed"); - } - auto [iterator, success] = handles.insert({device, handle}); - it = iterator; - } - - return it->second; -} - -void cublas_set_kernel_stream(cudaStream_t stream) { - cublasStatus_t status = cublasSetStream_v2(cublas_get_handle(), stream); - if (CUBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("cublasSetStream_v2 failed"); - } -} -#endif // TTG_HAVE_CUDART - -#ifdef TTG_HAVE_HIPBLAS -/// \brief Returns the rocBLAS handle to be used for launching rocBLAS kernels from the current thread -/// \return the rocBLAS handle for the current thread -const hipblasHandle_t& hipblas_get_handle() { - static thread_local std::map handles; - int device; - if (hipSuccess != hipGetDevice(&device)){ - throw std::runtime_error("hipGetDevice failed"); - } - std::map::iterator it; - if ((it = handles.find(device)) == handles.end()){ - hipblasHandle_t handle; - auto status = hipblasCreate(&handle); - if (HIPBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("hipblasCreate failed"); - } - auto [iterator, success] = handles.insert({device, handle}); - it = iterator; - } - return it->second; -} - -void hipblas_set_kernel_stream(hipStream_t stream) { - hipblasStatus_t status = hipblasSetStream(hipblas_get_handle(), stream); - if (HIPBLAS_STATUS_SUCCESS != status) { - throw std::runtime_error("hipblasSetStream failed"); - } -} - -#endif // TTG_HAVE_HIPBLAS - - -} // namespace \ No newline at end of file diff --git a/ttg/ttg/device/cublas_helper.h b/ttg/ttg/device/cublas_helper.h deleted file mode 100644 index 6ab106bfb..000000000 --- a/ttg/ttg/device/cublas_helper.h +++ /dev/null @@ -1,35 +0,0 @@ -#ifndef TTG_DEVICE_CUBLAS_HELPER_H -#define TTG_DEVICE_CUBLAS_HELPER_H - -#include "ttg/config.h" - -#ifdef TTG_HAVE_CUDART -#include - -namespace ttg::detail { - -/// \brief Returns the cuBLAS context used by this thread -const cublasHandle_t& cublas_get_handle(); - -/// associates the given CUDA stream with the cuBLAS context used by this thread 
-void cublas_set_kernel_stream(cudaStream_t stream); - -} // namespace ttg::detail -#endif // TTG_HAVE_CUDART - - -#ifdef TTG_HAVE_HIPBLAS -#include - -namespace ttg::detail { - -const hipblasHandle_t& hipblas_get_handle(); - -/// associates the given HIP stream with the hipblas context used by this thread -void hipblas_set_kernel_stream(hipStream_t stream); - -} // namespace ttg::detail -#endif // TTG_HAVE_HIPBLAS - - -#endif // TTG_DEVICE_CUBLAS_HELPER_H \ No newline at end of file diff --git a/ttg/ttg/parsec/ttg.h b/ttg/ttg/parsec/ttg.h index c942940b1..367d6c250 100644 --- a/ttg/ttg/parsec/ttg.h +++ b/ttg/ttg/parsec/ttg.h @@ -110,7 +110,6 @@ #include "ttg/parsec/thread_local.h" #include "ttg/parsec/ptr.h" #include "ttg/parsec/task.h" -#include "ttg/device/cublas_helper.h" #include "ttg/parsec/parsec-ext.h" #include "ttg/device/device.h" @@ -1324,15 +1323,6 @@ namespace ttg_parsec { assert(dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_TRANSFER || dev_data.state() == ttg::TTG_DEVICE_CORO_WAIT_KERNEL); -#if 0 -#if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDART) - { - parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; - ttg::detail::cublas_set_kernel_stream(cuda_stream->cuda_stream); - } -#endif // PARSEC_HAVE_DEV_CUDA_SUPPORT -#endif //0 - #if defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) { parsec_cuda_exec_stream_t *cuda_stream = (parsec_cuda_exec_stream_t *)gpu_stream; @@ -1341,15 +1331,6 @@ namespace ttg_parsec { } #endif // defined(PARSEC_HAVE_DEV_CUDA_SUPPORT) && defined(TTG_HAVE_CUDA) -#if 0 -#if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIPBLAS) - { - parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream; - ttg::detail::hipblas_set_kernel_stream(hip_stream->hip_stream); - } -#endif // PARSEC_HAVE_DEV_HIP_SUPPORT -#endif // 0 - #if defined(PARSEC_HAVE_DEV_HIP_SUPPORT) && defined(TTG_HAVE_HIP) { parsec_hip_exec_stream_t *hip_stream = (parsec_hip_exec_stream_t *)gpu_stream;
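With the TTG-internal helpers gone, stream handling lives entirely in examples/devblas_helper.h: handles are cached thread-locally per device and looked up through ttg::device::current_device() and ttg::device::current_stream(). A condensed sketch of such a cache follows (a sketch only: the function name and explicit parameters are illustrative, whether the real helpers key on the stream or rebind it at lookup is not visible in the diffs, and error handling is abbreviated):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <map>
#include <stdexcept>
#include <utility>

// One cuBLAS handle per (device, stream) pair, cached per thread.
// Assumption in this sketch: the stream is attached once when the handle
// is created, which is what makes a per-kernel set_kernel_stream() hook,
// like the one deleted above, unnecessary. The caller is expected to have
// made `device` the current CUDA device.
inline const cublasHandle_t& example_cublas_handle(int device, cudaStream_t stream) {
  using map_type = std::map<std::pair<int, cudaStream_t>, cublasHandle_t>;
  static thread_local map_type handles;
  auto it = handles.find(std::make_pair(device, stream));
  if (it == handles.end()) {
    cublasHandle_t handle;
    if (CUBLAS_STATUS_SUCCESS != cublasCreate(&handle))
      throw std::runtime_error("cublasCreate failed");
    cublasSetStream(handle, stream);
    it = handles.emplace(std::make_pair(device, stream), handle).first;
  }
  return it->second;
}

Keying the cache on the stream as well as the device is what removes the need for the per-call cublas_set_kernel_stream()/hipblas_set_kernel_stream() hooks deleted in this patch.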