From 28fea2e421b992aebf7485af44fd3b2d345feaf8 Mon Sep 17 00:00:00 2001
From: Mikael Simberg
Date: Wed, 21 Apr 2021 09:31:53 +0200
Subject: [PATCH 01/15] TEMP: Playing around with senders/receivers

---
 include/dlaf/blas/tile.h                    |  55 +++++++
 include/dlaf/communication/sync/broadcast.h |  98 ++++++++++++
 include/dlaf/matrix/matrix.h                |  22 +++
 include/dlaf/matrix/matrix_const.tpp        |   4 +
 include/dlaf/matrix/tile.tpp                |   2 +-
 include/dlaf/solver/triangular/impl.h       |  36 ++++-
 include/dlaf/transform.h                    | 161 ++++++++++++++++++++
 spack/packages/dla-future/package.py        |   3 +-
 test/include/dlaf_test/matrix/util_matrix.h |  10 ++
 test/unit/solver/test_triangular.cpp        |   5 +
 10 files changed, 390 insertions(+), 6 deletions(-)
 create mode 100644 include/dlaf/transform.h

diff --git a/include/dlaf/blas/tile.h b/include/dlaf/blas/tile.h
index e9d42447a0..95e574d143 100644
--- a/include/dlaf/blas/tile.h
+++ b/include/dlaf/blas/tile.h
@@ -11,7 +11,13 @@
 #include "blas.hh"
+#include
+
+#include
+#include
+
 #include "dlaf/common/callable_object.h"
+#include "dlaf/init.h"
 #include "dlaf/matrix/tile.h"
 #include "dlaf/types.h"
 #include "dlaf/util_blas.h"
@@ -39,6 +45,29 @@ void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha, const Tile
+// void gemm(const blas::Op op_a, const blas::Op op_b, const T alpha,
+//           hpx::experimental::detail::async_rw_mutex_access_wrapper<
+//               Tile, const Tile,
+//               hpx::experimental::detail::async_rw_mutex_access_type::read>,
+//           hpx::experimental::detail::async_rw_mutex_access_wrapper<
+//               Tile, const dlaf::matrix::Tile,
+//               hpx::experimental::detail::async_rw_mutex_access_type::read>,
+//           const T beta,
+//           hpx::experimental::detail::async_rw_mutex_access_wrapper<
+//               Tile, const Tile,
+//               hpx::experimental::detail::async_rw_mutex_access_type::readwrite>
+//               c) noexcept {
+//   // TODO: Should async_rw_mutex_access_wrapper have a get method?
+//   // auto s = tile::internal::getGemmSizes(op_a, op_b, a.get(), b.get(), c);
+//   // blas::gemm(blas::Layout::ColMajor, op_a, op_b, s.m, s.n, s.k, alpha, a.get().ptr(), a.get().ld(),
+//   //            b.get().ptr(), b.get().ld(), beta, c.ptr(), c.ld());
+// }
+
 /// Computes matrix matrix multiplication where matrix @p a is hermitian (symmetric if T is real).
 template
 void hemm(const blas::Side side, const blas::Uplo uplo, const T alpha,
@@ -76,6 +105,15 @@ void trsm(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const
            b.ld());
 }
 
+template <class T, Device device>
+void trsm(const blas::Side side, const blas::Uplo uplo, const blas::Op op, const blas::Diag diag,
+          const T alpha, std::reference_wrapper<const Tile<const T, device>> a,
+          const Tile<T, device>& b) noexcept {
+  auto s = tile::internal::getTrsmSizes(side, a.get(), b);
+  blas::trsm(blas::Layout::ColMajor, side, uplo, op, diag, s.m, s.n, alpha, a.get().ptr(), a.get().ld(),
+             b.ptr(), b.ld());
+}
+
 #ifdef DLAF_WITH_CUDA
 namespace internal {
 template
@@ -226,5 +264,22 @@ DLAF_MAKE_CALLABLE_OBJECT(her2k);
 DLAF_MAKE_CALLABLE_OBJECT(herk);
 DLAF_MAKE_CALLABLE_OBJECT(trsm);
+// TODO: Useful? Take only the predecessor sender, internally call transform.
+// template
+// decltype(auto) gemm(S&& s) {
+//   return transform(std::forward<S>(s), gemm_o);
+// }
+
+// TODO: Or? Automatically wrap and lift arguments in when_all.
+// template
+// decltype(auto) gemm(Ts&&... ts) {
+//   return transform(when_all_lift(std::forward<Ts>(ts)...), gemm_o);
+// }
+
+// TODO: Or eagerly submitted? Additionally call detach.
+// template
+// void gemm(Ts&&... ts) {
+//   ex::detach(transform(when_all_lift(std::forward<Ts>(ts)...), gemm_o));
+// }
 }
 }
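
// A minimal usage sketch of the eagerly-submitted variant discussed in the TODOs above, written the
// same way the call_LLT update later in this patch drives gemm_o: when_all_lift packs plain values
// and tile senders into one sender, transform (from include/dlaf/transform.h, added by this patch)
// applies the callable, and detach submits the work. The tile senders a_s, b_s and c_s are
// illustrative placeholders, not names from the patch:
//
//   namespace ex = hpx::execution::experimental;
//   ex::detach(transform(when_all_lift(blas::Op::NoTrans, blas::Op::NoTrans, T(1.0),
//                                      std::move(a_s), std::move(b_s), T(0.0), std::move(c_s)),
//                        tile::gemm_o));
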
diff --git a/include/dlaf/communication/sync/broadcast.h b/include/dlaf/communication/sync/broadcast.h
index 15c4d79bc8..3271a61699 100644
--- a/include/dlaf/communication/sync/broadcast.h
+++ b/include/dlaf/communication/sync/broadcast.h
@@ -25,6 +25,8 @@
 #include "dlaf/matrix/copy_tile.h"
 #include "dlaf/matrix/tile.h"
 
+#include
+
 namespace dlaf {
 namespace comm {
 namespace sync {
@@ -39,6 +41,7 @@ void send(Communicator& communicator, DataIn&& message_to_send) {
   using DataT = std::remove_const_t::element_t>;
   auto message = comm::make_message(std::move(data));
+  // std::cerr << "send with data = " << message.data() << " and size " << message.count() << "\n";
   DLAF_MPI_CALL(MPI_Bcast(const_cast<DataT*>(message.data()), message.count(), message.mpi_type(),
                           communicator.rank(), communicator));
 }
@@ -50,10 +53,105 @@
 template
 void receive_from(const int broadcaster_rank, Communicator& communicator, DataOut&& data) {
   DLAF_ASSERT_HEAVY(broadcaster_rank != communicator.rank(), broadcaster_rank, communicator.rank());
   auto message = comm::make_message(common::make_data(std::forward<DataOut>(data)));
+  // std::cerr << "receive with data = " << message.data() << " and size " << message.count() << "\n";
   DLAF_MPI_CALL(
       MPI_Bcast(message.data(), message.count(), message.mpi_type(), broadcaster_rank, communicator));
 }
 }
 }
+
+// TODO: These are here only temporarily. MPI handling has changed.
+
+/// Task for broadcasting (send endpoint) a Tile in a direction over a CommunicatorGrid
+template <class T, Device D>
+void sendTile(hpx::future<common::PromiseGuard<comm::CommunicatorGrid>> mpi_task_chain, Coord rc_comm,
+              hpx::shared_future<matrix::Tile<const T, D>> tile) {
+  using PromiseComm_t = common::PromiseGuard<comm::CommunicatorGrid>;
+
+  PromiseComm_t pcomm = mpi_task_chain.get();
+  comm::sync::broadcast::send(pcomm.ref().subCommunicator(rc_comm), tile.get());
+}
+
+template <class T, Device D>
+void sendTile(common::PromiseGuard<comm::CommunicatorGrid> pcomm, Coord rc_comm,
+              std::reference_wrapper<const matrix::Tile<const T, D>> tile) {
+  comm::sync::broadcast::send(pcomm.ref().subCommunicator(rc_comm), tile.get());
+}
+
+template <class T, Device D>
+void sendTile(hpx::experimental::async_rw_mutex<comm::CommunicatorGrid>::readwrite_access_type pcomm,
+              Coord rc_comm, std::reference_wrapper<const matrix::Tile<const T, D>> tile) {
+  comm::CommunicatorGrid& pcomm_ref = pcomm;
+  comm::sync::broadcast::send(pcomm_ref.subCommunicator(rc_comm), tile.get());
+}
+
+template <class T, Device D>
+void sendTile(hpx::experimental::async_rw_mutex<comm::CommunicatorGrid>::readwrite_access_type pcomm,
+              Coord rc_comm, matrix::Tile<const T, D> const& tile) {
+  comm::CommunicatorGrid& pcomm_ref = pcomm;
+  comm::sync::broadcast::send(pcomm_ref.subCommunicator(rc_comm), tile);
+}
+
+DLAF_MAKE_CALLABLE_OBJECT(sendTile);
+
+/// Task for broadcasting (receiving endpoint) a Tile in a direction over a CommunicatorGrid
+template <class T, Device D>
+void recvTile(hpx::future<common::PromiseGuard<comm::CommunicatorGrid>> mpi_task_chain, Coord rc_comm,
+              hpx::future<matrix::Tile<T, D>> tile, comm::IndexT_MPI rank) {
+  using PromiseComm_t = common::PromiseGuard<comm::CommunicatorGrid>;
+
+  PromiseComm_t pcomm = mpi_task_chain.get();
+  comm::sync::broadcast::receive_from(rank, pcomm.ref().subCommunicator(rc_comm), tile.get());
+}
+
+DLAF_MAKE_CALLABLE_OBJECT(recvTile);
+
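
// How the future-based overloads above are typically driven (a sketch only; grid and mat are
// illustrative variables, not names from this patch): a common::Pipeline<comm::CommunicatorGrid>
// serializes access to the grid, and dataflow passes the pipeline future plus a tile future
// straight through to sendTile/recvTile, which call .get() themselves.
//
//   common::Pipeline<comm::CommunicatorGrid> serial_comm(std::move(grid));
//   hpx::dataflow(sendTile_o, serial_comm(), Coord::Col, mat.read(LocalTileIndex{0, 0}));
//
// The async_rw_mutex-based overloads are intended to replace the PromiseGuard step in a pure
// sender/receiver chain.
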
+/// Task for broadcasting (receiving endpoint) a Tile ("JIT" allocation) in a direction over a CommunicatorGrid
+template <class T>
+matrix::Tile<const T, Device::CPU> recvAllocTile(
+    hpx::future<common::PromiseGuard<comm::CommunicatorGrid>> mpi_task_chain, Coord rc_comm,
+    TileElementSize tile_size, comm::IndexT_MPI rank) {
+  using ConstTile_t = matrix::Tile<const T, Device::CPU>;
+  using PromiseComm_t = common::PromiseGuard<comm::CommunicatorGrid>;
+  using MemView_t = memory::MemoryView<T, Device::CPU>;
+  using Tile_t = matrix::Tile<T, Device::CPU>;
+
+  PromiseComm_t pcomm = mpi_task_chain.get();
+  MemView_t mem_view(tile_size.linear_size());
+  Tile_t tile(tile_size, std::move(mem_view), tile_size.rows());
+  comm::sync::broadcast::receive_from(rank, pcomm.ref().subCommunicator(rc_comm), tile);
+  return ConstTile_t(std::move(tile));
+}
+
+/// Task for broadcasting (receiving endpoint) a Tile ("JIT" allocation) in a direction over a
+/// CommunicatorGrid
+template <class T>
+matrix::Tile<const T, Device::CPU> recvAllocTileSender(common::PromiseGuard<comm::CommunicatorGrid> pcomm,
+                                                       Coord rc_comm, TileElementSize tile_size,
+                                                       comm::IndexT_MPI rank) {
+  using ConstTile_t = matrix::Tile<const T, Device::CPU>;
+  using MemView_t = memory::MemoryView<T, Device::CPU>;
+  using Tile_t = matrix::Tile<T, Device::CPU>;
+
+  MemView_t mem_view(tile_size.linear_size());
+  Tile_t tile(tile_size, std::move(mem_view), tile_size.rows());
+  comm::sync::broadcast::receive_from(rank, pcomm.ref().subCommunicator(rc_comm), tile);
+  return ConstTile_t(std::move(tile));
+}
+
+template <class T>
+matrix::Tile<const T, Device::CPU> recvAllocTileSenderMutex(
+    hpx::experimental::async_rw_mutex<comm::CommunicatorGrid>::readwrite_access_type pcomm,
+    Coord rc_comm, TileElementSize tile_size, comm::IndexT_MPI rank) {
+  using ConstTile_t = matrix::Tile<const T, Device::CPU>;
+  using MemView_t = memory::MemoryView<T, Device::CPU>;
+  using Tile_t = matrix::Tile<T, Device::CPU>;
+
+  MemView_t mem_view(tile_size.linear_size());
+  Tile_t tile(tile_size, std::move(mem_view), tile_size.rows());
+  comm::CommunicatorGrid& pcomm_ref = pcomm;
+  comm::sync::broadcast::receive_from(rank, pcomm_ref.subCommunicator(rc_comm), tile);
+  return ConstTile_t(std::move(tile));
+}
 }
 }
diff --git a/include/dlaf/matrix/matrix.h b/include/dlaf/matrix/matrix.h
index a609d09089..47b4341c8f 100644
--- a/include/dlaf/matrix/matrix.h
+++ b/include/dlaf/matrix/matrix.h
@@ -15,6 +15,8 @@
 #include
 
+#include
+
 #include "dlaf/communication/communicator_grid.h"
 #include "dlaf/matrix/distribution.h"
 #include "dlaf/matrix/internal/tile_future_manager.h"
@@ -128,12 +130,22 @@ class Matrix : public Matrix {
     return operator()(this->distribution().localTileIndex(index));
   }
 
+  auto readwrite_sender(const LocalTileIndex& index) noexcept {
+    std::size_t i = to_sizet(tileLinearIndex(index));
+    return tile_rw_mutexes_[i].readwrite();
+  }
+
+  auto readwrite_sender(const GlobalTileIndex& index) {
+    return readwrite_sender(this->distribution().localTileIndex(index));
+  }
+
 protected:
   using Matrix::tileLinearIndex;
 
 private:
   using Matrix::setUpTiles;
   using Matrix::tile_managers_;
+  using Matrix::tile_rw_mutexes_;
 };
 
 template
@@ -175,6 +187,15 @@ class Matrix : public internal::MatrixBase {
     return read(distribution().localTileIndex(index));
   }
 
+  auto read_sender(const LocalTileIndex& index) noexcept {
+    std::size_t i = to_sizet(tileLinearIndex(index));
+    return tile_rw_mutexes_[i].read();
+  }
+
+  auto read_sender(const GlobalTileIndex& index) {
+    return read_sender(distribution().localTileIndex(index));
+  }
+
   /// Synchronization barrier for all local tiles in the matrix
   ///
   /// This blocking call does not return until all operations, i.e. both RO and RW,
@@ -187,6 +208,7 @@ class Matrix : public internal::MatrixBase {
   void setUpTiles(const memory::MemoryView& mem, const LayoutInfo& layout) noexcept;
 
   std::vector> tile_managers_;
+  std::vector<hpx::experimental::async_rw_mutex<Tile<T, device>, Tile<const T, device>>> tile_rw_mutexes_;
 };
 
 // Note: the templates of the following helper functions are inverted w.r.t. the Matrix templates
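
// A sketch of what the new read_sender/readwrite_sender accessors are for (the "pure sender
// version" described in the solver changes below): the async_rw_mutex senders replace both the
// shared_future/future pair and the keep_future calls, e.g. for the trailing-matrix gemm update.
// Indices follow call_LLT; when_all_lift and transform come from include/dlaf/transform.h, and the
// values delivered to gemm_o are the async_rw_mutex access wrappers:
//
//   namespace ex = hpx::execution::experimental;
//   ex::detach(transform(when_all_lift(op, blas::Op::NoTrans, beta,
//                                      mat_a.read_sender(LocalTileIndex{k, i}),
//                                      mat_b.read_sender(kj), T(1.0),
//                                      mat_b.readwrite_sender(LocalTileIndex{i, j})),
//                        tile::gemm_o));
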
diff --git a/include/dlaf/matrix/matrix_const.tpp b/include/dlaf/matrix/matrix_const.tpp
index 11ef1cb452..4a271ec7a5 100644
--- a/include/dlaf/matrix/matrix_const.tpp
+++ b/include/dlaf/matrix/matrix_const.tpp
@@ -70,6 +70,10 @@ void Matrix::setUpTiles(const memory::MemoryView
diff --git a/include/dlaf/matrix/tile.tpp b/include/dlaf/matrix/tile.tpp
 Tile::Tile(Tile&& rhs) noexcept
 template
 Tile::~Tile() {
   if (p_) {
-    if (std::uncaught_exception())
+    if (std::uncaught_exceptions() > 0)
       p_->set_exception(std::make_exception_ptr(ContinuationException{}));
     else
       p_->set_value(Tile(size_, std::move(memory_view_), ld_));
diff --git a/include/dlaf/solver/triangular/impl.h b/include/dlaf/solver/triangular/impl.h
index 046a0b0fc5..bffa2c2ee8 100644
--- a/include/dlaf/solver/triangular/impl.h
+++ b/include/dlaf/solver/triangular/impl.h
@@ -11,6 +11,13 @@
 #include
 
+// TODO: Clean up unneeded includes
+#include
+#include
+#include
+#include
+#include
+
 #include "dlaf/blas/tile.h"
 #include "dlaf/common/index2d.h"
 #include "dlaf/common/pipeline.h"
@@ -24,6 +31,7 @@
 #include "dlaf/matrix/distribution.h"
 #include "dlaf/matrix/matrix.h"
 #include "dlaf/solver/triangular/api.h"
+#include "dlaf/transform.h"
 #include "dlaf/util_matrix.h"
 
 namespace dlaf {
@@ -82,6 +90,8 @@ void Triangular::call_LLN(blas::Diag diag, T alpha, Matrix
 void Triangular::call_LLT(blas::Op op, blas::Diag diag, T alpha, Matrix& mat_a,
                           Matrix& mat_b) {
+  namespace ex = hpx::execution::experimental;
+
   constexpr auto Left = blas::Side::Left;
   constexpr auto Lower = blas::Uplo::Lower;
   constexpr auto NoTrans = blas::Op::NoTrans;
@@ -105,9 +115,29 @@ void Triangular::call_LLT(blas::Op op, blas::Diag diag, T al
        auto beta = static_cast<T>(-1.0) / alpha;

        // Update trailing matrix
-       hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), op, NoTrans, beta,
-                     mat_a.read(LocalTileIndex{k, i}), mat_b.read(kj), T(1.0),
-                     mat_b(LocalTileIndex{i, j}));
+
+       // Futures as senders version:
+       ex::detach(
+           transform(when_all_lift(op, blas::Op::NoTrans, beta,
+                                   // TODO: Add read_sender for use in sender algorithms?
+                                   ex::keep_future(mat_a.read(LocalTileIndex{k, i})),
+                                   ex::keep_future(mat_b.read(kj)), T(1.0),
+                                   // mat_a.read(LocalTileIndex{i, j}),
+                                   // mat_b.read(kj), T(1.0),
+                                   mat_b(LocalTileIndex{i, j})),
+                     tile::gemm_o));
+       // Alternative:
+       // tile::gemm in place of tile::transform
+
+       // Pure sender version (with async_rw_mutex):
+       // - add versions of read and operator() to Matrix which return
+       //   async_rw_mutex senders
+       // - remove the keep_future calls
+
+       // Original dataflow/future version
+       // hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), op, NoTrans, beta,
+       //               mat_a.read(LocalTileIndex{k, i}), mat_b.read(kj), T(1.0),
+       //               mat_b(LocalTileIndex{i, j}));
      }
    }
  }
diff --git a/include/dlaf/transform.h b/include/dlaf/transform.h
new file mode 100644
index 0000000000..b114687a2e
--- /dev/null
+++ b/include/dlaf/transform.h
@@ -0,0 +1,161 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2020-2021, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include
+
+#include "dlaf/init.h"
+#include "dlaf/types.h"
+
+#ifdef DLAF_WITH_CUDA
+// TODO: Only for the pools, not the executors
+#include "dlaf/cublas/executor.h"
+#include "dlaf/cuda/executor.h"
+#endif
+
+namespace dlaf {
+// TODO: Upstream. In what form? execution::dataflow equivalent to
+// when_all_lift | on | transform?
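
// As a concrete reference for the TODO above, the mapping used in this patch (see
// solver/triangular/impl.h) replaces
//
//   hpx::dataflow(executor, matrix::unwrapExtendTiles(tile::gemm_o), args...);
//
// with
//
//   namespace ex = hpx::execution::experimental;
//   ex::detach(transform(when_all_lift(args...), tile::gemm_o));
//
// where when_all_lift (below) wraps non-sender arguments in just() and combines everything with
// when_all, and transform dispatches to a backend-specific scheduler/executor before applying the
// unwrapped callable.
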
+template <typename S,
+          typename Enable = std::enable_if_t<hpx::execution::experimental::is_sender<std::decay_t<S>>::value>>
+decltype(auto) lift_non_senders(S&& s) {
+  return std::forward<S>(s);
+}
+
+template <typename S,
+          typename Enable = std::enable_if_t<!hpx::execution::experimental::is_sender<std::decay_t<S>>::value>>
+auto lift_non_senders(S&& s) {
+  return hpx::execution::experimental::just(std::forward<S>(s));
+}
+
+template <typename... Ts>
+auto when_all_lift(Ts&&... ts) {
+  return hpx::execution::experimental::when_all(lift_non_senders(std::forward<Ts>(ts))...);
+}
+namespace internal {
+// DLAF-specific transform, templated on a backend. This, together with
+// when_all, takes the place of dataflow(executor, ...)
+
+// TODO: Priorities. Do backends need to be types (such that priorities can be
+// attached to them)? Should tile algorithms take backend-specific execution
+// policies? Should we go back to customizing hpx::execution::transform based on
+// a via/on predecessor sender?
+template <Backend B>
+struct transform;
+
+// For Backend::MC we use the regular thread pool scheduler from HPX.
+template <>
+struct transform<Backend::MC> {
+  template <typename S, typename F>
+  static auto call(S&& s, F&& f) {
+    namespace ex = hpx::execution::experimental;
+    return ex::transform(ex::on(std::forward<S>(s), ex::executor{}),
+                         hpx::util::unwrapping(std::forward<F>(f)));
+  }
+};
+
+#ifdef DLAF_WITH_CUDA
+// For Backend::GPU we use a custom sender. This currently handles CUDA stream
+// and cuBLAS handle functions.
+template <>
+struct transform<Backend::GPU> {
+  template <typename S, typename F>
+  struct gpu_transform_sender {
+    std::decay_t<S> s;
+    std::decay_t<F> f;
+
+    // TODO: Non-void functions
+    template