diff --git a/ci/docker/codecov/build.Dockerfile b/ci/docker/codecov/build.Dockerfile
index 8630e08352..6af11fb6de 100644
--- a/ci/docker/codecov/build.Dockerfile
+++ b/ci/docker/codecov/build.Dockerfile
@@ -79,8 +79,8 @@ RUN wget -q https://github.com/gperftools/gperftools/releases/download/gperftool
     rm -rf /root/gperftools.tar.gz /root/gperftools-${GPERFTOOLS_VERSION}
 
 # Install HPX
-ARG HPX_FORK=STEllAR-GROUP
-ARG HPX_VERSION=1.6.0
+ARG HPX_FORK=msimberg
+ARG HPX_VERSION=cuda-event-callback
 ARG HPX_WITH_CUDA=OFF
 ARG HPX_PATH=/usr/local/hpx
 RUN wget -q https://github.com/${HPX_FORK}/hpx/archive/${HPX_VERSION}.tar.gz -O hpx.tar.gz && \
diff --git a/ci/docker/release/build.Dockerfile b/ci/docker/release/build.Dockerfile
index 93fad06d30..8adff004e2 100644
--- a/ci/docker/release/build.Dockerfile
+++ b/ci/docker/release/build.Dockerfile
@@ -79,8 +79,8 @@ RUN wget -q https://github.com/gperftools/gperftools/releases/download/gperftool
     rm -rf /root/gperftools.tar.gz /root/gperftools-${GPERFTOOLS_VERSION}
 
 # Install HPX
-ARG HPX_FORK=STEllAR-GROUP
-ARG HPX_VERSION=1.6.0
+ARG HPX_FORK=msimberg
+ARG HPX_VERSION=cuda-event-callback
 ARG HPX_WITH_CUDA=OFF
 ARG HPX_PATH=/usr/local/hpx
 RUN wget -q https://github.com/${HPX_FORK}/hpx/archive/${HPX_VERSION}.tar.gz -O hpx.tar.gz && \
diff --git a/include/dlaf/cublas/executor.h b/include/dlaf/cublas/executor.h
index 2808bc3940..37e83c3981 100644
--- a/include/dlaf/cublas/executor.h
+++ b/include/dlaf/cublas/executor.h
@@ -31,51 +31,13 @@
 
 #include "dlaf/common/assert.h"
 #include "dlaf/cublas/error.h"
+#include "dlaf/cublas/handle_pool.h"
 #include "dlaf/cuda/error.h"
 #include "dlaf/cuda/executor.h"
 
 namespace dlaf {
 namespace cublas {
 namespace internal {
-class HandlePoolImpl {
-  int device_;
-  std::size_t num_worker_threads_ = hpx::get_num_worker_threads();
-  std::vector<cublasHandle_t> handles_;
-  cublasPointerMode_t ptr_mode_;
-
-public:
-  HandlePoolImpl(int device, cublasPointerMode_t ptr_mode)
-      : device_(device), handles_(num_worker_threads_), ptr_mode_(ptr_mode) {
-    DLAF_CUDA_CALL(cudaSetDevice(device_));
-
-    for (auto& h : handles_) {
-      DLAF_CUBLAS_CALL(cublasCreate(&h));
-    }
-  }
-
-  HandlePoolImpl& operator=(HandlePoolImpl&&) = default;
-  HandlePoolImpl(HandlePoolImpl&&) = default;
-  HandlePoolImpl(const HandlePoolImpl&) = delete;
-  HandlePoolImpl& operator=(const HandlePoolImpl&) = delete;
-
-  ~HandlePoolImpl() {
-    for (auto& h : handles_) {
-      DLAF_CUBLAS_CALL(cublasDestroy(h));
-    }
-  }
-
-  cublasHandle_t getNextHandle(cudaStream_t stream) {
-    cublasHandle_t handle = handles_[hpx::get_worker_thread_num()];
-    DLAF_CUDA_CALL(cudaSetDevice(device_));
-    DLAF_CUBLAS_CALL(cublasSetStream(handle, stream));
-    DLAF_CUBLAS_CALL(cublasSetPointerMode(handle, ptr_mode_));
-    return handle;
-  }
-
-  int getDevice() {
-    return device_;
-  }
-};
 
 template <bool IsCallable, typename F, typename... Ts>
 struct isAsyncCublasCallableImpl : std::false_type {
@@ -99,37 +61,6 @@ struct isDataflowCublasCallable
                                                 std::declval<Futures>()))> {};
 }
 
-/// A pool of cuBLAS handles with reference semantics (copying points to the
-/// same underlying cuBLAS handles, last reference destroys the references).
-/// Allows access to cuBLAS handles associated with a particular stream. The
-/// user must ensure that the handle pool and the stream use the same device.
-/// Each HPX worker thread is assigned thread local cuBLAS handle.
-class HandlePool {
-  std::shared_ptr<internal::HandlePoolImpl> handles_ptr_;
-
-public:
-  HandlePool(int device = 0, cublasPointerMode_t ptr_mode = CUBLAS_POINTER_MODE_HOST)
-      : handles_ptr_(std::make_shared<internal::HandlePoolImpl>(device, ptr_mode)) {}
-
-  cublasHandle_t getNextHandle(cudaStream_t stream) {
-    DLAF_ASSERT(bool(handles_ptr_), "");
-    return handles_ptr_->getNextHandle(stream);
-  }
-
-  int getDevice() {
-    DLAF_ASSERT(bool(handles_ptr_), "");
-    return handles_ptr_->getDevice();
-  }
-
-  bool operator==(HandlePool const& rhs) const noexcept {
-    return handles_ptr_ == rhs.handles_ptr_;
-  }
-
-  bool operator!=(HandlePool const& rhs) const noexcept {
-    return !(*this == rhs);
-  }
-};
-
 /// An executor for cuBLAS functions. Uses handles and streams from the given
 /// HandlePool and StreamPool. A cuBLAS function is defined as any function that
 /// takes a cuBLAS handle as the first argument. The executor inserts a cuBLAS
diff --git a/include/dlaf/cublas/handle_pool.h b/include/dlaf/cublas/handle_pool.h
new file mode 100644
index 0000000000..4a666632b8
--- /dev/null
+++ b/include/dlaf/cublas/handle_pool.h
@@ -0,0 +1,136 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2021, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#pragma once
+
+/// @file
+
+#ifdef DLAF_WITH_CUDA
+
+#include <cstddef>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+
+#include <hpx/local/runtime.hpp>
+
+#include "dlaf/common/assert.h"
+#include "dlaf/cublas/error.h"
+#include "dlaf/cuda/error.h"
+#include "dlaf/cuda/executor.h"
+
+namespace dlaf {
+namespace cublas {
+namespace internal {
+/// Owns one cuBLAS handle per HPX worker thread for a single CUDA device.
+///
+/// The handles are created in the constructor and destroyed in the destructor.
+class HandlePoolImpl {
+  int device_;
+  std::size_t num_worker_threads_ = hpx::get_num_worker_threads();
+  std::vector<cublasHandle_t> handles_;
+  cublasPointerMode_t ptr_mode_;
+
+public:
+  /// Makes @p device current and creates one cuBLAS handle per HPX worker thread.
+  ///
+  /// @param device CUDA device id made current before the handles are created.
+  /// @param ptr_mode pointer mode applied to a handle every time it is handed out.
+  HandlePoolImpl(int device, cublasPointerMode_t ptr_mode)
+      : device_(device), handles_(num_worker_threads_), ptr_mode_(ptr_mode) {
+    DLAF_CUDA_CALL(cudaSetDevice(device_));
+
+    for (auto& h : handles_) {
+      DLAF_CUBLAS_CALL(cublasCreate(&h));
+    }
+  }
+
+  // NOTE(review): the defaulted move assignment replaces handles_ without
+  // destroying the handles it previously held; confirm it is never used on a
+  // populated pool (in this header the pool is only held via std::shared_ptr).
+  HandlePoolImpl& operator=(HandlePoolImpl&&) = default;
+  HandlePoolImpl(HandlePoolImpl&&) = default;
+  HandlePoolImpl(const HandlePoolImpl&) = delete;
+  HandlePoolImpl& operator=(const HandlePoolImpl&) = delete;
+
+  /// Destroys every cuBLAS handle owned by the pool.
+  ~HandlePoolImpl() {
+    for (auto& h : handles_) {
+      DLAF_CUBLAS_CALL(cublasDestroy(h));
+    }
+  }
+
+  /// Returns the handle of the calling HPX worker thread, bound to @p stream.
+  ///
+  /// Makes the pool's device current and sets the stream and the pointer mode
+  /// on the handle before returning it.
+  ///
+  /// @pre must be called from an HPX worker thread.
+  cublasHandle_t getNextHandle(cudaStream_t stream) {
+    // hpx::get_worker_thread_num() reports std::size_t(-1) when the caller is
+    // not an HPX worker thread; indexing handles_ with it would be out of
+    // bounds, so check it the same way cuda::StreamPool does.
+    const std::size_t worker_thread_num = hpx::get_worker_thread_num();
+    DLAF_ASSERT(worker_thread_num != std::size_t(-1), worker_thread_num);
+    cublasHandle_t handle = handles_[worker_thread_num];
+    DLAF_CUDA_CALL(cudaSetDevice(device_));
+    DLAF_CUBLAS_CALL(cublasSetStream(handle, stream));
+    DLAF_CUBLAS_CALL(cublasSetPointerMode(handle, ptr_mode_));
+    return handle;
+  }
+
+  /// Returns the CUDA device id the handles were created for.
+  int getDevice() {
+    return device_;
+  }
+};
+}
+
+/// A pool of cuBLAS handles with reference semantics (copying points to the
+/// same underlying cuBLAS handles, last reference destroys the handles).
+/// Allows access to cuBLAS handles associated with a particular stream. The
+/// user must ensure that the handle pool and the stream use the same device.
+/// Each HPX worker thread is assigned a thread local cuBLAS handle.
+class HandlePool {
+  std::shared_ptr<internal::HandlePoolImpl> handles_ptr_;
+
+public:
+  /// Creates a new set of handles on @p device using pointer mode @p ptr_mode.
+  HandlePool(int device = 0, cublasPointerMode_t ptr_mode = CUBLAS_POINTER_MODE_HOST)
+      : handles_ptr_(std::make_shared<internal::HandlePoolImpl>(device, ptr_mode)) {}
+
+  /// Returns the calling worker thread's handle after binding it to @p stream.
+  cublasHandle_t getNextHandle(cudaStream_t stream) {
+    DLAF_ASSERT(bool(handles_ptr_), "");
+    return handles_ptr_->getNextHandle(stream);
+  }
+
+  /// Returns the CUDA device id of the underlying handles.
+  int getDevice() {
+    DLAF_ASSERT(bool(handles_ptr_), "");
+    return handles_ptr_->getDevice();
+  }
+
+  /// Two pools compare equal only if they share the same underlying handles.
+  bool operator==(HandlePool const& rhs) const noexcept {
+    return handles_ptr_ == rhs.handles_ptr_;
+  }
+
+  bool operator!=(HandlePool const& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+};
+}
+}
+
+#endif
diff --git a/include/dlaf/cuda/executor.h b/include/dlaf/cuda/executor.h
index b8952614d8..66333ef65a 100644
--- a/include/dlaf/cuda/executor.h
+++ b/include/dlaf/cuda/executor.h
@@ -24,111 +24,15 @@
 #include <hpx/functional.hpp>
 #include <hpx/future.hpp>
 #include <hpx/include/util.hpp>
+#include <hpx/modules/async_cuda.hpp>
 #include <hpx/tuple.hpp>
 
 #include "dlaf/common/assert.h"
 #include "dlaf/cuda/error.h"
+#include "dlaf/cuda/stream_pool.h"
 
 namespace dlaf {
 namespace cuda {
-namespace internal {
-
-struct StreamPoolImpl {
-  int device_;
-  std::size_t num_worker_threads_ = hpx::get_num_worker_threads();
-  std::size_t num_streams_per_worker_thread_;
-  std::vector<cudaStream_t> streams_;
-  std::vector<hpx::util::cache_aligned_data<std::size_t>> current_stream_idxs_;
-
-  StreamPoolImpl(int device, std::size_t num_streams_per_worker_thread,
-                 hpx::threads::thread_priority hpx_thread_priority)
-      : device_(device), num_streams_per_worker_thread_(num_streams_per_worker_thread),
-        streams_(num_worker_threads_ * num_streams_per_worker_thread),
-        current_stream_idxs_(num_worker_threads_, {std::size_t(0)}) {
-    DLAF_CUDA_CALL(cudaSetDevice(device));
-
-    // We map hpx::threads::thread_priority::high to the highest CUDA stream
-    // priority, and the rest to the lowest. Typically CUDA streams will only
-    // have two priorities.
-    int least_priority, greatest_priority;
-    DLAF_CUDA_CALL(cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
-    int stream_priority = least_priority;
-    if (hpx_thread_priority == hpx::threads::thread_priority::high) {
-      stream_priority = greatest_priority;
-    }
-
-    for (auto& s : streams_) {
-      DLAF_CUDA_CALL(cudaStreamCreateWithPriority(&s, cudaStreamNonBlocking, stream_priority));
-    }
-  }
-
-  StreamPoolImpl& operator=(StreamPoolImpl&&) = default;
-  StreamPoolImpl(StreamPoolImpl&&) = default;
-  StreamPoolImpl(const StreamPoolImpl&) = delete;
-  StreamPoolImpl& operator=(const StreamPoolImpl&) = delete;
-
-  ~StreamPoolImpl() {
-    for (auto& s : streams_) {
-      DLAF_CUDA_CALL(cudaStreamDestroy(s));
-    }
-  }
-
-  cudaStream_t getNextStream() {
-    // Set the device corresponding to the CUBLAS handle.
-    //
-    // The CUBLAS library context is tied to the current CUDA device [1]. A previous task scheduled on
-    // the same thread may have set a different device, this makes sure the correct device is used. The
-    // function is considered very low overhead call [2].
-    //
-    // [1]: https://docs.nvidia.com/cuda/cublas/index.html#cublascreate
-    // [2]: CUDA Runtime API, section 5.1 Device Management
-    DLAF_CUDA_CALL(cudaSetDevice(device_));
-    const std::size_t worker_thread_num = hpx::get_worker_thread_num();
-    DLAF_ASSERT(worker_thread_num != std::size_t(-1), worker_thread_num);
-    std::size_t stream_idx =
-        worker_thread_num * num_streams_per_worker_thread_ +
-        (++current_stream_idxs_[worker_thread_num].data_ % num_streams_per_worker_thread_);
-
-    return streams_[stream_idx];
-  }
-
-  int getDevice() {
-    return device_;
-  }
-};
-}
-
-/// A pool of CUDA streams with reference semantics (copying points to the same
-/// underlying CUDA streams, last reference destroys the references).  Allows
-/// access to CUDA streams in a round-robin fashion.  Each HPX worker thread is
-/// assigned a set of thread local CUDA streams.
-class StreamPool {
-  std::shared_ptr<internal::StreamPoolImpl> streams_ptr_;
-
-public:
-  StreamPool(int device = 0, std::size_t num_streams_per_worker_thread = 3,
-             hpx::threads::thread_priority hpx_thread_priority = hpx::threads::thread_priority::default_)
-      : streams_ptr_(std::make_shared<internal::StreamPoolImpl>(device, num_streams_per_worker_thread,
-                                                                hpx_thread_priority)) {}
-
-  cudaStream_t getNextStream() {
-    DLAF_ASSERT(bool(streams_ptr_), "");
-    return streams_ptr_->getNextStream();
-  }
-
-  int getDevice() {
-    DLAF_ASSERT(bool(streams_ptr_), "");
-    return streams_ptr_->getDevice();
-  }
-
-  bool operator==(StreamPool const& rhs) const noexcept {
-    return streams_ptr_ == rhs.streams_ptr_;
-  }
-
-  bool operator!=(StreamPool const& rhs) const noexcept {
-    return !(*this == rhs);
-  }
-};
 
 /// An executor for CUDA functions. Uses streams from the given StreamPool. A
 /// CUDA function is defined as any function that takes a CUDA stream as the
diff --git a/include/dlaf/cuda/stream_pool.h b/include/dlaf/cuda/stream_pool.h
new file mode 100644
index 0000000000..65e3d7c124
--- /dev/null
+++ b/include/dlaf/cuda/stream_pool.h
@@ -0,0 +1,160 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2021, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#pragma once
+
+/// @file
+
+#ifdef DLAF_WITH_CUDA
+
+#include <cstddef>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <cuda_runtime.h>
+
+#include <hpx/include/util.hpp>
+#include <hpx/local/runtime.hpp>
+#include <hpx/thread.hpp>
+
+#include "dlaf/common/assert.h"
+#include "dlaf/cuda/error.h"
+
+namespace dlaf {
+namespace cuda {
+namespace internal {
+
+/// Owns a fixed number of CUDA streams per HPX worker thread for a single
+/// CUDA device and hands them out round-robin within each worker thread.
+struct StreamPoolImpl {
+  int device_;
+  std::size_t num_worker_threads_ = hpx::get_num_worker_threads();
+  std::size_t num_streams_per_worker_thread_;
+  std::vector<cudaStream_t> streams_;
+  // One round-robin counter per worker thread (indexed by worker thread number).
+  std::vector<hpx::util::cache_aligned_data<std::size_t>> current_stream_idxs_;
+
+  /// Makes @p device current and creates the streams.
+  ///
+  /// @param device CUDA device id made current before the streams are created.
+  /// @param num_streams_per_worker_thread number of streams created for each HPX
+  ///        worker thread; must be positive.
+  /// @param hpx_thread_priority thread_priority::high selects the greatest CUDA
+  ///        stream priority, any other value the least.
+  StreamPoolImpl(int device, std::size_t num_streams_per_worker_thread,
+                 hpx::threads::thread_priority hpx_thread_priority)
+      : device_(device), num_streams_per_worker_thread_(num_streams_per_worker_thread),
+        streams_(num_worker_threads_ * num_streams_per_worker_thread),
+        current_stream_idxs_(num_worker_threads_, {std::size_t(0)}) {
+    // getNextStream() takes a remainder modulo the number of streams per worker
+    // thread, which is undefined for zero.
+    DLAF_ASSERT(num_streams_per_worker_thread > 0, num_streams_per_worker_thread);
+
+    DLAF_CUDA_CALL(cudaSetDevice(device));
+
+    // We map hpx::threads::thread_priority::high to the highest CUDA stream
+    // priority, and the rest to the lowest. Typically CUDA streams will only
+    // have two priorities.
+    int least_priority, greatest_priority;
+    DLAF_CUDA_CALL(cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority));
+    int stream_priority = least_priority;
+    if (hpx_thread_priority == hpx::threads::thread_priority::high) {
+      stream_priority = greatest_priority;
+    }
+
+    for (auto& s : streams_) {
+      DLAF_CUDA_CALL(cudaStreamCreateWithPriority(&s, cudaStreamNonBlocking, stream_priority));
+    }
+  }
+
+  // NOTE(review): the defaulted move assignment replaces streams_ without
+  // destroying the streams it previously held; confirm it is never used on a
+  // populated pool (in this header the pool is only held via std::shared_ptr).
+  StreamPoolImpl& operator=(StreamPoolImpl&&) = default;
+  StreamPoolImpl(StreamPoolImpl&&) = default;
+  StreamPoolImpl(const StreamPoolImpl&) = delete;
+  StreamPoolImpl& operator=(const StreamPoolImpl&) = delete;
+
+  /// Destroys every CUDA stream owned by the pool.
+  ~StreamPoolImpl() {
+    for (auto& s : streams_) {
+      DLAF_CUDA_CALL(cudaStreamDestroy(s));
+    }
+  }
+
+  /// Returns the next stream of the calling HPX worker thread (round-robin).
+  ///
+  /// @pre must be called from an HPX worker thread.
+  cudaStream_t getNextStream() {
+    // Set the device corresponding to the streams of this pool.
+    //
+    // The CUBLAS library context is tied to the current CUDA device [1]. A previous task scheduled on
+    // the same thread may have set a different device, this makes sure the correct device is used. The
+    // function is considered very low overhead call [2].
+    //
+    // [1]: https://docs.nvidia.com/cuda/cublas/index.html#cublascreate
+    // [2]: CUDA Runtime API, section 5.1 Device Management
+    DLAF_CUDA_CALL(cudaSetDevice(device_));
+    const std::size_t worker_thread_num = hpx::get_worker_thread_num();
+    DLAF_ASSERT(worker_thread_num != std::size_t(-1), worker_thread_num);
+    std::size_t stream_idx =
+        worker_thread_num * num_streams_per_worker_thread_ +
+        (++current_stream_idxs_[worker_thread_num].data_ % num_streams_per_worker_thread_);
+
+    return streams_[stream_idx];
+  }
+
+  /// Returns the CUDA device id the streams were created for.
+  int getDevice() {
+    return device_;
+  }
+};
+}
+
+/// A pool of CUDA streams with reference semantics (copying points to the same
+/// underlying CUDA streams, last reference destroys the streams).  Allows
+/// access to CUDA streams in a round-robin fashion.  Each HPX worker thread is
+/// assigned a set of thread local CUDA streams.
+class StreamPool {
+  std::shared_ptr<internal::StreamPoolImpl> streams_ptr_;
+
+public:
+  /// Creates @p num_streams_per_worker_thread streams per HPX worker thread on @p device.
+  StreamPool(int device = 0, std::size_t num_streams_per_worker_thread = 3,
+             hpx::threads::thread_priority hpx_thread_priority = hpx::threads::thread_priority::default_)
+      : streams_ptr_(std::make_shared<internal::StreamPoolImpl>(device, num_streams_per_worker_thread,
+                                                                hpx_thread_priority)) {}
+
+  /// Returns the next stream assigned to the calling HPX worker thread.
+  cudaStream_t getNextStream() {
+    DLAF_ASSERT(bool(streams_ptr_), "");
+    return streams_ptr_->getNextStream();
+  }
+
+  /// Returns the CUDA device id of the underlying streams.
+  int getDevice() {
+    DLAF_ASSERT(bool(streams_ptr_), "");
+    return streams_ptr_->getDevice();
+  }
+
+  /// Two pools compare equal only if they share the same underlying streams.
+  bool operator==(StreamPool const& rhs) const noexcept {
+    return streams_ptr_ == rhs.streams_ptr_;
+  }
+
+  bool operator!=(StreamPool const& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+};
+}
+}
+
+#endif
diff --git a/include/dlaf/cusolver/executor.h b/include/dlaf/cusolver/executor.h
index 0b61d9e531..08f3958b3d 100644
--- a/include/dlaf/cusolver/executor.h
+++ b/include/dlaf/cusolver/executor.h
@@ -25,55 +25,19 @@
 #include <hpx/functional.hpp>
 #include <hpx/future.hpp>
 #include <hpx/modules/async_cuda.hpp>
-#include <hpx/mutex.hpp>
 #include <hpx/tuple.hpp>
 #include <hpx/type_traits.hpp>
 
 #include "dlaf/common/assert.h"
 #include "dlaf/cublas/executor.h"
+#include "dlaf/cublas/handle_pool.h"
 #include "dlaf/cuda/error.h"
 #include "dlaf/cusolver/error.h"
+#include "dlaf/cusolver/handle_pool.h"
 
 namespace dlaf {
 namespace cusolver {
 namespace internal {
-class HandlePoolImpl {
-  int device_;
-  std::size_t num_worker_threads_ = hpx::get_num_worker_threads();
-  std::vector<cusolverDnHandle_t> handles_;
-
-public:
-  HandlePoolImpl(int device) : device_(device), handles_(num_worker_threads_) {
-    DLAF_CUDA_CALL(cudaSetDevice(device_));
-
-    for (auto& h : handles_) {
-      DLAF_CUSOLVER_CALL(cusolverDnCreate(&h));
-    }
-  }
-
-  HandlePoolImpl& operator=(HandlePoolImpl&&) = default;
-  HandlePoolImpl(HandlePoolImpl&&) = default;
-  HandlePoolImpl(const HandlePoolImpl&) = delete;
-  HandlePoolImpl& operator=(const HandlePoolImpl&) = delete;
-
-  ~HandlePoolImpl() {
-    for (auto& h : handles_) {
-      DLAF_CUSOLVER_CALL(cusolverDnDestroy(h));
-    }
-  }
-
-  cusolverDnHandle_t getNextHandle(cudaStream_t stream) {
-    cusolverDnHandle_t handle = handles_[hpx::get_worker_thread_num()];
-    DLAF_CUDA_CALL(cudaSetDevice(device_));
-    DLAF_CUSOLVER_CALL(cusolverDnSetStream(handle, stream));
-    return handle;
-  }
-
-  int getDevice() {
-    return device_;
-  }
-};
-
 template <bool IsCallable, typename F, typename... Ts>
 struct isAsyncCusolverCallableImpl : std::false_type {
   struct dummy_type {};
@@ -96,36 +60,6 @@ struct isDataflowCusolverCallable
                                                 std::declval<Futures>()))> {};
 }
 
-/// A pool of cuSOLVER handles with reference semantics (copying points to the
-/// same underlying cuSOLVER handles, last reference destroys the references).
-/// Allows access to cuSOLVER handles associated with a particular stream. The
-/// user must ensure that the handle pool and the stream use the same device.
-/// Each HPX worker thread is assigned thread local cuSOLVER handle.
-class HandlePool {
-  std::shared_ptr<internal::HandlePoolImpl> handles_ptr_;
-
-public:
-  HandlePool(int device = 0) : handles_ptr_(std::make_shared<internal::HandlePoolImpl>(device)) {}
-
-  cusolverDnHandle_t getNextHandle(cudaStream_t stream) {
-    DLAF_ASSERT(bool(handles_ptr_), "");
-    return handles_ptr_->getNextHandle(stream);
-  }
-
-  int getDevice() {
-    DLAF_ASSERT(bool(handles_ptr_), "");
-    return handles_ptr_->getDevice();
-  }
-
-  bool operator==(HandlePool const& rhs) const noexcept {
-    return handles_ptr_ == rhs.handles_ptr_;
-  }
-
-  bool operator!=(HandlePool const& rhs) const noexcept {
-    return !(*this == rhs);
-  }
-};
-
 /// An executor for cuSOLVER functions. Uses handles and streams from the given
 /// HandlePool and StreamPool. A cuSOLVER function is defined as any function
 /// that takes a cuSOLVER handle as the first argument. The executor inserts a
diff --git a/include/dlaf/cusolver/handle_pool.h b/include/dlaf/cusolver/handle_pool.h
new file mode 100644
index 0000000000..1cc1aabda1
--- /dev/null
+++ b/include/dlaf/cusolver/handle_pool.h
@@ -0,0 +1,128 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2018-2021, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+
+#pragma once
+
+/// @file
+
+#ifdef DLAF_WITH_CUDA
+
+#include <cstddef>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+
+#include <hpx/local/runtime.hpp>
+
+#include "dlaf/common/assert.h"
+#include "dlaf/cuda/error.h"
+#include "dlaf/cusolver/error.h"
+
+namespace dlaf {
+namespace cusolver {
+namespace internal {
+/// Owns one cuSOLVER handle per HPX worker thread for a single CUDA device.
+///
+/// The handles are created in the constructor and destroyed in the destructor.
+class HandlePoolImpl {
+  int device_;
+  std::size_t num_worker_threads_ = hpx::get_num_worker_threads();
+  std::vector<cusolverDnHandle_t> handles_;
+
+public:
+  /// Makes @p device current and creates one cuSOLVER handle per HPX worker thread.
+  HandlePoolImpl(int device) : device_(device), handles_(num_worker_threads_) {
+    DLAF_CUDA_CALL(cudaSetDevice(device_));
+
+    for (auto& h : handles_) {
+      DLAF_CUSOLVER_CALL(cusolverDnCreate(&h));
+    }
+  }
+
+  // NOTE(review): the defaulted move assignment replaces handles_ without
+  // destroying the handles it previously held; confirm it is never used on a
+  // populated pool (in this header the pool is only held via std::shared_ptr).
+  HandlePoolImpl& operator=(HandlePoolImpl&&) = default;
+  HandlePoolImpl(HandlePoolImpl&&) = default;
+  HandlePoolImpl(const HandlePoolImpl&) = delete;
+  HandlePoolImpl& operator=(const HandlePoolImpl&) = delete;
+
+  /// Destroys every cuSOLVER handle owned by the pool.
+  ~HandlePoolImpl() {
+    for (auto& h : handles_) {
+      DLAF_CUSOLVER_CALL(cusolverDnDestroy(h));
+    }
+  }
+
+  /// Returns the handle of the calling HPX worker thread, bound to @p stream.
+  ///
+  /// Makes the pool's device current and sets the stream on the handle before
+  /// returning it.
+  ///
+  /// @pre must be called from an HPX worker thread.
+  cusolverDnHandle_t getNextHandle(cudaStream_t stream) {
+    // hpx::get_worker_thread_num() reports std::size_t(-1) when the caller is
+    // not an HPX worker thread; indexing handles_ with it would be out of
+    // bounds, so check it the same way cuda::StreamPool does.
+    const std::size_t worker_thread_num = hpx::get_worker_thread_num();
+    DLAF_ASSERT(worker_thread_num != std::size_t(-1), worker_thread_num);
+    cusolverDnHandle_t handle = handles_[worker_thread_num];
+    DLAF_CUDA_CALL(cudaSetDevice(device_));
+    DLAF_CUSOLVER_CALL(cusolverDnSetStream(handle, stream));
+    return handle;
+  }
+
+  /// Returns the CUDA device id the handles were created for.
+  int getDevice() {
+    return device_;
+  }
+};
+}
+
+/// A pool of cuSOLVER handles with reference semantics (copying points to the
+/// same underlying cuSOLVER handles, last reference destroys the handles).
+/// Allows access to cuSOLVER handles associated with a particular stream. The
+/// user must ensure that the handle pool and the stream use the same device.
+/// Each HPX worker thread is assigned a thread local cuSOLVER handle.
+class HandlePool {
+  std::shared_ptr<internal::HandlePoolImpl> handles_ptr_;
+
+public:
+  /// Creates a new set of handles on @p device.
+  HandlePool(int device = 0) : handles_ptr_(std::make_shared<internal::HandlePoolImpl>(device)) {}
+
+  /// Returns the calling worker thread's handle after binding it to @p stream.
+  cusolverDnHandle_t getNextHandle(cudaStream_t stream) {
+    DLAF_ASSERT(bool(handles_ptr_), "");
+    return handles_ptr_->getNextHandle(stream);
+  }
+
+  /// Returns the CUDA device id of the underlying handles.
+  int getDevice() {
+    DLAF_ASSERT(bool(handles_ptr_), "");
+    return handles_ptr_->getDevice();
+  }
+
+  /// Two pools compare equal only if they share the same underlying handles.
+  bool operator==(HandlePool const& rhs) const noexcept {
+    return handles_ptr_ == rhs.handles_ptr_;
+  }
+
+  bool operator!=(HandlePool const& rhs) const noexcept {
+    return !(*this == rhs);
+  }
+};
+}
+}
+
+#endif
diff --git a/include/dlaf/factorization/cholesky/impl.h b/include/dlaf/factorization/cholesky/impl.h
index 67f51ccf0a..5b5ab986a4 100644
--- a/include/dlaf/factorization/cholesky/impl.h
+++ b/include/dlaf/factorization/cholesky/impl.h
@@ -42,45 +42,49 @@ namespace dlaf {
 namespace factorization {
 namespace internal {
 
-template <class Executor, Device device, class T>
-void potrfDiagTile(Executor&& exec, hpx::future<matrix::Tile<T, device>> matrix_tile) {
-  hpx::dataflow(exec, matrix::unwrapExtendTiles(tile::potrf_o), blas::Uplo::Lower,
-                std::move(matrix_tile));
+/// Submits the Lower potrf of @p matrix_tile through transformDetach for @p backend.
+/// Uses high priority: the panel updates of the same step read the factorized tile.
+template <Backend backend, Device device, class T>
+void potrfDiagTile(hpx::future<matrix::Tile<T, device>> matrix_tile) {
+  transformDetach<backend>(hpx::threads::thread_priority::high, tile::potrf_o, blas::Uplo::Lower,
+                           std::move(matrix_tile));
 }
 
-template <class Executor, Device device, class T>
-void trsmPanelTile(Executor&& executor_hp, hpx::shared_future<matrix::Tile<const T, device>> kk_tile,
-                   hpx::future<matrix::Tile<T, device>> matrix_tile) {
-  hpx::dataflow(executor_hp, matrix::unwrapExtendTiles(tile::trsm_o), blas::Side::Right,
-                blas::Uplo::Lower, blas::Op::ConjTrans, blas::Diag::NonUnit, T(1.0), std::move(kk_tile),
-                std::move(matrix_tile));
+/// Submits the panel update trsm (Right, Lower, ConjTrans, NonUnit, alpha = 1) of
+/// @p matrix_tile with the diagonal tile @p kk_tile, at high priority.
+template <Backend backend, typename KKTileSender, template <typename> typename MatrixTileSender,
+          Device device, class T>
+void trsmPanelTile(KKTileSender kk_tile, MatrixTileSender<matrix::Tile<T, device>> matrix_tile) {
+  transformDetach<backend>(hpx::threads::thread_priority::high, tile::trsm_o, blas::Side::Right,
+                           blas::Uplo::Lower, blas::Op::ConjTrans, blas::Diag::NonUnit, T(1.0),
+                           std::move(kk_tile), std::move(matrix_tile));
 }
 
-template <class Executor, Device device, class T>
-void herkTrailingDiagTile(Executor&& trailing_matrix_executor,
-                          hpx::shared_future<matrix::Tile<const T, device>> panel_tile,
-                          hpx::future<matrix::Tile<T, device>> matrix_tile) {
-  hpx::dataflow(trailing_matrix_executor, matrix::unwrapExtendTiles(tile::herk_o), blas::Uplo::Lower,
-                blas::Op::NoTrans, BaseType<T>(-1.0), panel_tile, BaseType<T>(1.0),
-                std::move(matrix_tile));
+/// Submits the trailing diagonal tile update herk (Lower, NoTrans, alpha = -1, beta = 1) of
+/// @p matrix_tile with @p panel_tile, at the given @p priority.
+template <Backend backend, typename PanelTileSender, template <typename> typename MatrixTileSender,
+          Device device, class T>
+void herkTrailingDiagTile(hpx::threads::thread_priority priority, PanelTileSender panel_tile,
+                          MatrixTileSender<matrix::Tile<T, device>> matrix_tile) {
+  transformDetach<backend>(priority, tile::herk_o, blas::Uplo::Lower, blas::Op::NoTrans,
+                           BaseType<T>(-1.0), std::move(panel_tile), BaseType<T>(1.0),
+                           std::move(matrix_tile));
 }
 
-template <class Executor, Device device, class T>
-void gemmTrailingMatrixTile(Executor&& trailing_matrix_executor,
-                            hpx::shared_future<matrix::Tile<const T, device>> panel_tile,
-                            hpx::shared_future<matrix::Tile<const T, device>> col_panel,
-                            hpx::future<matrix::Tile<T, device>> matrix_tile) {
-  hpx::dataflow(trailing_matrix_executor, matrix::unwrapExtendTiles(tile::gemm_o), blas::Op::NoTrans,
-                blas::Op::ConjTrans, T(-1.0), std::move(panel_tile), std::move(col_panel), T(1.0),
-                std::move(matrix_tile));
+/// Submits the trailing tile update gemm (NoTrans, ConjTrans, alpha = -1, beta = 1) of
+/// @p matrix_tile with @p panel_tile and @p col_panel, at the given @p priority.
+template <Backend backend, typename PanelColTileSender, template <typename> typename MatrixTileSender,
+          Device device, class T>
+void gemmTrailingMatrixTile(hpx::threads::thread_priority priority, PanelColTileSender panel_tile,
+                            PanelColTileSender col_panel,
+                            MatrixTileSender<matrix::Tile<T, device>> matrix_tile) {
+  transformDetach<backend>(priority, tile::gemm_o, blas::Op::NoTrans, blas::Op::ConjTrans, T(-1.0),
+                           std::move(panel_tile), std::move(col_panel), T(1.0), std::move(matrix_tile));
 }
 
 // Local implementation of Lower Cholesky factorization.
 template <Backend backend, Device device, class T>
 void Cholesky<backend, device, T>::call_L(Matrix<T, device>& mat_a) {
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
-
   // Number of tile (rows = cols)
   SizeType nrtile = mat_a.nrTiles().cols();
 
@@ -88,26 +84,29 @@ void Cholesky<backend, device, T>::call_L(Matrix<T, device>& mat_a) {
     // Cholesky decomposition on mat_a(k,k) r/w potrf (lapack operation)
     auto kk = LocalTileIndex{k, k};
 
-    potrfDiagTile(executor_hp, mat_a(kk));
+    potrfDiagTile<backend>(mat_a.readwrite_sender(kk));
 
     for (SizeType i = k + 1; i < nrtile; ++i) {
       // Update panel mat_a(i,k) with trsm (blas operation), using data mat_a.read(k,k)
-      trsmPanelTile(executor_hp, mat_a.read(kk), mat_a(LocalTileIndex{i, k}));
+      trsmPanelTile<backend>(mat_a.read_sender(kk), mat_a.readwrite_sender(LocalTileIndex{i, k}));
     }
 
     for (SizeType j = k + 1; j < nrtile; ++j) {
       // first trailing panel gets high priority (look ahead).
-      auto& trailing_matrix_executor = (j == k + 1) ? executor_hp : executor_np;
+      const auto trailing_matrix_priority =
+          (j == k + 1) ? hpx::threads::thread_priority::high : hpx::threads::thread_priority::normal;
 
       // Update trailing matrix: diagonal element mat_a(j,j), reading mat_a.read(j,k), using herk (blas operation)
-      herkTrailingDiagTile(trailing_matrix_executor, mat_a.read(LocalTileIndex{j, k}),
-                           mat_a(LocalTileIndex{j, j}));
+      herkTrailingDiagTile<backend>(trailing_matrix_priority, mat_a.read_sender(LocalTileIndex{j, k}),
+                                    mat_a.readwrite_sender(LocalTileIndex{j, j}));
 
       for (SizeType i = j + 1; i < nrtile; ++i) {
         // Update remaining trailing matrix mat_a(i,j), reading mat_a.read(i,k) and mat_a.read(j,k),
         // using gemm (blas operation)
-        gemmTrailingMatrixTile(trailing_matrix_executor, mat_a.read(LocalTileIndex{i, k}),
-                               mat_a.read(LocalTileIndex{j, k}), mat_a(LocalTileIndex{i, j}));
+        gemmTrailingMatrixTile<backend>(trailing_matrix_priority,
+                                        mat_a.read_sender(LocalTileIndex{i, k}),
+                                        mat_a.read_sender(LocalTileIndex{j, k}),
+                                        mat_a.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -115,11 +114,6 @@ void Cholesky<backend, device, T>::call_L(Matrix<T, device>& mat_a) {
 
 template <Backend backend, Device device, class T>
 void Cholesky<backend, device, T>::call_L(comm::CommunicatorGrid grid, Matrix<T, device>& mat_a) {
-  using hpx::util::unwrapping;
-  using hpx::dataflow;
-
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
   auto executor_mpi = dlaf::getMPIExecutor<backend>();
 
   // Set up MPI executor pipelines
@@ -141,7 +135,7 @@ void Cholesky<backend, device, T>::call_L(comm::CommunicatorGrid grid, Matrix<T,
 
     // Factorization of diagonal tile and broadcast it along the k-th column
     if (kk_rank == this_rank)
-      potrfDiagTile(executor_hp, mat_a(kk_idx));
+      potrfDiagTile<backend>(mat_a.readwrite_sender(kk_idx));
 
     // If there is no trailing matrix
     if (k == nrtile - 1)
@@ -180,7 +174,7 @@ void Cholesky<backend, device, T>::call_L(comm::CommunicatorGrid grid, Matrix<T,
         const LocalTileIndex local_idx(Coord::Row, i);
         const LocalTileIndex ik_idx(i, distr.localTileFromGlobalTile<Coord::Col>(k));
 
-        trsmPanelTile(executor_hp, panelT.read(diag_wp_idx), mat_a(ik_idx));
+        trsmPanelTile<backend>(panelT.read_sender(diag_wp_idx), mat_a.readwrite_sender(ik_idx));
 
         panel.setTile(local_idx, mat_a.read(ik_idx));
       }
@@ -203,12 +197,13 @@ void Cholesky<backend, device, T>::call_L(comm::CommunicatorGrid grid, Matrix<T,
         continue;
 
       const auto j = distr.localTileFromGlobalTile<Coord::Col>(jt_idx);
-      auto& trailing_matrix_executor = (jt_idx == kt) ? executor_hp : executor_np;
+      const auto trailing_matrix_priority =
+          (jt_idx == kt) ? hpx::threads::thread_priority::high : hpx::threads::thread_priority::normal;
       if (this_rank.row() == owner.row()) {
         const auto i = distr.localTileFromGlobalTile<Coord::Row>(jt_idx);
 
-        herkTrailingDiagTile(trailing_matrix_executor, panel.read({Coord::Row, i}),
-                             mat_a(LocalTileIndex{i, j}));
+        herkTrailingDiagTile<backend>(trailing_matrix_priority, panel.read_sender({Coord::Row, i}),
+                                      mat_a.readwrite_sender(LocalTileIndex{i, j}));
       }
 
       for (SizeType i_idx = jt_idx + 1; i_idx < nrtile; ++i_idx) {
@@ -218,8 +213,12 @@ void Cholesky<backend, device, T>::call_L(comm::CommunicatorGrid grid, Matrix<T,
           continue;
 
         const auto i = distr.localTileFromGlobalTile<Coord::Row>(i_idx);
-        gemmTrailingMatrixTile(executor_np, panel.read({Coord::Row, i}), panelT.read({Coord::Col, j}),
-                               mat_a(LocalTileIndex{i, j}));
+        // TODO: This was using executor_np. Was that intentional, or should it
+        // be trailing_matrix_executor/priority?
+        gemmTrailingMatrixTile<backend>(hpx::threads::thread_priority::normal,
+                                        panel.read_sender({Coord::Row, i}),
+                                        panelT.read_sender({Coord::Col, j}),
+                                        mat_a.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
 
diff --git a/include/dlaf/matrix/copy.h b/include/dlaf/matrix/copy.h
index fbb2e0786d..f16c1759f3 100644
--- a/include/dlaf/matrix/copy.h
+++ b/include/dlaf/matrix/copy.h
@@ -15,6 +15,7 @@
 
 #include "dlaf/executors.h"
 #include "dlaf/matrix/copy_tile.h"
+#include "dlaf/sender/transform.h"
 #include "dlaf/types.h"
 #include "dlaf/util_matrix.h"
 
@@ -36,10 +37,16 @@ void copy(Matrix<const T, Source>& source, Matrix<T, Destination>& dest) {
   const SizeType local_tile_rows = distribution.localNrTiles().rows();
   const SizeType local_tile_cols = distribution.localNrTiles().cols();
 
-  for (SizeType j = 0; j < local_tile_cols; ++j)
-    for (SizeType i = 0; i < local_tile_rows; ++i)
-      hpx::dataflow(dlaf::getCopyExecutor<Source, Destination>(), unwrapExtendTiles(copy_o),
-                    source.read(LocalTileIndex(i, j)), dest(LocalTileIndex(i, j)));
+  for (SizeType j = 0; j < local_tile_cols; ++j) {
+    for (SizeType i = 0; i < local_tile_rows; ++i) {
+      transformDetach<
+          internal::CopyBackend<Source, Destination>::value>(hpx::threads::thread_priority::normal,
+                                                             copy_o,
+                                                             source.read_sender(LocalTileIndex(i, j)),
+                                                             dest.readwrite_sender(
+                                                                 LocalTileIndex(i, j)));
+    }
+  }
 }
 }
 }
diff --git a/include/dlaf/matrix/copy_tile.h b/include/dlaf/matrix/copy_tile.h
index 467e1a77c9..ca29ff5663 100644
--- a/include/dlaf/matrix/copy_tile.h
+++ b/include/dlaf/matrix/copy_tile.h
@@ -22,10 +22,36 @@
 #include "dlaf/executors.h"
 #include "dlaf/lapack/tile.h"
 #include "dlaf/matrix/tile.h"
+#include "dlaf/sender/transform.h"
 
 namespace dlaf {
 namespace matrix {
 namespace internal {
+template <Device Source, Device Destination>
+struct CopyBackend;
+
+template <>
+struct CopyBackend<Device::CPU, Device::CPU> {
+  static constexpr Backend value = Backend::MC;
+};
+
+#ifdef DLAF_WITH_CUDA
+template <>
+struct CopyBackend<Device::CPU, Device::GPU> {
+  static constexpr Backend value = Backend::GPU;
+};
+
+template <>
+struct CopyBackend<Device::GPU, Device::CPU> {
+  static constexpr Backend value = Backend::GPU;
+};
+
+template <>
+struct CopyBackend<Device::GPU, Device::GPU> {
+  static constexpr Backend value = Backend::GPU;
+};
+#endif
+
 template <typename T, Device Source, Device Destination>
 struct CopyTile;
 
@@ -155,32 +181,37 @@ struct Duplicate {
   }
 };
 
-namespace internal {
-template <Device Destination, Device Source>
-struct DuplicateIfNeeded {
-  template <typename T, template <class> class Future>
-  static auto call(Future<Tile<T, Source>> tile) {
-    return getUnwrapReturnValue(hpx::dataflow(dlaf::getCopyExecutor<Source, Destination>(),
-                                              unwrapExtendTiles(Duplicate<Destination>{}), tile));
-  }
-};
-
-template <Device SourceDestination>
-struct DuplicateIfNeeded<SourceDestination, SourceDestination> {
-  template <typename T, template <class> class Future>
-  static auto call(Future<Tile<T, SourceDestination>> tile) {
-    return tile;
-  }
-};
-}
-
 /// Helper function for duplicating an input tile to Destination asynchronously,
 /// but only if the destination device is different from the source device.
 ///
 /// When Destination and Source are the same, returns the input tile unmodified.
-template <Device Destination, typename T, Device Source, template <class> class Future>
-auto duplicateIfNeeded(Future<Tile<T, Source>> tile) {
-  return internal::DuplicateIfNeeded<Destination, Source>::call(std::move(tile));
+template <Device Destination, typename T, Device Source>
+auto duplicateIfNeeded(hpx::future<Tile<T, Source>> tile) {
+  if constexpr (Source == Destination) {
+    return tile;
+  }
+  else {
+    return hpx::execution::experimental::make_future(
+        dlaf::transform<
+            internal::CopyBackend<Source, Destination>::value>(hpx::threads::thread_priority::normal,
+                                                               dlaf::matrix::Duplicate<Destination>{},
+                                                               std::move(tile)));
+  }
+}
+
+template <Device Destination, typename T, Device Source>
+auto duplicateIfNeeded(hpx::shared_future<Tile<T, Source>> tile) {
+  if constexpr (Source == Destination) {
+    return tile;
+  }
+  else {
+    return hpx::execution::experimental::make_future(
+        dlaf::transform<
+            internal::CopyBackend<Source, Destination>::value>(hpx::threads::thread_priority::normal,
+                                                               dlaf::matrix::Duplicate<Destination>{},
+                                                               hpx::execution::experimental::keep_future(
+                                                                   std::move(tile))));
+  }
 }
 }
 }
diff --git a/include/dlaf/matrix/matrix.h b/include/dlaf/matrix/matrix.h
index a609d09089..4f2613982f 100644
--- a/include/dlaf/matrix/matrix.h
+++ b/include/dlaf/matrix/matrix.h
@@ -128,6 +128,14 @@ class Matrix : public Matrix<const T, device> {
     return operator()(this->distribution().localTileIndex(index));
   }
 
+  auto readwrite_sender(const LocalTileIndex& index) noexcept {
+    return this->operator()(index);
+  }
+
+  auto readwrite_sender(const GlobalTileIndex& index) {
+    return readwrite_sender(this->distribution().localTileIndex(index));
+  }
+
 protected:
   using Matrix<const T, device>::tileLinearIndex;
 
@@ -175,6 +183,16 @@ class Matrix<const T, device> : public internal::MatrixBase {
     return read(distribution().localTileIndex(index));
   }
 
+  auto read_sender(const LocalTileIndex& index) noexcept {
+    // We want to explicitly deal with the shared_future, not the const& to the
+    // value.
+    return hpx::execution::experimental::keep_future(read(index));
+  }
+
+  auto read_sender(const GlobalTileIndex& index) {
+    return read_sender(distribution().localTileIndex(index));
+  }
+
   /// Synchronization barrier for all local tiles in the matrix
   ///
   /// This blocking call does not return until all operations, i.e. both RO and RW,
diff --git a/include/dlaf/matrix/panel.h b/include/dlaf/matrix/panel.h
index 28cde56096..a503024851 100644
--- a/include/dlaf/matrix/panel.h
+++ b/include/dlaf/matrix/panel.h
@@ -104,6 +104,10 @@ struct Panel<axis, const T, D> {
     }
   }
 
+  auto read_sender(const LocalTileIndex& index) {
+    return hpx::execution::experimental::keep_future(read(index));
+  }
+
   /// Set the panel to enable access to the range of tiles [start, end)
   ///
   /// With respect to the parent matrix.
@@ -300,9 +304,12 @@ struct Panel : public Panel<axis, const T, device> {
     return BaseT::data_(BaseT::fullIndex(index));
   }
 
+  auto readwrite_sender(const LocalTileIndex& index) {
+    return hpx::execution::experimental::keep_future(this->operator()(index));
+  }
+
 protected:
   using BaseT = Panel<axis, const T, device>;
 };
-
 }
 }
diff --git a/include/dlaf/matrix/tile.tpp b/include/dlaf/matrix/tile.tpp
index 4d9965d2b7..740c1537f7 100644
--- a/include/dlaf/matrix/tile.tpp
+++ b/include/dlaf/matrix/tile.tpp
@@ -30,7 +30,7 @@ Tile<const T, device>::Tile(Tile&& rhs) noexcept
 template <class T, Device device>
 Tile<const T, device>::~Tile() {
   if (p_) {
-    if (std::uncaught_exception())
+    if (std::uncaught_exceptions() > 0)
       p_->set_exception(std::make_exception_ptr(ContinuationException{}));
     else
       p_->set_value(Tile<ElementType, device>(size_, std::move(memory_view_), ld_));
diff --git a/include/dlaf/sender/lift_non_sender.h b/include/dlaf/sender/lift_non_sender.h
new file mode 100644
index 0000000000..1ebd9d99c1
--- /dev/null
+++ b/include/dlaf/sender/lift_non_sender.h
@@ -0,0 +1,31 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2020-2021, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include <hpx/local/execution.hpp>
+
+#include <type_traits>
+#include <utility>
+
+namespace dlaf {
+namespace internal {
+// Utility to make a sender out of a non-sender (non-senders are wrapped in
+// just).
+template <typename S, typename = std::enable_if_t<hpx::execution::experimental::is_sender<S>::value>>
+decltype(auto) liftNonSender(S&& s) {
+  return std::forward<S>(s);
+}
+
+template <typename S, typename = std::enable_if_t<!hpx::execution::experimental::is_sender<S>::value>>
+auto liftNonSender(S&& s) {
+  return hpx::execution::experimental::just(std::forward<S>(s));
+}
+}
+}
diff --git a/include/dlaf/sender/transform.h b/include/dlaf/sender/transform.h
new file mode 100644
index 0000000000..50acc1d907
--- /dev/null
+++ b/include/dlaf/sender/transform.h
@@ -0,0 +1,203 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2020-2021, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include <hpx/local/execution.hpp>
+
+#include "dlaf/init.h"
+#include "dlaf/sender/when_all_lift.h"
+#include "dlaf/types.h"
+
+#ifdef DLAF_WITH_CUDA
+#include <cublas_v2.h>
+#include <cuda_runtime.h>
+#include <cusolverDn.h>
+
+#include "dlaf/cublas/handle_pool.h"
+#include "dlaf/cuda/stream_pool.h"
+#include "dlaf/cusolver/handle_pool.h"
+#endif
+
+namespace dlaf {
+namespace internal {
+// DLAF-specific transform, templated on a backend. This, together with
+// when_all, takes the place of dataflow(executor, ...) for futures.
+template <Backend B>
+struct Transform;
+
+// For Backend::MC we use the regular thread pool scheduler from HPX.
+template <>
+struct Transform<Backend::MC> {
+  template <typename S, typename F>
+  static auto call(hpx::threads::thread_priority priority, S&& s, F&& f) {
+    namespace ex = hpx::execution::experimental;
+    return ex::transform(ex::on(std::forward<S>(s), ex::make_with_priority(ex::executor{}, priority)),
+                         hpx::util::unwrapping(std::forward<F>(f)));
+  }
+};
+
+#ifdef DLAF_WITH_CUDA
+// For Backend::GPU we use a custom sender.
+template <>
+struct Transform<Backend::GPU> {
+  template <typename S, typename F>
+  struct GPUTransformSender {
+    cuda::StreamPool stream_pool;
+    cublas::HandlePool cublas_handle_pool;
+    cusolver::HandlePool cusolver_handle_pool;
+    std::decay_t<S> s;
+    std::decay_t<F> f;
+
+    template <typename G, typename... Us>
+    static auto call_helper(cudaStream_t stream, cublasHandle_t cublas_handle,
+                            cusolverDnHandle_t cusolver_handle, G&& g, Us&... ts) {
+      using unwrapping_function_type = decltype(hpx::util::unwrapping(std::forward<G>(g)));
+      static_assert(std::is_invocable_v<unwrapping_function_type, Us..., cudaStream_t> ||
+                        std::is_invocable_v<unwrapping_function_type, cublasHandle_t, Us...> ||
+                        std::is_invocable_v<unwrapping_function_type, cusolverDnHandle_t, Us...>,
+                    "function passed to transform<GPU> must be invocable with a cudaStream_t as the "
+                    "last argument or a cublasHandle_t/cusolverDnHandle_t as the first argument");
+
+      if constexpr (std::is_invocable_v<unwrapping_function_type, Us..., cudaStream_t>) {
+        (void) cublas_handle;
+        (void) cusolver_handle;
+        return std::invoke(hpx::util::unwrapping(std::forward<G>(g)), ts..., stream);
+      }
+      else if constexpr (std::is_invocable_v<unwrapping_function_type, cublasHandle_t, Us...>) {
+        (void) cusolver_handle;
+        return std::invoke(hpx::util::unwrapping(std::forward<G>(g)), cublas_handle, ts...);
+      }
+      else if constexpr (std::is_invocable_v<unwrapping_function_type, cusolverDnHandle_t, Us...>) {
+        (void) cublas_handle;
+        return std::invoke(hpx::util::unwrapping(std::forward<G>(g)), cusolver_handle, ts...);
+      }
+    }
+
+    template <typename Tuple>
+    struct invoke_result_helper;
+
+    template <template <typename...> class Tuple, typename... Ts>
+    struct invoke_result_helper<Tuple<Ts...>> {
+      using result_type = decltype(
+          call_helper(std::declval<cudaStream_t&>(), std::declval<cublasHandle_t&>(),
+                      std::declval<cusolverDnHandle_t&>(), std::declval<F>(), std::declval<Ts&>()...));
+      using type =
+          typename std::conditional<std::is_void<result_type>::value, Tuple<>, Tuple<result_type>>::type;
+    };
+
+    template <template <typename...> class Tuple, template <typename...> class Variant>
+    using value_types = hpx::util::detail::unique_t<hpx::util::detail::transform_t<
+        typename hpx::execution::experimental::sender_traits<S>::template value_types<Tuple, Variant>,
+        invoke_result_helper>>;
+
+    template <template <typename...> class Variant>
+    using error_types = hpx::util::detail::unique_t<hpx::util::detail::prepend_t<
+        typename hpx::execution::experimental::sender_traits<S>::template error_types<Variant>,
+        std::exception_ptr>>;
+
+    static constexpr bool sends_done = false;
+
+    template <typename R>
+    struct GPUTransformReceiver {
+      cuda::StreamPool stream_pool;
+      cublas::HandlePool cublas_handle_pool;
+      cusolver::HandlePool cusolver_handle_pool;
+      std::decay_t<R> r;
+      std::decay_t<F> f;
+
+      template <typename E>
+          void set_error(E&& e) && noexcept {
+        hpx::execution::experimental::set_error(std::move(r), std::forward<E>(e));
+      }
+
+      void set_done() && noexcept {
+        hpx::execution::experimental::set_done(std::move(r));
+      }
+
+      template <typename... Ts>
+      void set_value(Ts&&... ts) noexcept {
+        try {
+          cudaStream_t stream = stream_pool.getNextStream();
+          cublasHandle_t cublas_handle = cublas_handle_pool.getNextHandle(stream);
+          cusolverDnHandle_t cusolver_handle = cusolver_handle_pool.getNextHandle(stream);
+
+          // NOTE: We do not forward ts because we keep the pack alive longer in
+          // the continuation.
+          if constexpr (std::is_void_v<decltype(
+                            call_helper(stream, cublas_handle, cusolver_handle, std::move(f), ts...))>) {
+            call_helper(stream, cublas_handle, cusolver_handle, std::move(f), ts...);
+            hpx::cuda::experimental::detail::add_event_callback(
+                [r = std::move(r),
+                 keep_alive =
+                     std::make_tuple(std::forward<Ts>(ts)..., std::move(stream_pool),
+                                     std::move(cublas_handle_pool),
+                                     std::move(cusolver_handle_pool))](cudaError_t status) mutable {
+                  DLAF_CUDA_CALL(status);
+                  hpx::execution::experimental::set_value(std::move(r));
+                },
+                stream);
+          }
+          else {
+            auto res = call_helper(stream, cublas_handle, cusolver_handle, std::move(f), ts...);
+            hpx::cuda::experimental::detail::add_event_callback(
+                [r = std::move(r), res = std::move(res),
+                 keep_alive =
+                     std::make_tuple(std::forward<Ts>(ts)..., std::move(stream_pool),
+                                     std::move(cublas_handle_pool),
+                                     std::move(cusolver_handle_pool))](cudaError_t status) mutable {
+                  DLAF_CUDA_CALL(status);
+                  hpx::execution::experimental::set_value(std::move(r), std::move(res));
+                },
+                stream);
+          }
+        }
+        catch (...) {
+          hpx::execution::experimental::set_error(std::move(r), std::current_exception());
+        }
+      }
+    };
+
+    template <typename R>
+    auto connect(R&& r) && {
+      return hpx::execution::experimental::connect(std::move(s),
+                                                   GPUTransformReceiver<R>{stream_pool,
+                                                                           cublas_handle_pool,
+                                                                           cusolver_handle_pool,
+                                                                           std::forward<R>(r),
+                                                                           std::move(f)});
+    }
+  };
+
+  template <typename S, typename F>
+  static auto call(hpx::threads::thread_priority priority, S&& s, F&& f) {
+    return GPUTransformSender<S, F>{priority >= hpx::threads::thread_priority::high
+                                        ? getHpCudaStreamPool()
+                                        : getNpCudaStreamPool(),
+                                    getCublasHandlePool(), getCusolverHandlePool(), std::forward<S>(s),
+                                    std::forward<F>(f)};
+  }
+};
+#endif
+}
+
+// Lazy transform. This does not submit the work and returns a sender.
+template <Backend B, typename F, typename... Ts>
+[[nodiscard]] decltype(auto) transform(hpx::threads::thread_priority priority, F&& f, Ts&&... ts) {
+  return internal::Transform<B>::call(priority, internal::whenAllLift(std::forward<Ts>(ts)...),
+                                      std::forward<F>(f));
+}
+
+// Fire-and-forget transform. This submits the work and returns void.
+template <Backend B, typename F, typename... Ts>
+void transformDetach(hpx::threads::thread_priority priority, F&& f, Ts&&... ts) {
+  hpx::execution::experimental::detach(
+      transform<B>(priority, std::forward<F>(f), std::forward<Ts>(ts)...));
+}
+}
diff --git a/include/dlaf/sender/when_all_lift.h b/include/dlaf/sender/when_all_lift.h
new file mode 100644
index 0000000000..f8075c1bbe
--- /dev/null
+++ b/include/dlaf/sender/when_all_lift.h
@@ -0,0 +1,28 @@
+//
+// Distributed Linear Algebra with Future (DLAF)
+//
+// Copyright (c) 2020-2021, ETH Zurich
+// All rights reserved.
+//
+// Please, refer to the LICENSE file in the root directory.
+// SPDX-License-Identifier: BSD-3-Clause
+//
+#pragma once
+
+#include <type_traits>
+#include <utility>
+
+#include <hpx/local/execution.hpp>
+
+#include "dlaf/sender/lift_non_sender.h"
+
+namespace dlaf {
+namespace internal {
+// when_all-like utility which makes senders out of non-senders before passing
+// them to when_all.
+template <typename... Ts>
+auto whenAllLift(Ts&&... ts) {
+  return hpx::execution::experimental::when_all(liftNonSender<Ts>(std::forward<Ts>(ts))...);
+}
+}
+}
diff --git a/include/dlaf/solver/triangular/impl.h b/include/dlaf/solver/triangular/impl.h
index 046a0b0fc5..3d97397e24 100644
--- a/include/dlaf/solver/triangular/impl.h
+++ b/include/dlaf/solver/triangular/impl.h
@@ -9,7 +9,10 @@
 //
 #pragma once
 
+#include <hpx/include/util.hpp>
+#include <hpx/local/execution.hpp>
 #include <hpx/local/future.hpp>
+#include <hpx/local/thread.hpp>
 
 #include "dlaf/blas/tile.h"
 #include "dlaf/common/index2d.h"
@@ -23,39 +26,21 @@
 #include "dlaf/lapack/tile.h"
 #include "dlaf/matrix/distribution.h"
 #include "dlaf/matrix/matrix.h"
+#include "dlaf/sender/transform.h"
 #include "dlaf/solver/triangular/api.h"
 #include "dlaf/util_matrix.h"
 
 namespace dlaf {
 namespace solver {
 namespace internal {
-
-namespace lln {
-template <class Executor, class T, Device D>
-void trsm_B_panel_tile(Executor&& ex, blas::Diag diag, T alpha,
-                       hpx::shared_future<matrix::Tile<const T, D>> in_tile,
-                       hpx::future<matrix::Tile<T, D>> out_tile) {
-  hpx::dataflow(std::forward<Executor>(ex), matrix::unwrapExtendTiles(tile::trsm_o), blas::Side::Left,
-                blas::Uplo::Lower, blas::Op::NoTrans, diag, alpha, std::move(in_tile),
-                std::move(out_tile));
-}
-
-template <class Executor, class T, Device D>
-void gemm_trailing_matrix_tile(Executor&& ex, T beta,
-                               hpx::shared_future<matrix::Tile<const T, D>> a_tile,
-                               hpx::shared_future<matrix::Tile<const T, D>> b_tile,
-                               hpx::future<matrix::Tile<T, D>> c_tile) {
-  hpx::dataflow(std::forward<Executor>(ex), matrix::unwrapExtendTiles(tile::gemm_o), blas::Op::NoTrans,
-                blas::Op::NoTrans, beta, std::move(a_tile), std::move(b_tile), T(1.0),
-                std::move(c_tile));
-}
-}
-
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_LLN(blas::Diag diag, T alpha, Matrix<const T, device>& mat_a,
                                               Matrix<T, device>& mat_b) {
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
+  using hpx::threads::thread_priority;
+
+  constexpr auto Left = blas::Side::Left;
+  constexpr auto Lower = blas::Uplo::Lower;
+  constexpr auto NoTrans = blas::Op::NoTrans;
 
   SizeType m = mat_b.nrTiles().rows();
   SizeType n = mat_b.nrTiles().cols();
@@ -65,15 +50,19 @@ void Triangular<backend, device, T>::call_LLN(blas::Diag diag, T alpha, Matrix<c
       auto kj = LocalTileIndex{k, j};
 
       // Triangular solve of k-th row Panel of B
-      lln::trsm_B_panel_tile(executor_hp, diag, alpha, mat_a.read(LocalTileIndex{k, k}), mat_b(kj));
+      transformDetach<backend>(thread_priority::high, tile::trsm_o, Left, Lower, NoTrans, diag,
+                               alpha, mat_a.read_sender(LocalTileIndex{k, k}),
+                               mat_b.readwrite_sender(kj));
 
       for (SizeType i = k + 1; i < m; ++i) {
         // Choose queue priority
-        auto& trailing_executor = (i == k + 1) ? executor_hp : executor_np;
+        const auto priority = (i == k + 1) ? thread_priority::high : thread_priority::normal;
+
         auto beta = static_cast<T>(-1.0) / alpha;
         // Update trailing matrix
-        lln::gemm_trailing_matrix_tile(trailing_executor, beta, mat_a.read(LocalTileIndex{i, k}),
-                                       mat_b.read(kj), mat_b(LocalTileIndex{i, j}));
+        transformDetach<backend>(priority, tile::gemm_o, NoTrans, NoTrans, beta,
+                                 mat_a.read_sender(LocalTileIndex{i, k}), mat_b.read_sender(kj), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -82,13 +71,13 @@ void Triangular<backend, device, T>::call_LLN(blas::Diag diag, T alpha, Matrix<c
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_LLT(blas::Op op, blas::Diag diag, T alpha,
                                               Matrix<const T, device>& mat_a, Matrix<T, device>& mat_b) {
+  using hpx::threads::thread_priority;
+  namespace ex = hpx::execution::experimental;
+
   constexpr auto Left = blas::Side::Left;
   constexpr auto Lower = blas::Uplo::Lower;
   constexpr auto NoTrans = blas::Op::NoTrans;
 
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
-
   SizeType m = mat_b.nrTiles().rows();
   SizeType n = mat_b.nrTiles().cols();
 
@@ -96,18 +85,19 @@ void Triangular<backend, device, T>::call_LLT(blas::Op op, blas::Diag diag, T al
     for (SizeType j = n - 1; j > -1; --j) {
       auto kj = LocalTileIndex{k, j};
       // Triangular solve of k-th row Panel of B
-      hpx::dataflow(executor_hp, matrix::unwrapExtendTiles(tile::trsm_o), Left, Lower, op, diag, alpha,
-                    mat_a.read(LocalTileIndex{k, k}), mat_b(kj));
+      transformDetach<backend>(thread_priority::high, tile::trsm_o, Left, Lower, op, diag, alpha,
+                               mat_a.read_sender(LocalTileIndex{k, k}), mat_b.readwrite_sender(kj));
 
       for (SizeType i = k - 1; i > -1; --i) {
         // Choose queue priority
-        auto& trailing_executor = (i == k - 1) ? executor_hp : executor_np;
+        const auto priority = (i == k - 1) ? thread_priority::high : thread_priority::normal;
 
         auto beta = static_cast<T>(-1.0) / alpha;
+
         // Update trailing matrix
-        hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), op, NoTrans, beta,
-                      mat_a.read(LocalTileIndex{k, i}), mat_b.read(kj), T(1.0),
-                      mat_b(LocalTileIndex{i, j}));
+        transformDetach<backend>(priority, tile::gemm_o, op, NoTrans, beta,
+                                 mat_a.read_sender(LocalTileIndex{k, i}), mat_b.read_sender(kj), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -116,13 +106,12 @@ void Triangular<backend, device, T>::call_LLT(blas::Op op, blas::Diag diag, T al
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_LUN(blas::Diag diag, T alpha, Matrix<const T, device>& mat_a,
                                               Matrix<T, device>& mat_b) {
+  using hpx::threads::thread_priority;
+
   constexpr auto Left = blas::Side::Left;
   constexpr auto Upper = blas::Uplo::Upper;
   constexpr auto NoTrans = blas::Op::NoTrans;
 
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
-
   SizeType m = mat_b.nrTiles().rows();
   SizeType n = mat_b.nrTiles().cols();
 
@@ -130,18 +119,18 @@ void Triangular<backend, device, T>::call_LUN(blas::Diag diag, T alpha, Matrix<c
     for (SizeType j = n - 1; j > -1; --j) {
       auto kj = LocalTileIndex{k, j};
       // Triangular solve of k-th row Panel of B
-      hpx::dataflow(executor_hp, matrix::unwrapExtendTiles(tile::trsm_o), Left, Upper, NoTrans, diag,
-                    alpha, mat_a.read(LocalTileIndex{k, k}), mat_b(kj));
+      transformDetach<backend>(thread_priority::high, tile::trsm_o, Left, Upper, NoTrans, diag, alpha,
+                               mat_a.read_sender(LocalTileIndex{k, k}), mat_b.readwrite_sender(kj));
 
       for (SizeType i = k - 1; i > -1; --i) {
         // Choose queue priority
-        auto& trailing_executor = (i == k - 1) ? executor_hp : executor_np;
+        const auto priority = (i == k - 1) ? thread_priority::high : thread_priority::normal;
 
         auto beta = static_cast<T>(-1.0) / alpha;
         // Update trailing matrix
-        hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), NoTrans, NoTrans, beta,
-                      mat_a.read(LocalTileIndex{i, k}), mat_b.read(kj), T(1.0),
-                      mat_b(LocalTileIndex{i, j}));
+        transformDetach<backend>(priority, tile::gemm_o, NoTrans, NoTrans, beta,
+                                 mat_a.read_sender(LocalTileIndex{i, k}), mat_b.read_sender(kj), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -150,13 +139,12 @@ void Triangular<backend, device, T>::call_LUN(blas::Diag diag, T alpha, Matrix<c
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_LUT(blas::Op op, blas::Diag diag, T alpha,
                                               Matrix<const T, device>& mat_a, Matrix<T, device>& mat_b) {
+  using hpx::threads::thread_priority;
+
   constexpr auto Left = blas::Side::Left;
   constexpr auto Upper = blas::Uplo::Upper;
   constexpr auto NoTrans = blas::Op::NoTrans;
 
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
-
   SizeType m = mat_b.nrTiles().rows();
   SizeType n = mat_b.nrTiles().cols();
 
@@ -165,18 +153,18 @@ void Triangular<backend, device, T>::call_LUT(blas::Op op, blas::Diag diag, T al
       auto kj = LocalTileIndex{k, j};
 
       // Triangular solve of k-th row Panel of B
-      hpx::dataflow(executor_hp, matrix::unwrapExtendTiles(tile::trsm_o), Left, Upper, op, diag, alpha,
-                    mat_a.read(LocalTileIndex{k, k}), mat_b(kj));
+      transformDetach<backend>(thread_priority::high, tile::trsm_o, Left, Upper, op, diag, alpha,
+                               mat_a.read_sender(LocalTileIndex{k, k}), mat_b.readwrite_sender(kj));
 
       for (SizeType i = k + 1; i < m; ++i) {
         // Choose queue priority
-        auto& trailing_executor = (i == k + 1) ? executor_hp : executor_np;
+        const auto priority = (i == k + 1) ? thread_priority::high : thread_priority::normal;
 
         auto beta = static_cast<T>(-1.0) / alpha;
         // Update trailing matrix
-        hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), op, NoTrans, beta,
-                      mat_a.read(LocalTileIndex{k, i}), mat_b.read(kj), T(1.0),
-                      mat_b(LocalTileIndex{i, j}));
+        transformDetach<backend>(priority, tile::gemm_o, op, NoTrans, beta,
+                                 mat_a.read_sender(LocalTileIndex{k, i}), mat_b.read_sender(kj), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -185,13 +173,12 @@ void Triangular<backend, device, T>::call_LUT(blas::Op op, blas::Diag diag, T al
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_RLN(blas::Diag diag, T alpha, Matrix<const T, device>& mat_a,
                                               Matrix<T, device>& mat_b) {
+  using hpx::threads::thread_priority;
+
   constexpr auto Right = blas::Side::Right;
   constexpr auto Lower = blas::Uplo::Lower;
   constexpr auto NoTrans = blas::Op::NoTrans;
 
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
-
   SizeType m = mat_b.nrTiles().rows();
   SizeType n = mat_b.nrTiles().cols();
 
@@ -200,18 +187,18 @@ void Triangular<backend, device, T>::call_RLN(blas::Diag diag, T alpha, Matrix<c
       auto ik = LocalTileIndex{i, k};
 
       // Triangular solve of k-th col Panel of B
-      hpx::dataflow(executor_hp, matrix::unwrapExtendTiles(tile::trsm_o), Right, Lower, NoTrans, diag,
-                    alpha, mat_a.read(LocalTileIndex{k, k}), mat_b(ik));
+      transformDetach<backend>(thread_priority::high, tile::trsm_o, Right, Lower, NoTrans, diag, alpha,
+                               mat_a.read_sender(LocalTileIndex{k, k}), mat_b.readwrite_sender(ik));
 
       for (SizeType j = k - 1; j > -1; --j) {
         // Choose queue priority
-        auto& trailing_executor = (j == k - 1) ? executor_hp : executor_np;
+        const auto priority = (j == k - 1) ? thread_priority::high : thread_priority::normal;
 
         auto beta = static_cast<T>(-1.0) / alpha;
         // Update trailing matrix
-        hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), NoTrans, NoTrans, beta,
-                      mat_b.read(ik), mat_a.read(LocalTileIndex{k, j}), T(1.0),
-                      mat_b(LocalTileIndex{i, j}));
+        transformDetach<backend>(priority, tile::gemm_o, NoTrans, NoTrans, beta, mat_b.read_sender(ik),
+                                 mat_a.read_sender(LocalTileIndex{k, j}), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -220,13 +207,12 @@ void Triangular<backend, device, T>::call_RLN(blas::Diag diag, T alpha, Matrix<c
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_RLT(blas::Op op, blas::Diag diag, T alpha,
                                               Matrix<const T, device>& mat_a, Matrix<T, device>& mat_b) {
+  using hpx::threads::thread_priority;
+
   constexpr auto Right = blas::Side::Right;
   constexpr auto Lower = blas::Uplo::Lower;
   constexpr auto NoTrans = blas::Op::NoTrans;
 
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
-
   SizeType m = mat_b.nrTiles().rows();
   SizeType n = mat_b.nrTiles().cols();
 
@@ -235,18 +221,18 @@ void Triangular<backend, device, T>::call_RLT(blas::Op op, blas::Diag diag, T al
       auto ik = LocalTileIndex{i, k};
 
       // Triangular solve of k-th col Panel of B
-      hpx::dataflow(executor_hp, matrix::unwrapExtendTiles(tile::trsm_o), Right, Lower, op, diag, alpha,
-                    mat_a.read(LocalTileIndex{k, k}), mat_b(ik));
+      transformDetach<backend>(thread_priority::high, tile::trsm_o, Right, Lower, op, diag, alpha,
+                               mat_a.read_sender(LocalTileIndex{k, k}), mat_b.readwrite_sender(ik));
 
       for (SizeType j = k + 1; j < n; ++j) {
         // Choose queue priority
-        auto& trailing_executor = (j == k + 1) ? executor_hp : executor_np;
+        const auto priority = (j == k + 1) ? thread_priority::high : thread_priority::normal;
 
         auto beta = static_cast<T>(-1.0) / alpha;
         // Update trailing matrix
-        hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), NoTrans, op, beta,
-                      mat_b.read(ik), mat_a.read(LocalTileIndex{j, k}), T(1.0),
-                      mat_b(LocalTileIndex{i, j}));
+        transformDetach<backend>(priority, tile::gemm_o, NoTrans, op, beta, mat_b.read_sender(ik),
+                                 mat_a.read_sender(LocalTileIndex{j, k}), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -255,13 +241,12 @@ void Triangular<backend, device, T>::call_RLT(blas::Op op, blas::Diag diag, T al
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_RUN(blas::Diag diag, T alpha, Matrix<const T, device>& mat_a,
                                               Matrix<T, device>& mat_b) {
+  using hpx::threads::thread_priority;
+
   constexpr auto Right = blas::Side::Right;
   constexpr auto Upper = blas::Uplo::Upper;
   constexpr auto NoTrans = blas::Op::NoTrans;
 
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
-
   SizeType m = mat_b.nrTiles().rows();
   SizeType n = mat_b.nrTiles().cols();
 
@@ -270,18 +255,18 @@ void Triangular<backend, device, T>::call_RUN(blas::Diag diag, T alpha, Matrix<c
       auto ik = LocalTileIndex{i, k};
 
       // Triangular solve of k-th col Panel of B
-      hpx::dataflow(executor_hp, matrix::unwrapExtendTiles(tile::trsm_o), Right, Upper, NoTrans, diag,
-                    alpha, mat_a.read(LocalTileIndex{k, k}), mat_b(ik));
+      transformDetach<backend>(thread_priority::high, tile::trsm_o, Right, Upper, NoTrans, diag, alpha,
+                               mat_a.read_sender(LocalTileIndex{k, k}), mat_b.readwrite_sender(ik));
 
       for (SizeType j = k + 1; j < n; ++j) {
         // Choose queue priority
-        auto& trailing_executor = (j == k + 1) ? executor_hp : executor_np;
+        const auto priority = (j == k + 1) ? thread_priority::high : thread_priority::normal;
 
         auto beta = static_cast<T>(-1.0) / alpha;
         // Update trailing matrix
-        hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), NoTrans, NoTrans, beta,
-                      mat_b.read(ik), mat_a.read(LocalTileIndex{k, j}), T(1.0),
-                      mat_b(LocalTileIndex{i, j}));
+        transformDetach<backend>(priority, tile::gemm_o, NoTrans, NoTrans, beta, mat_b.read_sender(ik),
+                                 mat_a.read_sender(LocalTileIndex{k, j}), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -290,13 +275,12 @@ void Triangular<backend, device, T>::call_RUN(blas::Diag diag, T alpha, Matrix<c
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_RUT(blas::Op op, blas::Diag diag, T alpha,
                                               Matrix<const T, device>& mat_a, Matrix<T, device>& mat_b) {
+  using hpx::threads::thread_priority;
+
   constexpr auto Right = blas::Side::Right;
   constexpr auto Upper = blas::Uplo::Upper;
   constexpr auto NoTrans = blas::Op::NoTrans;
 
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
-
   SizeType m = mat_b.nrTiles().rows();
   SizeType n = mat_b.nrTiles().cols();
 
@@ -305,18 +289,18 @@ void Triangular<backend, device, T>::call_RUT(blas::Op op, blas::Diag diag, T al
       auto ik = LocalTileIndex{i, k};
 
       // Triangular solve of k-th col Panel of B
-      hpx::dataflow(executor_hp, matrix::unwrapExtendTiles(tile::trsm_o), Right, Upper, op, diag, alpha,
-                    mat_a.read(LocalTileIndex{k, k}), mat_b(ik));
+      transformDetach<backend>(thread_priority::high, tile::trsm_o, Right, Upper, op, diag, alpha,
+                               mat_a.read_sender(LocalTileIndex{k, k}), mat_b.readwrite_sender(ik));
 
       for (SizeType j = k - 1; j > -1; --j) {
         // Choose queue priority
-        auto& trailing_executor = (j == k - 1) ? executor_hp : executor_np;
+        const auto priority = (j == k - 1) ? thread_priority::high : thread_priority::normal;
 
         auto beta = static_cast<T>(-1.0) / alpha;
         // Update trailing matrix
-        hpx::dataflow(trailing_executor, matrix::unwrapExtendTiles(tile::gemm_o), NoTrans, op, beta,
-                      mat_b.read(ik), mat_a.read(LocalTileIndex{j, k}), T(1.0),
-                      mat_b(LocalTileIndex{i, j}));
+        transformDetach<backend>(priority, tile::gemm_o, NoTrans, op, beta, mat_b.read_sender(ik),
+                                 mat_a.read_sender(LocalTileIndex{j, k}), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i, j}));
       }
     }
   }
@@ -325,13 +309,17 @@ void Triangular<backend, device, T>::call_RUT(blas::Op op, blas::Diag diag, T al
 template <Backend backend, Device device, class T>
 void Triangular<backend, device, T>::call_LLN(comm::CommunicatorGrid grid, blas::Diag diag, T alpha,
                                               Matrix<const T, device>& mat_a, Matrix<T, device>& mat_b) {
+  namespace ex = hpx::execution::experimental;
+  using hpx::threads::thread_priority;
   using hpx::util::unwrapping;
 
+  constexpr auto Left = blas::Side::Left;
+  constexpr auto Lower = blas::Uplo::Lower;
+  constexpr auto NoTrans = blas::Op::NoTrans;
+
   using common::internal::vector;
   using ConstTileType = typename Matrix<T, device>::ConstTileType;
 
-  auto executor_hp = dlaf::getHpExecutor<backend>();
-  auto executor_np = dlaf::getNpExecutor<backend>();
   auto executor_mpi = dlaf::getMPIExecutor<backend>();
 
   // Set up MPI
@@ -377,7 +365,9 @@ void Triangular<backend, device, T>::call_LLN(comm::CommunicatorGrid grid, blas:
       if (mat_b.rankIndex().row() == k_rank_row) {
         auto k_local_row = distr_b.localTileFromGlobalTile<Coord::Row>(k);
         auto kj = LocalTileIndex{k_local_row, j_local};
-        lln::trsm_B_panel_tile(executor_hp, diag, alpha, kk_tile, mat_b(kj));
+        transformDetach<backend>(thread_priority::high, tile::trsm_o, Left, Lower, NoTrans, diag, alpha,
+                                 ex::keep_future(kk_tile), mat_b.readwrite_sender(kj));
+
         panel[j_local] = mat_b.read(kj);
         if (k != (mat_b.nrTiles().rows() - 1)) {
           comm::scheduleSendBcast(executor_mpi, panel[j_local], mpi_col_task_chain());
@@ -397,7 +387,7 @@ void Triangular<backend, device, T>::call_LLN(comm::CommunicatorGrid grid, blas:
       auto i = distr_a.globalTileFromLocalTile<Coord::Row>(i_local);
 
       // Choose queue priority
-      auto& trailing_executor = (i == k + 1) ? executor_hp : executor_np;
+      const auto priority = (i == k + 1) ? thread_priority::high : thread_priority::normal;
 
       hpx::shared_future<ConstTileType> ik_tile;
 
@@ -417,8 +407,9 @@ void Triangular<backend, device, T>::call_LLN(comm::CommunicatorGrid grid, blas:
       // Update trailing matrix
       for (SizeType j_local = 0; j_local < b_local_cols; ++j_local) {
         T beta = T(-1.0) / alpha;
-        lln::gemm_trailing_matrix_tile(trailing_executor, beta, ik_tile, panel[j_local],
-                                       mat_b(LocalTileIndex{i_local, j_local}));
+        transformDetach<backend>(priority, tile::gemm_o, NoTrans, NoTrans, beta,
+                                 ex::keep_future(ik_tile), ex::keep_future(panel[j_local]), T(1.0),
+                                 mat_b.readwrite_sender(LocalTileIndex{i_local, j_local}));
       }
     }
   }
diff --git a/spack/packages/dla-future/package.py b/spack/packages/dla-future/package.py
index a05c5c796b..5d9f2f01cd 100644
--- a/spack/packages/dla-future/package.py
+++ b/spack/packages/dla-future/package.py
@@ -27,8 +27,7 @@ class DlaFuture(CMakePackage, CudaPackage):
     depends_on("lapackpp")
     depends_on("umpire~examples")
     depends_on("umpire+cuda~shared", when="+cuda")
-    depends_on("hpx cxxstd=14 networking=none +async_mpi")
-    depends_on("hpx@1.6.0:")
+    depends_on("hpx networking=none +async_mpi")
     depends_on("hpx +cuda", when="+cuda")
 
     depends_on("hpx build_type=Debug", when="build_type=Debug")