From 0a743c7bec9091f2c85eb87bedfca7104e33db9a Mon Sep 17 00:00:00 2001
From: Philip Salzmann
Date: Tue, 11 Apr 2023 14:39:42 +0200
Subject: [PATCH] Remove multi-pass mechanism (invoke CGFs only once)

- Instead of storing the entire CGF, we now only store the inner "command function" lambda
- Accessors captured into command function closures are "hydrated" before launching the kernel
- Buffers and host objects can (and should) now be captured by reference into CGFs; added deprecation warnings
- Accessors, side-effects and reductions may now be created from non-const buffers
- Deprecate allow_by_ref
- Introduce a new mechanism to tie buffer and host object lifetimes to tasks (required as we no longer store captured copies inside the CGF)

Other changes:
- Fix a bug in test "horizons correctly deal with antidependencies", which relied on a fixed order of dependencies
- Change output of "command graph printing is unchanged" smoke test (again due to the change in ordering)
---
 examples/convolution/convolution.cc | 6 +-
 examples/distr_io/distr_io.cc | 12 +-
 examples/matmul/matmul.cc | 6 +-
 examples/reduction/reduction.cc | 8 +-
 examples/syncing/syncing.cc | 4 +-
 examples/wave_sim/wave_sim.cc | 12 +-
 include/accessor.h | 262 ++++++-----
 include/buffer.h | 37 +-
 include/closure_hydrator.h | 157 +++++++
 include/distr_queue.h | 27 +-
 include/handler.h | 666 +++++++++++++---------------
 include/host_object.h | 25 +-
 include/lifetime_extending_state.h | 43 ++
 include/runtime.h | 6 +-
 include/side_effect.h | 36 +-
 include/task.h | 100 +++--
 include/task_manager.h | 6 +-
 include/types.h | 1 +
 src/executor.cc | 3 +
 src/graph_generator.cc | 2 +-
 src/print_graph.cc | 2 +-
 src/runtime.cc | 9 +
 src/task.cc | 47 +-
 src/task_manager.cc | 2 +-
 src/worker_job.cc | 50 ++-
 test/accessor_tests.cc | 193 ++++++--
 test/buffer_manager_test_utils.h | 26 +-
 test/buffer_manager_tests.cc | 55 +--
 test/graph_compaction_tests.cc | 2 +-
 test/integration/backend.cc | 8 +-
 test/print_graph_tests.cc | 6 +-
 test/runtime_deprecation_tests.cc | 27 ++
 test/runtime_tests.cc | 156 +++++--
 test/system/distr_tests.cc | 46 +-
 test/task_graph_tests.cc | 1 -
 test/task_ring_buffer_tests.cc | 2 +-
 test/test_main.cc | 3 +
 test/test_utils.h | 14 +-
 38 files changed, 1309 insertions(+), 759 deletions(-)
 create mode 100644 include/closure_hydrator.h
 create mode 100644 include/lifetime_extending_state.h
diff --git a/examples/convolution/convolution.cc b/examples/convolution/convolution.cc index 4461ba5e1..9b3972a23 100644 --- a/examples/convolution/convolution.cc +++ b/examples/convolution/convolution.cc @@ -60,7 +60,7 @@ int main(int argc, char* argv[]) { celerity::buffer gaussian_mat_buf(gaussian_matrix.data(), celerity::range<2>(filter_size, filter_size)); // Do a gaussian blur - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor in{image_input_buf, cgh, celerity::access::neighborhood{filter_size / 2, filter_size / 2}, celerity::read_only}; celerity::accessor gauss{gaussian_mat_buf, cgh, celerity::access::all{}, celerity::read_only}; celerity::accessor out{image_tmp_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) { celerity::buffer image_output_buf(celerity::range<2>(image_height, image_width)); // Now apply a sharpening kernel - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor in{image_tmp_buf, cgh, celerity::access::neighborhood{1, 1}, celerity::read_only};
celerity::accessor out{image_output_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; @@ -107,7 +107,7 @@ int main(int argc, char* argv[]) { }); }); - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor out{image_output_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::on_master_node, [=] { diff --git a/examples/distr_io/distr_io.cc b/examples/distr_io/distr_io.cc index e56c6a81a..1560ce2f7 100644 --- a/examples/distr_io/distr_io.cc +++ b/examples/distr_io/distr_io.cc @@ -27,8 +27,8 @@ static std::pair allocation_window_to_dataspace(const celerity::bu } -static void read_hdf5_file(celerity::distr_queue& q, const celerity::buffer& buffer, const char* file_name) { - q.submit([=](celerity::handler& cgh) { +static void read_hdf5_file(celerity::distr_queue& q, celerity::buffer& buffer, const char* file_name) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{buffer, cgh, celerity::experimental::access::even_split<2>{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(celerity::experimental::collective, [=](celerity::experimental::collective_partition part) { auto plist = H5Pcreate(H5P_FILE_ACCESS); @@ -54,8 +54,8 @@ static void read_hdf5_file(celerity::distr_queue& q, const celerity::buffer& buffer, const char* file_name) { - q.submit([=](celerity::handler& cgh) { +static void write_hdf5_file(celerity::distr_queue& q, celerity::buffer& buffer, const char* file_name) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{buffer, cgh, celerity::experimental::access::even_split<2>{}, celerity::read_only_host_task}; cgh.host_task(celerity::experimental::collective, [=](celerity::experimental::collective_partition part) { auto plist = H5Pcreate(H5P_FILE_ACCESS); @@ -116,7 +116,7 @@ int main(int argc, char* argv[]) { read_hdf5_file(q, in, argv[2]); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{in, cgh, celerity::access::one_to_one{}, celerity::read_only}; celerity::accessor b{out, cgh, transposed, celerity::write_only, celerity::no_init}; cgh.parallel_for(celerity::range<2>{N, N}, [=](celerity::item<2> item) { @@ -139,7 +139,7 @@ int main(int argc, char* argv[]) { read_hdf5_file(q, left, argv[2]); read_hdf5_file(q, right, argv[3]); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{left, cgh, celerity::access::all{}, celerity::read_only_host_task}; celerity::accessor b{right, cgh, celerity::access::all{}, celerity::read_only_host_task}; celerity::accessor e{equal, cgh, celerity::write_only_host_task, celerity::no_init}; diff --git a/examples/matmul/matmul.cc b/examples/matmul/matmul.cc index dce991534..a8a5eb52d 100644 --- a/examples/matmul/matmul.cc +++ b/examples/matmul/matmul.cc @@ -6,7 +6,7 @@ const size_t MAT_SIZE = 1024; template void set_identity(celerity::distr_queue queue, celerity::buffer mat) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor dw{mat, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(mat.get_range(), [=](celerity::item<2> item) { dw[item] = item[0] == item[1]; }); }); @@ -14,7 +14,7 @@ void set_identity(celerity::distr_queue queue, celerity::buffer mat) { template void multiply(celerity::distr_queue queue, celerity::buffer mat_a, celerity::buffer mat_b, celerity::buffer mat_c) { - 
queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor a{mat_a, cgh, celerity::access::slice<2>(1), celerity::read_only}; celerity::accessor b{mat_b, cgh, celerity::access::slice<2>(0), celerity::read_only}; celerity::accessor c{mat_c, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; @@ -48,7 +48,7 @@ void multiply(celerity::distr_queue queue, celerity::buffer mat_a, celerit // TODO this should really reduce into a buffer on the device, but not all backends currently support reductions template void verify(celerity::distr_queue& queue, celerity::buffer mat_c, celerity::experimental::host_object passed_obj) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor c{mat_c, cgh, celerity::access::one_to_one{}, celerity::read_only_host_task}; celerity::experimental::side_effect passed{passed_obj, cgh}; diff --git a/examples/reduction/reduction.cc b/examples/reduction/reduction.cc index 6671463f0..28266b85c 100644 --- a/examples/reduction/reduction.cc +++ b/examples/reduction/reduction.cc @@ -73,7 +73,7 @@ int main(int argc, char* argv[]) { celerity::buffer lab_buf{image_size}; celerity::buffer minmax_buf{celerity::range{1}}; - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::one_to_one{}, celerity::read_only}; celerity::accessor rgb_acc{lab_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; auto minmax_r = celerity::reduction(minmax_buf, cgh, minmax_identity, minmax, celerity::property::reduction::initialize_to_identity{}); @@ -86,12 +86,12 @@ int main(int argc, char* argv[]) { }); }); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor minmax_acc{minmax_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::on_master_node, [=] { printf("Before contrast stretch: min = %f, max = %f\n", minmax_acc[0][0], minmax_acc[0][1]); }); }); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor rgb_acc{lab_buf, cgh, celerity::access::one_to_one{}, celerity::read_only}; celerity::accessor minmax_acc{minmax_buf, cgh, celerity::access::all{}, celerity::read_only}; celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) { }); }); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::on_master_node, [=] { stbi_write_jpg("output.jpg", image_width, image_height, 4, srgb_255_acc.get_pointer(), 90); }); }); diff --git a/examples/syncing/syncing.cc b/examples/syncing/syncing.cc index 4ddb2ccad..84f085cb5 100644 --- a/examples/syncing/syncing.cc +++ b/examples/syncing/syncing.cc @@ -9,14 +9,14 @@ int main(int argc, char* argv[]) { celerity::buffer buf(buf_size); // Initialize buffer in a distributed device kernel - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor b{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(buf.get_range(), [=](celerity::item<1> item) { b[item] = item.get_linear_id(); }); }); // Process values on the host std::vector 
host_buf(buf_size); - queue.submit(celerity::allow_by_ref, [=, &host_buf](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor b{buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::experimental::collective, [=, &host_buf](celerity::experimental::collective_partition) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); // give the synchronization more time to fail diff --git a/examples/wave_sim/wave_sim.cc b/examples/wave_sim/wave_sim.cc index 6be99ab1a..49356ef47 100644 --- a/examples/wave_sim/wave_sim.cc +++ b/examples/wave_sim/wave_sim.cc @@ -6,7 +6,7 @@ #include void setup_wave(celerity::distr_queue& queue, celerity::buffer u, sycl::float2 center, float amplitude, sycl::float2 sigma) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor dw_u{u, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(u.get_range(), [=, c = center, a = amplitude, s = sigma](celerity::item<2> item) { const float dx = item[1] - c.x(); @@ -17,7 +17,7 @@ void setup_wave(celerity::distr_queue& queue, celerity::buffer u, sycl } void zero(celerity::distr_queue& queue, celerity::buffer buf) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor dw_buf{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(buf.get_range(), [=](celerity::item<2> item) { dw_buf[item] = 0.f; }); }); @@ -37,7 +37,7 @@ struct update_config { template void step(celerity::distr_queue& queue, celerity::buffer up, celerity::buffer u, float dt, sycl::float2 delta) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor rw_up{up, cgh, celerity::access::one_to_one{}, celerity::read_write}; celerity::accessor r_u{u, cgh, celerity::access::neighborhood{1, 1}, celerity::read_only}; @@ -64,7 +64,7 @@ void update(celerity::distr_queue& queue, celerity::buffer up, celerit } void stream_open(celerity::distr_queue& queue, size_t N, size_t num_samples, celerity::experimental::host_object os) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::experimental::side_effect os_eff{os, cgh}; cgh.host_task(celerity::on_master_node, [=] { os_eff->open("wave_sim_result.bin", std::ios_base::out | std::ios_base::binary); @@ -77,7 +77,7 @@ void stream_open(celerity::distr_queue& queue, size_t N, size_t num_samples, cel template void stream_append(celerity::distr_queue& queue, celerity::buffer up, celerity::experimental::host_object os) { const auto range = up.get_range(); - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor up_r{up, cgh, celerity::access::all{}, celerity::read_only_host_task}; celerity::experimental::side_effect os_eff{os, cgh}; cgh.host_task(celerity::on_master_node, [=] { os_eff->write(reinterpret_cast(up_r.get_pointer()), range.size() * sizeof(T)); }); @@ -85,7 +85,7 @@ void stream_append(celerity::distr_queue& queue, celerity::buffer up, cele } void stream_close(celerity::distr_queue& queue, celerity::experimental::host_object os) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::experimental::side_effect os_eff{os, cgh}; cgh.host_task(celerity::on_master_node, [=] { os_eff->close(); }); }); diff --git a/include/accessor.h b/include/accessor.h index 
70766d2c5..ba42eef0e 100644 --- a/include/accessor.h +++ b/include/accessor.h @@ -7,6 +7,7 @@ #include "access_modes.h" #include "buffer.h" #include "buffer_storage.h" +#include "closure_hydrator.h" #include "handler.h" #include "sycl_wrappers.h" @@ -153,6 +154,8 @@ class buffer_allocation_window { friend class accessor; }; +#define CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR [[deprecated("Creating accessor from const buffer is deprecated, capture buffer by reference instead")]] + /** * Celerity wrapper around SYCL accessors. * @@ -171,47 +174,70 @@ class accessor : public detail::accessor_base accessor() noexcept = default; template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn) : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} + accessor(buffer& buff, handler& cgh, const Functor& rmfn) : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) + accessor(buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, + accessor(buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, const property::no_init& /* no_init */) : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} - template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, - const property_list& /* prop_list */) { - static_assert(detail::constexpr_false, - "Currently it is not accepted to pass a property list to an accessor constructor. Please use the property celerity::no_init " - "as a last argument in the constructor"); - } - template = 0> - accessor(const buffer& buff, handler& cgh) : accessor(buff, cgh, access::all()) {} + accessor(buffer& buff, handler& cgh) : accessor(buff, cgh, access::all()) {} template = 0> - accessor(const buffer& buff, handler& cgh, const detail::access_tag tag) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag) : accessor(buff, cgh, access::all(), tag) {} template = 0> - accessor(const buffer& buff, handler& cgh, const detail::access_tag tag, const property::no_init& no_init) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag, const property::no_init& no_init) : accessor(buff, cgh, access::all(), tag, no_init) {} template = 0> - accessor( - const buffer& buff, handler& cgh, const detail::access_tag tag, const property_list& prop_list) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag, const property_list& prop_list) : accessor(buff, cgh, access::all(), tag, prop_list) {} + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn) + : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor( + const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) + : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn, + const detail::access_tag /* tag */, const property::no_init& /* no_init */) + : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn, + const detail::access_tag /* tag */, const 
property_list& /* prop_list */) { + static_assert(detail::constexpr_false, + "Currently it is not accepted to pass a property list to an accessor constructor. Please use the property celerity::no_init " + "as a last argument in the constructor"); + } + // explicitly defaulted because we define operator=(value_type) for Dims == 0 - accessor(const accessor&) = default; accessor(accessor&&) noexcept = default; - accessor& operator=(const accessor&) = default; accessor& operator=(accessor&&) noexcept = default; +#if !defined(__SYCL_DEVICE_ONLY__) + accessor(const accessor& other) { copy_and_hydrate(other); } + + accessor& operator=(const accessor& other) { + if(this != &other) { copy_and_hydrate(other); } + return *this; + } +#else + accessor(const accessor&) = default; + accessor& operator=(const accessor&) = default; +#endif + // SYCL allows assigning values to accessors directly in the 0-dimensional case template = 0> @@ -284,38 +310,45 @@ class accessor : public detail::accessor_base CELERITY_DETAIL_NO_UNIQUE_ADDRESS id m_index_offset; CELERITY_DETAIL_NO_UNIQUE_ADDRESS range m_buffer_range = detail::zero_range; + template + accessor(const ctor_internal_tag /* tag */, const buffer& buff, handler& cgh, const Functor& rmfn) { + using range_mapper = detail::range_mapper>; // decay function type to function pointer + const auto hid = detail::add_requirement(cgh, detail::get_buffer_id(buff), std::make_unique(rmfn, Mode, buff.get_range())); + detail::extend_lifetime(cgh, std::move(detail::get_lifetime_extending_state(buff))); + m_device_ptr = detail::embed_hydration_id(hid); + } + // Constructor for tests, called through accessor_testspy. - accessor(DataT* ptr, id index_offset, range buffer_range) : m_device_ptr(ptr), m_index_offset(index_offset), m_buffer_range(buffer_range) { + accessor(DataT* const ptr, const id& index_offset, const range& buffer_range) + : m_device_ptr(ptr), m_index_offset(index_offset), m_buffer_range(buffer_range) { +#if defined(__SYCL_DEVICE_ONLY__) #if CELERITY_WORKAROUND_HIPSYCL // hipSYCL does not yet implement is_device_copyable_v static_assert(std::is_trivially_copyable_v); #else static_assert(sycl::is_device_copyable_v); +#endif #endif } - template - accessor(const ctor_internal_tag /* tag */, const buffer& buff, handler& cgh, const Functor& rmfn) { - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = dynamic_cast(cgh); - using range_mapper = detail::range_mapper>; // decay function type to function pointer - prepass_cgh.add_requirement(detail::get_buffer_id(buff), std::make_unique(rmfn, Mode, buff.get_range())); - } else { - if(detail::get_handler_execution_target(cgh) != detail::execution_target::device) { - throw std::runtime_error( - "Calling accessor constructor with device target is only allowed in parallel_for tasks." - "If you want to access this buffer from within a host task, please specialize the call using one of the *_host_task tags"); + // Constructor for tests, called through accessor_testspy. 
+ accessor(const detail::hydration_id hid, const id& index_offset, const range& buffer_range) + : accessor(detail::embed_hydration_id(hid), index_offset, buffer_range) {} + + void copy_and_hydrate(const accessor& other) { + m_device_ptr = other.m_device_ptr; + m_index_offset = other.m_index_offset; + m_buffer_range = other.m_buffer_range; + +#if !defined(__SYCL_DEVICE_ONLY__) + if(detail::is_embedded_hydration_id(m_device_ptr)) { + if(detail::closure_hydrator::is_available() && detail::closure_hydrator::get_instance().is_hydrating()) { + const auto info = detail::closure_hydrator::get_instance().get_accessor_info(detail::extract_hydration_id(m_device_ptr)); + m_device_ptr = static_cast(info.ptr); + m_index_offset = detail::id_cast(info.buffer_offset); + m_buffer_range = detail::range_cast(info.buffer_range); } - - auto& live_cgh = dynamic_cast(cgh); - // It's difficult to figure out which stored range mapper corresponds to this constructor call, which is why we just call the raw mapper manually. - const auto mapped_sr = live_cgh.apply_range_mapper(rmfn, buff.get_range()); - auto access_info = - detail::runtime::get_instance().get_buffer_manager().access_device_buffer(detail::get_buffer_id(buff), Mode, mapped_sr); - - m_device_ptr = static_cast(access_info.ptr); - m_index_offset = detail::id_cast(access_info.backing_buffer_offset); - m_buffer_range = detail::range_cast(access_info.backing_buffer_range); } +#endif } size_t get_linear_offset(const id& index) const { return detail::get_linear_index(m_buffer_range, index - m_index_offset); } @@ -325,82 +358,81 @@ template class accessor : public detail::accessor_base { friend struct detail::accessor_testspy; + struct ctor_internal_tag {}; + public: static_assert(Mode != access_mode::atomic, "access_mode::atomic is not supported."); accessor() noexcept = default; template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn) { - static_assert(!std::is_same_v>, "The accessor constructor overload for master-access tasks (now called 'host tasks') has " - "been removed with Celerity 0.2.0. Please provide a range mapper instead."); - - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = dynamic_cast(cgh); - prepass_cgh.add_requirement(detail::get_buffer_id(buff), std::make_unique>(rmfn, Mode, buff.get_range())); - } else { - if(detail::get_handler_execution_target(cgh) != detail::execution_target::host) { - throw std::runtime_error( - "Calling accessor constructor with host_buffer target is only allowed in host tasks." - "If you want to access this buffer from within a parallel_for task, please specialize the call using one of the non host tags"); - } - auto& live_cgh = dynamic_cast(cgh); - // It's difficult to figure out which stored range mapper corresponds to this constructor call, which is why we just call the raw mapper - // manually. 
- const auto sr = live_cgh.apply_range_mapper(rmfn, buff.get_range()); - auto access_info = detail::runtime::get_instance().get_buffer_manager().access_host_buffer(detail::get_buffer_id(buff), Mode, sr); - - m_mapped_subrange = sr; - m_index_offset = detail::id_cast(access_info.backing_buffer_offset); - m_buffer_range = detail::range_cast(access_info.backing_buffer_range); - m_virtual_buffer_range = buff.get_range(); - m_host_ptr = static_cast(access_info.ptr); - } - } + accessor(buffer& buff, handler& cgh, const Functor& rmfn) : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) - : accessor(buff, cgh, rmfn) {} + accessor(buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} /** * TODO: As of ComputeCpp 2.5.0 they do not support no_init prop, hence this constructor is needed along with discard deduction guide. * but once they do this should be replace for a constructor that takes a prop list as an argument. */ template > - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, + accessor(buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, const property::no_init& /* no_init */) - : accessor(buff, cgh, rmfn) {} - - template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, - const property_list& /* prop_list */) { - static_assert(detail::constexpr_false, - "Currently it is not accepted to pass a property list to an accessor constructor. Please use the property celerity::no_init " - "as a last argument in the constructor"); - } + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} template = 0> - accessor(const buffer& buff, handler& cgh) : accessor(buff, cgh, access::all()) {} + accessor(buffer& buff, handler& cgh) : accessor(buff, cgh, access::all()) {} template = 0> - accessor(const buffer& buff, handler& cgh, const detail::access_tag tag) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag) : accessor(buff, cgh, access::all(), tag) {} template = 0> - accessor(const buffer& buff, handler& cgh, const detail::access_tag tag, const property::no_init& no_init) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag, const property::no_init& no_init) : accessor(buff, cgh, access::all(), tag, no_init) {} template = 0> - accessor( - const buffer& buff, handler& cgh, const detail::access_tag tag, const property_list& prop_list) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag, const property_list& prop_list) : accessor(buff, cgh, access::all(), tag, prop_list) {} + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn) + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor( + const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} + + /** + * TODO: As of ComputeCpp 2.5.0 they do not support no_init prop, hence this constructor is needed along with discard deduction guide. + * but once they do this should be replace for a constructor that takes a prop list as an argument. 
+ */ + template > + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn, + const detail::access_tag /* tag */, const property::no_init& /* no_init */) + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn, + const detail::access_tag /* tag */, const property_list& /* prop_list */) { + static_assert(detail::constexpr_false, + "Currently it is not accepted to pass a property list to an accessor constructor. Please use the property celerity::no_init " + "as a last argument in the constructor"); + } + // explicitly defaulted because we define operator=(value_type) for Dims == 0 - accessor(const accessor&) = default; accessor(accessor&&) noexcept = default; - accessor& operator=(const accessor&) = default; accessor& operator=(accessor&&) noexcept = default; + accessor(const accessor& other) { copy_and_hydrate(other); } + + accessor& operator=(const accessor& other) { + if(this != &other) { copy_and_hydrate(other); } + return *this; + } + // SYCL allows assigning values to accessors directly in the 0-dimensional case template = 0> @@ -560,15 +592,49 @@ class accessor : public detail::accessor_b // m_host_ptr must be defined *last* for it to overlap with the sequence of range and id members in the 0-dimensional case CELERITY_DETAIL_NO_UNIQUE_ADDRESS DataT* m_host_ptr = nullptr; + template + accessor(ctor_internal_tag /* tag */, const buffer& buff, handler& cgh, const Functor& rmfn) : m_virtual_buffer_range(buff.get_range()) { + using range_mapper = detail::range_mapper>; // decay function type to function pointer + const auto hid = detail::add_requirement(cgh, detail::get_buffer_id(buff), std::make_unique(rmfn, Mode, buff.get_range())); + detail::extend_lifetime(cgh, std::move(detail::get_lifetime_extending_state(buff))); + m_host_ptr = detail::embed_hydration_id(hid); + } + // Constructor for tests, called through accessor_testspy. - accessor(subrange mapped_subrange, DataT* ptr, id backing_buffer_offset, range backing_buffer_range, range virtual_buffer_range) + accessor(const subrange mapped_subrange, DataT* const ptr, const id& backing_buffer_offset, const range& backing_buffer_range, + const range& virtual_buffer_range) : m_mapped_subrange(mapped_subrange), m_index_offset(backing_buffer_offset), m_buffer_range(backing_buffer_range), m_virtual_buffer_range(virtual_buffer_range), m_host_ptr(ptr) {} + // Constructor for tests, called through accessor_testspy. 
+ accessor(const subrange& mapped_subrange, const detail::hydration_id hid, const id& backing_buffer_offset, + const range& backing_buffer_range, range virtual_buffer_range) + : accessor(mapped_subrange, detail::embed_hydration_id(hid), backing_buffer_offset, backing_buffer_range, virtual_buffer_range) {} + + void copy_and_hydrate(const accessor& other) { + m_mapped_subrange = other.m_mapped_subrange; + m_host_ptr = other.m_host_ptr; + m_index_offset = other.m_index_offset; + m_buffer_range = other.m_buffer_range; + m_virtual_buffer_range = other.m_virtual_buffer_range; + + if(detail::is_embedded_hydration_id(m_host_ptr)) { + if(detail::closure_hydrator::is_available() && detail::closure_hydrator::get_instance().is_hydrating()) { + const auto info = detail::closure_hydrator::get_instance().get_accessor_info(detail::extract_hydration_id(m_host_ptr)); + m_host_ptr = static_cast(info.ptr); + m_index_offset = detail::id_cast(info.buffer_offset); + m_buffer_range = detail::range_cast(info.buffer_range); + m_mapped_subrange = detail::subrange_cast(info.accessor_sr); + } + } + } + size_t get_linear_offset(const id& index) const { return detail::get_linear_index(m_buffer_range, index - m_index_offset); } }; +#undef CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR +// TODO: Make buffer non-const once corresponding (deprecated!) constructor overloads are removed template accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag tag) -> accessor; @@ -597,6 +663,7 @@ class local_accessor { friend struct detail::accessor_testspy; static_assert(Dims <= 3); + friend struct detail::accessor_testspy; private: constexpr static int sycl_dims = std::max(1, Dims); @@ -618,17 +685,15 @@ class local_accessor { template > local_accessor(handler& cgh) : local_accessor(range<0>(), cgh) {} -#if !defined(__SYCL_DEVICE_ONLY__) && !defined(SYCL_DEVICE_ONLY) - local_accessor(const range& allocation_size, handler& cgh) : m_sycl_acc{}, m_allocation_size(allocation_size) { - if(!detail::is_prepass_handler(cgh)) { - auto& device_handler = dynamic_cast(cgh); - m_eventual_sycl_cgh = device_handler.get_eventual_sycl_cgh(); - } - } +#if !defined(__SYCL_DEVICE_ONLY__) + local_accessor(const range& allocation_size, handler& cgh) : m_sycl_acc{}, m_allocation_size(allocation_size) {} local_accessor(const local_accessor& other) - : m_sycl_acc(other.sycl_cgh() ? sycl_accessor{other.sycl_allocation_size(), *other.sycl_cgh()} : other.m_sycl_acc), - m_allocation_size(other.m_allocation_size), m_eventual_sycl_cgh(other.sycl_cgh() ? nullptr : other.m_eventual_sycl_cgh) {} + : m_sycl_acc( + detail::closure_hydrator::is_available() && detail::closure_hydrator::get_instance().is_hydrating() && other.sycl_allocation_size().size() > 0 + ? sycl_accessor{other.sycl_allocation_size(), detail::closure_hydrator::get_instance().get_sycl_handler()} + : other.m_sycl_acc), + m_allocation_size(other.m_allocation_size) {} #else local_accessor(const range& allocation_size, handler& cgh); local_accessor(const local_accessor&) = default; @@ -696,12 +761,11 @@ class local_accessor { private: sycl_accessor m_sycl_acc; CELERITY_DETAIL_NO_UNIQUE_ADDRESS range m_allocation_size; - cl::sycl::handler* const* m_eventual_sycl_cgh = nullptr; - // TODO after multi-pass removal: verify that sizeof(celerity::local_accessor) == sizeof(sycl::local_accessor) in accessor_tests (currently [!shouldfail]) - - cl::sycl::handler* sycl_cgh() const { return m_eventual_sycl_cgh != nullptr ? 
*m_eventual_sycl_cgh : nullptr; } sycl::range sycl_allocation_size() const { return sycl::range(detail::range_cast(m_allocation_size)); } + + // Constructor for tests, called through accessor_testspy. + explicit local_accessor(const range& allocation_size) : m_sycl_acc{}, m_allocation_size(allocation_size) {} }; } // namespace celerity diff --git a/include/buffer.h b/include/buffer.h index a9ed29862..38f933a40 100644 --- a/include/buffer.h +++ b/include/buffer.h @@ -6,6 +6,7 @@ #include #include "buffer_manager.h" +#include "lifetime_extending_state.h" #include "range_mapper.h" #include "ranges.h" #include "runtime.h" @@ -36,17 +37,14 @@ template class accessor; template -class buffer { +class buffer final : public detail::lifetime_extending_state_wrapper { public: static_assert(Dims <= 3); template > buffer() : buffer(nullptr, {}) {} - explicit buffer(const DataT* host_ptr, range range) { - if(!detail::runtime::is_initialized()) { detail::runtime::init(nullptr, nullptr); } - m_impl = std::make_shared(range, host_ptr); - } + explicit buffer(const DataT* host_ptr, range range) : m_impl(std::make_shared(range, host_ptr)) {} explicit buffer(range range) : buffer(nullptr, range) {} @@ -62,35 +60,52 @@ class buffer { ~buffer() {} template 0), int> = 0> - accessor get_access(handler& cgh, Functor rmfn) const { + accessor get_access(handler& cgh, Functor rmfn) { return get_access(cgh, rmfn); } template = 0> - accessor get_access(handler& cgh) const { + accessor get_access(handler& cgh) { return get_access(cgh); } template 0), int> = 0> - accessor get_access(handler& cgh, Functor rmfn) const { + accessor get_access(handler& cgh, Functor rmfn) { return accessor(*this, cgh, rmfn); } template = 0> - accessor get_access(handler& cgh) const { + accessor get_access(handler& cgh) { return accessor(*this, cgh); } + template 0), int> = 0> + [[deprecated("Calling get_access on a const buffer is deprecated")]] accessor get_access( + handler& cgh, Functor rmfn) const { + return get_access(cgh, rmfn); + } + + template 0), int> = 0> + [[deprecated("Calling get_access on a const buffer is deprecated")]] accessor get_access(handler& cgh, Functor rmfn) const { + return accessor(*this, cgh, rmfn); + } + const range& get_range() const { return m_impl->range; } + protected: + std::shared_ptr get_lifetime_extending_state() const override { return m_impl; } + private: - struct impl { + struct impl final : public detail::lifetime_extending_state { impl(range rng, const DataT* host_init_ptr) : range(rng) { + if(!detail::runtime::is_initialized()) { detail::runtime::init(nullptr, nullptr); } id = detail::runtime::get_instance().get_buffer_manager().register_buffer(detail::range_cast<3>(range), host_init_ptr); } impl(const impl&) = delete; impl(impl&&) = delete; - ~impl() noexcept { detail::runtime::get_instance().get_buffer_manager().unregister_buffer(id); } + impl& operator=(const impl&) = delete; + impl& operator=(impl&&) = delete; + ~impl() override { detail::runtime::get_instance().get_buffer_manager().unregister_buffer(id); } detail::buffer_id id; celerity::range range; std::string debug_name; diff --git a/include/closure_hydrator.h b/include/closure_hydrator.h new file mode 100644 index 000000000..554ebc7a0 --- /dev/null +++ b/include/closure_hydrator.h @@ -0,0 +1,157 @@ +#pragma once + +#include +#include +#include + +#include "log.h" +#include "ranges.h" +#include "sycl_wrappers.h" +#include "types.h" + +namespace celerity::detail { + +// To avoid additional register pressure, we embed hydration IDs into pointers 
for +// accessors, with the assumption that a real pointer will never be in the +// range [0, max_hydration_id]. Embedding / extracting are currently no-ops +// and the associated helper functions only exist for documentation purposes. +// This number puts an effective limit on the number of task objects (accessors +// etc.) that can be captured into a command function. +constexpr size_t max_hydration_id = 128; + +template +constexpr bool can_embed_hydration_id = std::bool_constant::value; + +template +T embed_hydration_id(const hydration_id hid) { + static_assert(can_embed_hydration_id); + assert(hid > 0); // Has to be greater than zero so nullptr is not considered an embedded id + assert(hid <= max_hydration_id); + T result; + std::memcpy(&result, &hid, sizeof(hid)); + return result; +} + +template +hydration_id extract_hydration_id(const T value) { + static_assert(can_embed_hydration_id); + hydration_id result; + std::memcpy(&result, &value, sizeof(value)); + return result; +} + +template +bool is_embedded_hydration_id(const T value) { + static_assert(can_embed_hydration_id); + const auto hid = extract_hydration_id(value); + return hid > 0 && hid <= max_hydration_id; +} + +/** + * The closure hydrator is used to inject information into objects (currently host/device and local accessors) that have been captured into a lambda closure. + * We abuse the copy constructor of the captured objects to modify them while the containing closure is being copied by the hydrate() function. + * Accessors request their corresponding information by means of per-closure unique "hydration ids" that are assigned upon accessor creation. + * + * The hydrator is implemented as a thread-local singleton that needs to be explicitly enabled per-thread. This is because kernel command function + * closures may be copied any number of times after having been passed to SYCL, which should not trigger the hydration mechanism. + */ +class closure_hydrator { + public: + struct accessor_info { + void* ptr; + range<3> buffer_range; + id<3> buffer_offset; + subrange<3> accessor_sr; + }; + + closure_hydrator(const closure_hydrator&) = delete; + closure_hydrator(closure_hydrator&&) = delete; + closure_hydrator& operator=(const closure_hydrator&) = delete; + closure_hydrator& operator=(closure_hydrator&&) = delete; + ~closure_hydrator() = default; + + static void make_available() { + assert(m_instance == nullptr); + m_instance = std::unique_ptr(new closure_hydrator()); + } + + static bool is_available() { return m_instance != nullptr; } + + static void teardown() { m_instance.reset(); } + + static closure_hydrator& get_instance() { + assert(m_instance != nullptr); + return *m_instance; + } + + /** + * Puts the hydrator into the "armed" state, after which hydrate() can be called to hydrate kernel functions. + * + * accessor_infos must contain one entry for each hydration id that has been assigned to accessors in the + * closure that is to be hydrated, in matching order. + */ + void arm(const target tgt, std::vector accessor_infos) { + assert(!m_armed_for.has_value()); + assert(accessor_infos.size() < max_hydration_id); + m_armed_for = tgt; + m_accessor_infos = std::move(accessor_infos); + } + + /** + * Hydrates the provided closure by copying it in a context where calls to get_accessor_info and get_sycl_handler are allowed. 
+ */ + template = 0> + [[nodiscard]] auto hydrate(sycl::handler& cgh, const Closure& closure) { + return hydrate(target::device, &cgh, closure); + } + + /** + * Hydrates the provided closure by copying it in a context where calls to get_accessor_info are allowed. + */ + template = 0> + [[nodiscard]] auto hydrate(const Closure& closure) { + return hydrate(target::host_task, nullptr, closure); + } + + bool is_hydrating() const { return m_is_hydrating; } + + template + accessor_info get_accessor_info(const hydration_id hid) { + assert(m_armed_for.has_value() && *m_armed_for == Tgt); + assert(!m_accessor_infos.empty()); + assert(hid > 0); + assert(hid <= m_accessor_infos.size()); + return m_accessor_infos[hid - 1]; + } + + sycl::handler& get_sycl_handler() { + assert(m_sycl_cgh != nullptr); + return *m_sycl_cgh; + } + + private: + inline static thread_local std::unique_ptr m_instance; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) + std::vector m_accessor_infos; + std::optional m_armed_for = std::nullopt; + bool m_is_hydrating = false; + sycl::handler* m_sycl_cgh = nullptr; + + closure_hydrator() = default; + + template + [[nodiscard]] auto hydrate(const target tgt, sycl::handler* cgh, const Closure& closure) { + static_assert(std::is_copy_constructible_v>); + assert(m_armed_for.has_value() && *m_armed_for == tgt); + assert(tgt == target::host_task || cgh != nullptr); + m_sycl_cgh = cgh; + m_is_hydrating = true; + Closure hydrated{closure}; + m_is_hydrating = false; + m_sycl_cgh = nullptr; + m_accessor_infos.clear(); + m_armed_for = std::nullopt; + return hydrated; + } +}; + +}; // namespace celerity::detail \ No newline at end of file diff --git a/include/distr_queue.h b/include/distr_queue.h index 53d8992e5..7a00e8f65 100644 --- a/include/distr_queue.h +++ b/include/distr_queue.h @@ -15,14 +15,14 @@ namespace detail { ~distr_queue_tracker() { runtime::get_instance().shutdown(); } }; - template - constexpr bool is_safe_cgf = std::is_standard_layout::value; - } // namespace detail -struct allow_by_ref_t {}; +struct [[deprecated("This tag type is no longer required to capture by reference")]] allow_by_ref_t{}; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" inline constexpr allow_by_ref_t allow_by_ref{}; +#pragma GCC diagnostic pop class distr_queue { public: @@ -51,28 +51,19 @@ class distr_queue { * Submits a command group to the queue. * * Invoke via `q.submit(celerity::allow_by_ref, [&](celerity::handler &cgh) {...})`. - * - * With this overload, CGF may capture by-reference. This may lead to lifetime issues with asynchronous execution, so using the `submit(cgf)` overload is - * preferred in the common case. */ template - void submit(allow_by_ref_t, CGF cgf) { // NOLINT(readability-convert-member-functions-to-static) - // (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.) - detail::runtime::get_instance().get_task_manager().submit_command_group(std::move(cgf)); + [[deprecated("This overload is no longer required to capture by reference")]] void submit(allow_by_ref_t /* tag */, CGF cgf) { + submit(std::move(cgf)); } /** * Submits a command group to the queue. - * - * CGF must not capture by reference. This is a conservative safety check to avoid lifetime issues when command groups are executed asynchronously. - * - * If you know what you are doing, you can use the `allow_by_ref` overload of `submit` to bypass this check. 
*/ template - void submit(CGF cgf) { - static_assert(detail::is_safe_cgf, "The provided command group function is not multi-pass execution safe. Please make sure to only capture by " - "value. If you know what you're doing, use submit(celerity::allow_by_ref, ...)."); - submit(allow_by_ref, std::move(cgf)); + void submit(CGF cgf) { // NOLINT(readability-convert-member-functions-to-static) + // (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.) + detail::runtime::get_instance().get_task_manager().submit_command_group(std::move(cgf)); } /** diff --git a/include/handler.h b/include/handler.h index 2ed53e644..7a472a585 100644 --- a/include/handler.h +++ b/include/handler.h @@ -1,13 +1,16 @@ #pragma once +#include #include #include #include +#include #include #include #include "buffer.h" +#include "closure_hydrator.h" #include "device_queue.h" #include "host_queue.h" #include "item.h" @@ -30,10 +33,13 @@ class handler; namespace detail { class device_queue; class task_manager; - class prepass_handler; - inline bool is_prepass_handler(const handler& cgh); - inline execution_target get_handler_execution_target(const handler& cgh); + handler make_command_group_handler(const task_id tid, const size_t num_collective_nodes); + std::unique_ptr into_task(handler&& cgh); + hydration_id add_requirement(handler& cgh, const buffer_id bid, std::unique_ptr rm); + void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order); + void add_reduction(handler& cgh, const reduction_info& rinfo); + void extend_lifetime(handler& cgh, std::shared_ptr state); template std::string kernel_debug_name() { @@ -171,10 +177,132 @@ namespace experimental { inline constexpr collective_tag_factory collective; } // namespace experimental +namespace detail { + template + inline void invoke_kernel(const Kernel& kernel, const sycl::id& s_id, const range& global_range, const id& global_offset, + const id& chunk_offset, Reducers&... reducers) { + kernel(make_item(id_cast(id(s_id)) + chunk_offset, global_offset, global_range), reducers...); + } + + template + inline void invoke_kernel(const Kernel& kernel, const cl::sycl::nd_item& s_item, const range& global_range, + const id& global_offset, const id& chunk_offset, const range& group_range, const id& group_offset, Reducers&... reducers) { + kernel(make_nd_item(s_item, global_range, global_offset, chunk_offset, group_range, group_offset), reducers...); + } + + template + auto bind_simple_kernel(const Kernel& kernel, const range& global_range, const id& global_offset, const id& chunk_offset) { + return [=](auto s_item_or_id, auto&... reducers) { + constexpr int sycl_dims = std::max(1, Dims); + static_assert(std::is_invocable_v, decltype(reducers)...>, + "Kernel function must be invocable with celerity::item and as many reducer objects as reductions passed to parallel_for"); + if constexpr(CELERITY_WORKAROUND(DPCPP) && std::is_same_v, decltype(s_item_or_id)>) { + // CELERITY_WORKAROUND_LESS_OR_EQUAL: DPC++ passes a sycl::id instead of a sycl::item to kernels alongside reductions + invoke_kernel(kernel, s_item_or_id, global_range, global_offset, chunk_offset, reducers...); + } else { + // Explicit item constructor: ComputeCpp does not pass a sycl::item, but an implicitly convertible sycl::item_base (?) 
which does not have + // `sycl::id<> get_id()` + invoke_kernel(kernel, cl::sycl::item{s_item_or_id}.get_id(), global_range, global_offset, chunk_offset, reducers...); + } + }; + } + + template + auto bind_nd_range_kernel(const Kernel& kernel, const range& global_range, const id& global_offset, const id chunk_offset, + const range& group_range, const id& group_offset) { + return [=](sycl::nd_item s_item, auto&... reducers) { + static_assert(std::is_invocable_v, decltype(reducers)...>, + "Kernel function must be invocable with celerity::nd_item or and as many reducer objects as reductions passed to parallel_for"); + invoke_kernel(kernel, s_item, global_range, global_offset, chunk_offset, group_range, group_offset, reducers...); + }; + } + + template + inline void invoke_sycl_parallel_for(cl::sycl::handler& cgh, Params&&... args) { + static_assert(CELERITY_FEATURE_UNNAMED_KERNELS || !is_unnamed_kernel, + "Your SYCL implementation does not support unnamed kernels, add a kernel name template parameter to this parallel_for invocation"); + if constexpr(detail::is_unnamed_kernel) { +#if CELERITY_FEATURE_UNNAMED_KERNELS // see static_assert above + cgh.parallel_for(std::forward(args)...); +#endif + } else { + cgh.parallel_for>(std::forward(args)...); + } + } + + template + class reduction_descriptor; + + template + auto make_sycl_reduction(const reduction_descriptor& d, void* ptr, const bool is_initializer) { +#if !CELERITY_FEATURE_SCALAR_REDUCTIONS + static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); +#else + cl::sycl::property_list props; + if(!d.m_include_current_buffer_value || !is_initializer) { props = {cl::sycl::property::reduction::initialize_to_identity{}}; } + if constexpr(WithExplicitIdentity) { + return sycl::reduction(static_cast(ptr), d.m_identity, d.m_op, props); + } else { + return sycl::reduction(static_cast(ptr), d.m_op, props); + } +#endif + } + + template + class reduction_descriptor { + public: + reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT /* identity */, bool include_current_buffer_value) + : m_bid(bid), m_op(combiner), m_include_current_buffer_value(include_current_buffer_value) {} + + private: + friend auto make_sycl_reduction(const reduction_descriptor&, void*, const bool); + + buffer_id m_bid; + BinaryOperation m_op; + bool m_include_current_buffer_value; + }; + + template + class reduction_descriptor { + public: + reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT identity, bool include_current_buffer_value) + : m_bid(bid), m_op(combiner), m_identity(identity), m_include_current_buffer_value(include_current_buffer_value) {} + + private: + friend auto make_sycl_reduction(const reduction_descriptor&, void*, const bool); + + buffer_id m_bid; + BinaryOperation m_op; + DataT m_identity{}; + bool m_include_current_buffer_value; + }; + + template + auto make_reduction(const buffer& vars, handler& cgh, BinaryOperation op, DataT identity, const cl::sycl::property_list& prop_list) { +#if !CELERITY_FEATURE_SCALAR_REDUCTIONS + static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); +#else + if(vars.get_range().size() != 1) { + // Like SYCL 2020, Celerity only supports reductions to unit-sized buffers. This allows us to avoid tracking different parts of the buffer + // as distributed_state and pending_reduction_state. 
+ throw std::runtime_error("Only unit-sized buffers can be reduction targets"); + } + + const auto bid = detail::get_buffer_id(vars); + const auto include_current_buffer_value = !prop_list.has_property(); + + const auto rid = detail::runtime::get_instance().get_reduction_manager().create_reduction(bid, op, identity); + add_reduction(cgh, reduction_info{rid, bid, include_current_buffer_value}); + extend_lifetime(cgh, std::move(get_lifetime_extending_state(vars))); + + return detail::reduction_descriptor{bid, op, identity, include_current_buffer_value}; +#endif + } + +} // namespace detail + class handler { public: - virtual ~handler() = default; - template void parallel_for(range global_range, ReductionsAndKernel... reductions_and_kernel) { static_assert(sizeof...(reductions_and_kernel) > 0, "No kernel given"); @@ -208,7 +336,10 @@ class handler { * may require different lifetimes for captures. See `celerity::allow_by_ref` for more information on this topic. */ template - void host_task(on_master_node_tag, Functor kernel); + void host_task(on_master_node_tag /* tag */, Functor kernel) { + auto launcher = make_host_task_launcher<0, false>(detail::zero_range, 0, kernel); + create_master_node_task(std::move(launcher)); + } /** * Schedules `kernel` to be executed collectively on all nodes participating in the specified collective group. Call via @@ -224,7 +355,11 @@ class handler { * and all host kernel invocations are executed in a single thread on each host. */ template - void host_task(experimental::collective_tag tag, Functor kernel); + void host_task(experimental::collective_tag tag, Functor kernel) { + // FIXME: We should not have to know how the global range is determined for collective tasks to create the launcher + auto launcher = make_host_task_launcher<1, true>(range<3>{m_num_collective_nodes, 1, 1}, tag.m_cgid, kernel); + create_collective_task(tag.m_cgid, std::move(launcher)); + } /** * Schedules a distributed execution of `kernel` by splitting the iteration space in a runtime-defined manner. The kernel is assumed to be invocable @@ -239,7 +374,11 @@ class handler { * another node. If you need guarantees about execution order */ template - void host_task(range global_range, id global_offset, Functor kernel); + void host_task(range global_range, id global_offset, Functor kernel) { + const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), {1, 1, 1}}; + auto launcher = make_host_task_launcher(detail::range_cast<3>(global_range), 0, kernel); + create_host_compute_task(geometry, std::move(launcher)); + } /** * Like `host_task(range global_range, id global_offset, Functor kernel)`, but with a `global_offset` of zero. 
@@ -249,17 +388,25 @@ class handler { host_task(global_range, {}, task); } - protected: - friend bool detail::is_prepass_handler(const handler& cgh); - friend detail::execution_target detail::get_handler_execution_target(const handler& cgh); - - handler() = default; - - virtual bool is_prepass() const = 0; - - virtual const detail::task& get_task() const = 0; - private: + friend handler detail::make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes); + friend std::unique_ptr detail::into_task(handler&& cgh); + friend detail::hydration_id detail::add_requirement(handler& cgh, const detail::buffer_id bid, std::unique_ptr rm); + friend void detail::add_requirement(handler& cgh, const detail::host_object_id hoid, const experimental::side_effect_order order); + friend void detail::add_reduction(handler& cgh, const detail::reduction_info& rinfo); + friend void detail::extend_lifetime(handler& cgh, std::shared_ptr state); + + detail::task_id m_tid; + detail::buffer_access_map m_access_map; + detail::side_effect_map m_side_effects; + detail::reduction_set m_reductions; + std::unique_ptr m_task = nullptr; + size_t m_num_collective_nodes; + detail::hydration_id m_next_accessor_hydration_id = 1; + std::vector> m_attached_state; + + handler(detail::task_id tid, size_t num_collective_nodes) : m_tid(tid), m_num_collective_nodes(num_collective_nodes) {} + template void parallel_for_reductions_and_kernel(range global_range, id global_offset, typename detail::kernel_flavor_traits::local_size_type local_size, std::index_sequence indices, @@ -272,399 +419,212 @@ class handler { template void parallel_for_kernel_and_reductions(range global_range, id global_offset, - typename detail::kernel_flavor_traits::local_size_type local_range, Kernel& kernel, Reductions&... reductions); -}; - -namespace detail { - - inline bool is_prepass_handler(const handler& cgh) { return cgh.is_prepass(); } - inline execution_target get_handler_execution_target(const handler& cgh) { return cgh.get_task().get_execution_target(); } - - class prepass_handler final : public handler { - public: - explicit prepass_handler(task_id tid, std::unique_ptr cgf, size_t num_collective_nodes) - : m_tid(tid), m_cgf(std::move(cgf)), m_num_collective_nodes(num_collective_nodes) {} - - void add_requirement(buffer_id bid, std::unique_ptr rm) { - assert(m_task == nullptr); - m_access_map.add_access(bid, std::move(rm)); - } - - void add_requirement(const host_object_id hoid, const experimental::side_effect_order order) { - assert(m_task == nullptr); - m_side_effects.add_side_effect(hoid, order); - } - - void add_reduction(const reduction_info& rinfo) { m_reductions.push_back(rinfo); } - - void create_host_compute_task(task_geometry geometry) { - assert(m_task == nullptr); - if(geometry.global_size.size() == 0) { - // TODO this can be easily supported by not creating a task in case the execution range is empty - throw std::runtime_error{"The execution range of distributed host tasks must have at least one item"}; - } - m_task = - detail::task::make_host_compute(m_tid, geometry, std::move(m_cgf), std::move(m_access_map), std::move(m_side_effects), std::move(m_reductions)); + typename detail::kernel_flavor_traits::local_size_type local_range, Kernel& kernel, Reductions&... 
reductions) { + if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) { + static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); } - void create_device_compute_task(task_geometry geometry, std::string debug_name) { - assert(m_task == nullptr); - if(geometry.global_size.size() == 0) { - // TODO unless reductions are involved, this can be easily supported by not creating a task in case the execution range is empty. - // Edge case: If the task includes reductions that specify property::reduction::initialize_to_identity, we need to create a task that sets - // the buffer state to an empty pending_reduction_state in the graph_generator. This will cause a trivial reduction_command to be generated on - // each node that reads from the reduction output buffer, initializing it to the identity value locally. - throw std::runtime_error{"The execution range of device tasks must have at least one item"}; + range<3> granularity = {1, 1, 1}; + if constexpr(detail::kernel_flavor_traits::has_local_size) { + for(int d = 0; d < Dims; ++d) { + granularity[d] = local_range[d]; } - if(!m_side_effects.empty()) { throw std::runtime_error{"Side effects cannot be used in device kernels"}; } - m_task = - detail::task::make_device_compute(m_tid, geometry, std::move(m_cgf), std::move(m_access_map), std::move(m_reductions), std::move(debug_name)); } + const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), granularity}; + auto launcher = make_device_kernel_launcher( + global_range, global_offset, local_range, kernel, std::index_sequence_for(), reductions...); + create_device_compute_task(geometry, detail::kernel_debug_name(), std::move(launcher)); + } - void create_collective_task(collective_group_id cgid) { - assert(m_task == nullptr); - m_task = detail::task::make_collective(m_tid, cgid, m_num_collective_nodes, std::move(m_cgf), std::move(m_access_map), std::move(m_side_effects)); - } + [[nodiscard]] detail::hydration_id add_requirement(const detail::buffer_id bid, std::unique_ptr rm) { + assert(m_task == nullptr); + m_access_map.add_access(bid, std::move(rm)); + return m_next_accessor_hydration_id++; + } - void create_master_node_task() { - assert(m_task == nullptr); - m_task = detail::task::make_master_node(m_tid, std::move(m_cgf), std::move(m_access_map), std::move(m_side_effects)); - } + void add_requirement(const detail::host_object_id hoid, const experimental::side_effect_order order) { + assert(m_task == nullptr); + m_side_effects.add_side_effect(hoid, order); + } - std::unique_ptr into_task() && { return std::move(m_task); } + void add_reduction(const detail::reduction_info& rinfo) { + assert(m_task == nullptr); + m_reductions.push_back(rinfo); + } - protected: - bool is_prepass() const override { return true; } + void extend_lifetime(std::shared_ptr state) { m_attached_state.emplace_back(std::move(state)); } - const class task& get_task() const override { - assert(m_task != nullptr); - return *m_task; + void create_host_compute_task(detail::task_geometry geometry, std::unique_ptr launcher) { + assert(m_task == nullptr); + if(geometry.global_size.size() == 0) { + // TODO this can be easily supported by not creating a task in case the execution range is empty + throw std::runtime_error{"The execution range of distributed host tasks must have at least one item"}; } + m_task = + detail::task::make_host_compute(m_tid, geometry, std::move(launcher), std::move(m_access_map), 
std::move(m_side_effects), std::move(m_reductions)); + } - private: - task_id m_tid; - std::unique_ptr m_cgf; - buffer_access_map m_access_map; - side_effect_map m_side_effects; - reduction_set m_reductions; - std::unique_ptr m_task = nullptr; - size_t m_num_collective_nodes; - }; - - class live_pass_handler : public handler { - public: - bool is_prepass() const final { return false; } - - const class task& get_task() const final { return *m_task; } - - template - subrange apply_range_mapper(RangeMapper rm, const range& buffer_range) const { - return invoke_range_mapper(m_task->get_dimensions(), rm, chunk{m_sr.offset, m_sr.range, m_task->get_global_size()}, buffer_range); + void create_device_compute_task(detail::task_geometry geometry, std::string debug_name, std::unique_ptr launcher) { + assert(m_task == nullptr); + if(geometry.global_size.size() == 0) { + // TODO unless reductions are involved, this can be easily supported by not creating a task in case the execution range is empty. + // Edge case: If the task includes reductions that specify property::reduction::initialize_to_identity, we need to create a task that sets + // the buffer state to an empty pending_reduction_state in the graph_generator. This will cause a trivial reduction_command to be generated on + // each node that reads from the reduction output buffer, initializing it to the identity value locally. + throw std::runtime_error{"The execution range of device tasks must have at least one item"}; } + if(!m_side_effects.empty()) { throw std::runtime_error{"Side effects cannot be used in device kernels"}; } + m_task = + detail::task::make_device_compute(m_tid, geometry, std::move(launcher), std::move(m_access_map), std::move(m_reductions), std::move(debug_name)); + } - subrange<3> get_iteration_range() { return m_sr; } - - bool is_reduction_initializer() const { return m_initialize_reductions; } - - protected: - live_pass_handler(const class task* task, subrange<3> sr, bool initialize_reductions) - : m_task(task), m_sr(sr), m_initialize_reductions(initialize_reductions) {} + void create_collective_task(detail::collective_group_id cgid, std::unique_ptr launcher) { + assert(m_task == nullptr); + m_task = detail::task::make_collective(m_tid, cgid, m_num_collective_nodes, std::move(launcher), std::move(m_access_map), std::move(m_side_effects)); + } - const class task* m_task = nullptr; + void create_master_node_task(std::unique_ptr launcher) { + assert(m_task == nullptr); + m_task = detail::task::make_master_node(m_tid, std::move(launcher), std::move(m_access_map), std::move(m_side_effects)); + } - // The subrange, when combined with the tasks global size, defines the chunk this handler executes. - subrange<3> m_sr; + template + auto make_device_kernel_launcher(const range& global_range, const id& global_offset, + typename detail::kernel_flavor_traits::local_size_type local_range, Kernel kernel, + std::index_sequence /* indices */, Reductions... 
reductions) { + static_assert(std::is_copy_constructible_v>, "Kernel functor must be copyable"); // Required for hydration + + auto fn = [=](detail::device_queue& q, const subrange<3> execution_sr, const std::vector& reduction_ptrs, const bool is_reduction_initializer) { + return q.submit([&](sycl::handler& cgh) { + constexpr int sycl_dims = std::max(1, Dims); + // Copy once to hydrate accessors + auto hydrated_kernel = detail::closure_hydrator::get_instance().hydrate(cgh, kernel); + if constexpr(std::is_same_v) { + const auto sycl_global_range = sycl::range(detail::range_cast(execution_sr.range)); + detail::invoke_sycl_parallel_for(cgh, sycl_global_range, + detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices], is_reduction_initializer)..., + detail::bind_simple_kernel(hydrated_kernel, global_range, global_offset, detail::id_cast(execution_sr.offset))); + } else if constexpr(std::is_same_v) { + const auto sycl_global_range = sycl::range(detail::range_cast(execution_sr.range)); + const auto sycl_local_range = sycl::range(detail::range_cast(local_range)); + detail::invoke_sycl_parallel_for(cgh, cl::sycl::nd_range{sycl_global_range, sycl_local_range}, + detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices], is_reduction_initializer)..., + detail::bind_nd_range_kernel(hydrated_kernel, global_range, global_offset, detail::id_cast(execution_sr.offset), + global_range / local_range, detail::id_cast(execution_sr.offset) / local_range)); + } else { + static_assert(detail::constexpr_false); + } + }); + }; - bool m_initialize_reductions; - }; + return std::make_unique>(std::move(fn)); + } - class live_pass_host_handler final : public live_pass_handler { - public: - live_pass_host_handler(const class task* task, subrange<3> sr, bool initialize_reductions, host_queue& queue) - : live_pass_handler(task, sr, initialize_reductions), m_queue(&queue) {} + template + auto make_host_task_launcher(const range<3>& global_range, const detail::collective_group_id cgid, Kernel kernel) { + static_assert(std::is_copy_constructible_v>, "Kernel functor must be copyable"); // Required for hydration + static_assert(Dims >= 0); - template - void schedule(Kernel kernel) { - static_assert(Dims >= 0); - m_future = m_queue->submit(m_task->get_collective_group_id(), [kernel, global_size = m_task->get_global_size(), sr = m_sr](MPI_Comm) { + auto fn = [kernel, cgid, global_range](detail::host_queue& q, const subrange<3>& execution_sr) { + auto hydrated_kernel = detail::closure_hydrator::get_instance().hydrate(kernel); + return q.submit(cgid, [hydrated_kernel, global_range, execution_sr](MPI_Comm comm) { + (void)global_range; if constexpr(Dims > 0) { - const auto part = make_partition(range_cast(global_size), subrange_cast(sr)); - kernel(part); + if constexpr(Collective) { + static_assert(Dims == 1); + const auto part = detail::make_collective_partition(detail::range_cast<1>(global_range), detail::subrange_cast<1>(execution_sr), comm); + hydrated_kernel(part); + } else { + const auto part = detail::make_partition(detail::range_cast(global_range), detail::subrange_cast(execution_sr)); + hydrated_kernel(part); + } } else if constexpr(std::is_invocable_v&>) { - (void)sr; - const auto part = make_0d_partition(); - kernel(part); + (void)execution_sr; + const auto part = detail::make_0d_partition(); + hydrated_kernel(part); } else { - (void)sr; - kernel(); + (void)execution_sr; + hydrated_kernel(); } }); - } - - template - void schedule_collective(Kernel kernel) { - m_future = 
m_queue->submit(m_task->get_collective_group_id(), [kernel, global_size = m_task->get_global_size(), sr = m_sr](MPI_Comm comm) { - const auto part = make_collective_partition(range_cast<1>(global_size), subrange_cast<1>(sr), comm); - kernel(part); - }); - } - - std::future into_future() { return std::move(m_future); } - - private: - host_queue* m_queue; - std::future m_future; - }; - - template - inline void invoke_kernel(const Kernel& kernel, const sycl::id& s_id, const range& global_range, const id& global_offset, - const id& chunk_offset, Reducers&... reducers) { - kernel(make_item(id_cast(id(s_id)) + chunk_offset, global_offset, global_range), reducers...); - } - - template - inline void invoke_kernel(const Kernel& kernel, const cl::sycl::nd_item& s_item, const range& global_range, - const id& global_offset, const id& chunk_offset, const range& group_range, const id& group_offset, Reducers&... reducers) { - kernel(make_nd_item(s_item, global_range, global_offset, chunk_offset, group_range, group_offset), reducers...); - } - - template - auto bind_simple_kernel(const Kernel& kernel, const range& global_range, const id& global_offset, const id& chunk_offset) { - // The current mechanism for hydrating the SYCL placeholder accessors inside Celerity accessors requires that the kernel functor - // capturing those accessors is copied at least once during submission (see also live_pass_device_handler::submit_to_sycl). - // As of SYCL 2020 kernel functors are passed as const references, so we explicitly capture by value here. - return [=](auto s_item_or_id, auto&... reducers) { - constexpr int sycl_dims = std::max(1, Dims); - static_assert(std::is_invocable_v, decltype(reducers)...>, - "Kernel function must be invocable with celerity::item and as many reducer objects as reductions passed to parallel_for"); - if constexpr(CELERITY_WORKAROUND(DPCPP) && std::is_same_v, decltype(s_item_or_id)>) { - // CELERITY_WORKAROUND_LESS_OR_EQUAL: DPC++ passes a sycl::id instead of a sycl::item to kernels alongside reductions - invoke_kernel(kernel, s_item_or_id, global_range, global_offset, chunk_offset, reducers...); - } else { - // Explicit item constructor: ComputeCpp does not pass a sycl::item, but an implicitly convertible sycl::item_base (?) which does not have - // `sycl::id<> get_id()` - invoke_kernel(kernel, cl::sycl::item{s_item_or_id}.get_id(), global_range, global_offset, chunk_offset, reducers...); - } }; - } - template - auto bind_nd_range_kernel(const Kernel& kernel, const range& global_range, const id& global_offset, const id chunk_offset, - const range& group_range, const id& group_offset) { - return [=](sycl::nd_item s_item, auto&... reducers) { - static_assert(std::is_invocable_v, decltype(reducers)...>, - "Kernel function must be invocable with celerity::nd_item or and as many reducer objects as reductions passed to parallel_for"); - invoke_kernel(kernel, s_item, global_range, global_offset, chunk_offset, group_range, group_offset, reducers...); - }; + return std::make_unique>(std::move(fn)); } - template - inline void invoke_sycl_parallel_for(cl::sycl::handler& cgh, Params&&... 
args) { - static_assert(CELERITY_FEATURE_UNNAMED_KERNELS || !is_unnamed_kernel, - "Your SYCL implementation does not support unnamed kernels, add a kernel name template parameter to this parallel_for invocation"); - if constexpr(detail::is_unnamed_kernel) { -#if CELERITY_FEATURE_UNNAMED_KERNELS // see static_assert above - cgh.parallel_for(std::forward(args)...); -#endif - } else { - cgh.parallel_for>(std::forward(args)...); + std::unique_ptr into_task() && { + assert(m_task != nullptr); + for(auto state : m_attached_state) { + m_task->extend_lifetime(std::move(state)); } + return std::move(m_task); } +}; - class live_pass_device_handler final : public live_pass_handler { - public: - live_pass_device_handler(const class task* task, subrange<3> sr, bool initialize_reductions, device_queue& d_queue) - : live_pass_handler(task, sr, initialize_reductions), m_d_queue(&d_queue) {} - - template - void submit_to_sycl(CGF&& cgf) { - m_event = m_d_queue->submit([&](cl::sycl::handler& cgh) { - this->m_eventual_cgh = &cgh; - std::forward(cgf)(cgh); - this->m_eventual_cgh = nullptr; - }); - } +namespace detail { - cl::sycl::event get_submission_event() const { return m_event; } + inline handler make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes) { return handler(tid, num_collective_nodes); } - cl::sycl::handler* const* get_eventual_sycl_cgh() const { return &m_eventual_cgh; } + inline std::unique_ptr into_task(handler&& cgh) { return std::move(cgh).into_task(); } - private: - device_queue* m_d_queue; - cl::sycl::handler* m_eventual_cgh = nullptr; - cl::sycl::event m_event; - }; + [[nodiscard]] inline hydration_id add_requirement(handler& cgh, const buffer_id bid, std::unique_ptr rm) { + return cgh.add_requirement(bid, std::move(rm)); + } - template - class reduction_descriptor; + inline void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order) { + return cgh.add_requirement(hoid, order); + } - template - auto make_sycl_reduction(const reduction_descriptor& d) { + inline void add_reduction(handler& cgh, const detail::reduction_info& rinfo) { return cgh.add_reduction(rinfo); } + + inline void extend_lifetime(handler& cgh, std::shared_ptr state) { cgh.extend_lifetime(std::move(state)); } + + // TODO: The _impl functions in detail only exist during the grace period for deprecated reductions on const buffers; move outside again afterwards. + template + auto reduction_impl(const buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { #if !CELERITY_FEATURE_SCALAR_REDUCTIONS static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); #else - cl::sycl::property_list props; - if(!d.m_include_current_buffer_value) { props = {cl::sycl::property::reduction::initialize_to_identity{}}; } - if constexpr(WithExplicitIdentity) { - return sycl::reduction(d.m_device_ptr, d.m_identity, d.m_op, props); - } else { - return sycl::reduction(d.m_device_ptr, d.m_op, props); - } + static_assert(cl::sycl::has_known_identity_v, + "Celerity does not currently support reductions without an identity. 
Either specialize " + "cl::sycl::known_identity or use the reduction() overload taking an identity at runtime"); + return detail::make_reduction(vars, cgh, combiner, cl::sycl::known_identity_v, prop_list); #endif } template - class reduction_descriptor { - public: - reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT /* identity */, bool include_current_buffer_value, DataT* device_ptr) - : m_bid(bid), m_op(combiner), m_include_current_buffer_value(include_current_buffer_value), m_device_ptr(device_ptr) {} - - private: - friend auto make_sycl_reduction(const reduction_descriptor&); - - buffer_id m_bid; - BinaryOperation m_op; - bool m_include_current_buffer_value; - DataT* m_device_ptr; - }; - - template - class reduction_descriptor { - public: - reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT identity, bool include_current_buffer_value, DataT* device_ptr) - : m_bid(bid), m_op(combiner), m_identity(identity), m_include_current_buffer_value(include_current_buffer_value), m_device_ptr(device_ptr) {} - - private: - friend auto make_sycl_reduction(const reduction_descriptor&); - - buffer_id m_bid; - BinaryOperation m_op; - DataT m_identity{}; - bool m_include_current_buffer_value; - DataT* m_device_ptr; - }; - - template - auto make_reduction(const buffer& vars, handler& cgh, BinaryOperation op, DataT identity, const cl::sycl::property_list& prop_list) { + auto reduction_impl( + const buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { #if !CELERITY_FEATURE_SCALAR_REDUCTIONS static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); #else - if(vars.get_range().size() != 1) { - // Like SYCL 2020, Celerity only supports reductions to unit-sized buffers. This allows us to avoid tracking different parts of the buffer - // as distributed_state and pending_reduction_state. - throw std::runtime_error("Only unit-sized buffers can be reduction targets"); - } - - auto bid = detail::get_buffer_id(vars); - auto include_current_buffer_value = !prop_list.has_property(); - DataT* device_ptr = nullptr; - - if(detail::is_prepass_handler(cgh)) { - auto rid = detail::runtime::get_instance().get_reduction_manager().create_reduction(bid, op, identity); - static_cast(cgh).add_reduction(reduction_info{rid, bid, include_current_buffer_value}); - } else { - auto& device_handler = static_cast(cgh); - include_current_buffer_value &= device_handler.is_reduction_initializer(); - - auto mode = access_mode::discard_write; - if(include_current_buffer_value) { mode = access_mode::read_write; } - device_ptr = static_cast( - runtime::get_instance().get_buffer_manager().access_device_buffer(bid, mode, subrange_cast(subrange<3>{{}, {1, 1, 1}})).ptr); - } - return detail::reduction_descriptor{bid, op, identity, include_current_buffer_value, device_ptr}; + static_assert(!cl::sycl::has_known_identity_v, "Identity is known to SYCL, remove the identity parameter from reduction()"); + return detail::make_reduction(vars, cgh, combiner, identity, prop_list); #endif } } // namespace detail -template -void handler::parallel_for_kernel_and_reductions(range global_range, id global_offset, - typename detail::kernel_flavor_traits::local_size_type local_range, Kernel& kernel, Reductions&... 
reductions) { - if(is_prepass()) { - range<3> granularity = {1, 1, 1}; - if constexpr(detail::kernel_flavor_traits::has_local_size) { - for(int d = 0; d < Dims; ++d) { - granularity[d] = local_range[d]; - } - } - const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), granularity}; - return dynamic_cast(*this).create_device_compute_task(geometry, detail::kernel_debug_name()); - } - - auto& device_handler = dynamic_cast(*this); - const auto sr = device_handler.get_iteration_range(); - auto chunk_range = detail::range_cast(sr.range); - auto chunk_offset = detail::id_cast(sr.offset); - - device_handler.submit_to_sycl([&](cl::sycl::handler& cgh) { - constexpr int sycl_dims = std::max(1, Dims); - - if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) { - static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); - } else if constexpr(std::is_same_v) { - const auto sycl_global_range = sycl::range(detail::range_cast(chunk_range)); - detail::invoke_sycl_parallel_for(cgh, sycl_global_range, detail::make_sycl_reduction(reductions)..., - detail::bind_simple_kernel(kernel, global_range, global_offset, chunk_offset)); - } else if constexpr(std::is_same_v) { - const auto sycl_global_range = sycl::range(detail::range_cast(chunk_range)); - const auto sycl_local_range = sycl::range(detail::range_cast(local_range)); - detail::invoke_sycl_parallel_for(cgh, cl::sycl::nd_range(sycl_global_range, sycl_local_range), - detail::make_sycl_reduction(reductions)..., - detail::bind_nd_range_kernel(kernel, global_range, global_offset, chunk_offset, global_range / local_range, chunk_offset / local_range)); - } else { - static_assert(detail::constexpr_false); - } - }); -} - -template -void handler::host_task(on_master_node_tag, Functor kernel) { - if(is_prepass()) { - dynamic_cast(*this).create_master_node_task(); - } else { - dynamic_cast(*this).schedule<0>(kernel); - } -} - -template -void handler::host_task(experimental::collective_tag tag, Functor kernel) { - if(is_prepass()) { - dynamic_cast(*this).create_collective_task(tag.m_cgid); - } else { - dynamic_cast(*this).schedule_collective(kernel); - } +template +auto reduction(buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { + return detail::reduction_impl(vars, cgh, combiner, prop_list); } -template -void handler::host_task(range global_range, id global_offset, Functor kernel) { - if(is_prepass()) { - const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), {1, 1, 1}}; - dynamic_cast(*this).create_host_compute_task(geometry); - } else { - dynamic_cast(*this).schedule(kernel); - } +template +auto reduction(buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { + return detail::reduction_impl(vars, cgh, identity, combiner, prop_list); } template -auto reduction(const buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { -#if !CELERITY_FEATURE_SCALAR_REDUCTIONS - static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); -#else - static_assert(cl::sycl::has_known_identity_v, - "Celerity does not currently support reductions without an identity. 
Either specialize " - "cl::sycl::known_identity or use the reduction() overload taking an identity at runtime"); - return detail::make_reduction(vars, cgh, combiner, cl::sycl::known_identity_v, prop_list); -#endif +[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction( + const buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { + return detail::reduction_impl(vars, cgh, combiner, prop_list); } template -auto reduction(const buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { -#if !CELERITY_FEATURE_SCALAR_REDUCTIONS - static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); -#else - static_assert(!cl::sycl::has_known_identity_v, "Identity is known to SYCL, remove the identity parameter from reduction()"); - return detail::make_reduction(vars, cgh, combiner, identity, prop_list); -#endif +[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction( + const buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { + return detail::reduction_impl(vars, cgh, identity, combiner, prop_list); } } // namespace celerity diff --git a/include/host_object.h b/include/host_object.h index bf2fcc219..518abc3dd 100644 --- a/include/host_object.h +++ b/include/host_object.h @@ -6,9 +6,9 @@ #include #include +#include "lifetime_extending_state.h" #include "runtime.h" - namespace celerity::experimental { template @@ -45,7 +45,7 @@ class host_object_manager { }; // Base for `state` structs in all host_object specializations: registers and unregisters host_objects with the host_object_manager. -struct host_object_tracker { +struct host_object_tracker : public lifetime_extending_state { detail::host_object_id id{}; host_object_tracker() { @@ -53,8 +53,10 @@ struct host_object_tracker { id = detail::runtime::get_instance().get_host_object_manager().create_host_object(); } + host_object_tracker(const host_object_tracker&) = delete; host_object_tracker(host_object_tracker&&) = delete; host_object_tracker& operator=(host_object_tracker&&) = delete; + host_object_tracker& operator=(const host_object_tracker&) = delete; ~host_object_tracker() { detail::runtime::get_instance().get_host_object_manager().destroy_host_object(id); } }; @@ -96,7 +98,7 @@ namespace celerity::experimental { * - `host_object` does not carry internal state and can be used to track access to global variables or functions like `printf()`. */ template -class host_object { +class host_object final : public detail::lifetime_extending_state_wrapper { static_assert(std::is_object_v); // disallow host_object and host_object public: @@ -113,6 +115,9 @@ class host_object { explicit host_object(const std::in_place_t /* tag */, CtorParams&&... 
ctor_args) // requiring std::in_place avoids overriding copy and move constructors : m_shared_state(std::make_shared(std::in_place, std::forward(ctor_args)...)) {} + protected: + std::shared_ptr get_lifetime_extending_state() const override { return m_shared_state; } + private: template friend detail::host_object_id detail::get_host_object_id(const experimental::host_object& ho); @@ -134,7 +139,7 @@ class host_object { }; template -class host_object { +class host_object final : public detail::lifetime_extending_state_wrapper { public: using instance_type = T; @@ -142,6 +147,9 @@ class host_object { explicit host_object(const std::reference_wrapper ref) : m_shared_state(std::make_shared(ref.get())) {} + protected: + std::shared_ptr get_lifetime_extending_state() const override { return m_shared_state; } + private: template friend detail::host_object_id detail::get_host_object_id(const experimental::host_object& ho); @@ -149,7 +157,7 @@ class host_object { template friend typename experimental::host_object::instance_type& detail::get_host_object_instance(const experimental::host_object& ho); - struct state : detail::host_object_tracker { + struct state final : detail::host_object_tracker { T& instance; explicit state(T& instance) : instance{instance} {} @@ -162,17 +170,20 @@ class host_object { }; template <> -class host_object { +class host_object final : public detail::lifetime_extending_state_wrapper { public: using instance_type = void; explicit host_object() : m_shared_state(std::make_shared()) {} + protected: + std::shared_ptr get_lifetime_extending_state() const override { return m_shared_state; } + private: template friend detail::host_object_id detail::get_host_object_id(const experimental::host_object& ho); - struct state : detail::host_object_tracker {}; + struct state final : detail::host_object_tracker {}; detail::host_object_id get_id() const { return m_shared_state->id; } diff --git a/include/lifetime_extending_state.h b/include/lifetime_extending_state.h new file mode 100644 index 000000000..59b0163ae --- /dev/null +++ b/include/lifetime_extending_state.h @@ -0,0 +1,43 @@ +#pragma once + +#include + +namespace celerity::detail { + +/** + * Helper type for creating objects with reference semantics, whose lifetime can be extended by tasks. + */ +class lifetime_extending_state { + public: + lifetime_extending_state() = default; + lifetime_extending_state(const lifetime_extending_state&) = delete; + lifetime_extending_state(lifetime_extending_state&&) = delete; + lifetime_extending_state& operator=(const lifetime_extending_state&) = delete; + lifetime_extending_state& operator=(lifetime_extending_state&&) = delete; + + virtual ~lifetime_extending_state() = default; +}; + +/** + * Wrapper type that allows to retrieve the contained lifetime extending state (creation and storage of which is to be implemented by sub-classes). 
+ */ +class lifetime_extending_state_wrapper { + public: + lifetime_extending_state_wrapper() = default; + lifetime_extending_state_wrapper(const lifetime_extending_state_wrapper&) = default; + lifetime_extending_state_wrapper(lifetime_extending_state_wrapper&&) noexcept = default; + lifetime_extending_state_wrapper& operator=(const lifetime_extending_state_wrapper&) = default; + lifetime_extending_state_wrapper& operator=(lifetime_extending_state_wrapper&&) noexcept = default; + + virtual ~lifetime_extending_state_wrapper() = default; + + protected: + friend std::shared_ptr get_lifetime_extending_state(const lifetime_extending_state_wrapper& wrapper); + virtual std::shared_ptr get_lifetime_extending_state() const = 0; +}; + +inline std::shared_ptr get_lifetime_extending_state(const lifetime_extending_state_wrapper& wrapper) { + return wrapper.get_lifetime_extending_state(); +} + +} // namespace celerity::detail \ No newline at end of file diff --git a/include/runtime.h b/include/runtime.h index d948b7657..079aac8eb 100644 --- a/include/runtime.h +++ b/include/runtime.h @@ -167,11 +167,7 @@ namespace detail { } // Deletes the runtime instance, which happens only in tests. Called from runtime_fixture. - static void test_case_exit() { - assert(m_test_mode && m_test_active); - instance.reset(); - m_test_active = false; - } + static void test_case_exit(); private: inline static bool m_test_mode = false; diff --git a/include/side_effect.h b/include/side_effect.h index 39394d12b..06a46a617 100644 --- a/include/side_effect.h +++ b/include/side_effect.h @@ -14,29 +14,47 @@ namespace celerity::experimental { */ template class side_effect { + struct ctor_internal_tag {}; + public: using instance_type = typename host_object::instance_type; constexpr static inline side_effect_order order = Order; - explicit side_effect(const host_object& object, handler& cgh) : m_object(object) { - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = static_cast(cgh); - prepass_cgh.add_requirement(detail::get_host_object_id(object), order); - } - } + explicit side_effect(host_object& object, handler& cgh) : side_effect(ctor_internal_tag{}, object, cgh) {} + + [[deprecated("Creating side_effect from const host_object is deprecated, capture host_object by reference instead")]] explicit side_effect( + const host_object& object, handler& cgh) + : side_effect(ctor_internal_tag{}, object, cgh) {} template std::enable_if_t, instance_type>& operator*() const { - return detail::get_host_object_instance(m_object); + return *m_instance; } template std::enable_if_t, instance_type>* operator->() const { - return &detail::get_host_object_instance(m_object); + return m_instance; } private: - host_object m_object; + instance_type* m_instance; + + side_effect(ctor_internal_tag /* tag */, const host_object& object, handler& cgh) : m_instance{&detail::get_host_object_instance(object)} { + detail::add_requirement(cgh, detail::get_host_object_id(object), order); + detail::extend_lifetime(cgh, detail::get_lifetime_extending_state(object)); + } +}; + +template +class side_effect { + public: + using instance_type = typename host_object::instance_type; + constexpr static inline side_effect_order order = Order; + + explicit side_effect(const host_object& object, handler& cgh) { + detail::add_requirement(cgh, detail::get_host_object_id(object), order); + detail::extend_lifetime(cgh, detail::get_lifetime_extending_state(object)); + } }; template diff --git a/include/task.h b/include/task.h index 4a2db827b..1b4aa9fbe 100644 --- 
a/include/task.h +++ b/include/task.h @@ -3,10 +3,14 @@ #include #include #include +#include #include +#include "device_queue.h" #include "grid.h" +#include "host_queue.h" #include "intrusive_graph.h" +#include "lifetime_extending_state.h" #include "range_mapper.h" #include "types.h" @@ -38,26 +42,58 @@ namespace detail { shutdown, }; - struct command_group_storage_base { - virtual void operator()(handler& cgh) const = 0; - - virtual ~command_group_storage_base() = default; + class command_launcher_storage_base { + public: + command_launcher_storage_base() = default; + command_launcher_storage_base(const command_launcher_storage_base&) = delete; + command_launcher_storage_base(command_launcher_storage_base&&) = default; + command_launcher_storage_base& operator=(const command_launcher_storage_base&) = delete; + command_launcher_storage_base& operator=(command_launcher_storage_base&&) = default; + virtual ~command_launcher_storage_base() = default; + + virtual sycl::event operator()( + device_queue& q, const subrange<3> execution_sr, const std::vector& reduction_ptrs, const bool is_reduction_initializer) const = 0; + virtual std::future operator()(host_queue& q, const subrange<3>& execution_sr) const = 0; }; template - struct command_group_storage : command_group_storage_base { - Functor fun; + class command_launcher_storage : public command_launcher_storage_base { + public: + command_launcher_storage(Functor&& fun) : m_fun(std::move(fun)) {} + + sycl::event operator()( + device_queue& q, const subrange<3> execution_sr, const std::vector& reduction_ptrs, const bool is_reduction_initializer) const override { + return invoke(q, execution_sr, reduction_ptrs, is_reduction_initializer); + } + + std::future operator()(host_queue& q, const subrange<3>& execution_sr) const override { + return invoke>(q, execution_sr); + } - command_group_storage(Functor fun) : fun(fun) {} - void operator()(handler& cgh) const override { fun(cgh); } + private: + Functor m_fun; + + template + Ret invoke(Args&&... args) const { + if constexpr(std::is_invocable_v) { + return m_fun(args...); + } else { + throw std::runtime_error("Cannot launch command function with provided arguments"); + } + } }; class buffer_access_map { public: - void add_access(buffer_id bid, std::unique_ptr&& rm) { m_map.emplace(bid, std::move(rm)); } + void add_access(buffer_id bid, std::unique_ptr&& rm) { m_accesses.emplace_back(bid, std::move(rm)); } std::unordered_set get_accessed_buffers() const; std::unordered_set get_access_modes(buffer_id bid) const; + size_t get_num_accesses() const { return m_accesses.size(); } + std::pair get_nth_access(const size_t n) const { + const auto& [bid, rm] = m_accesses[n]; + return {bid, rm->get_access_mode()}; + } /** * @brief Computes the combined access-region for a given buffer, mode and subrange. 
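The command_launcher_storage hunk above replaces the stored CGF with a type-erased launcher: one base class exposes a call operator per execution target (device queue, host queue), and the templated wrapper forwards to the stored functor only when it is actually invocable with that target's arguments. A minimal, self-contained sketch of the same dispatch idea follows; the queue types and names are simplified placeholders, not the Celerity classes.

#include <memory>
#include <stdexcept>
#include <type_traits>
#include <utility>

// Hypothetical stand-ins for Celerity's device_queue / host_queue.
struct device_queue_t {};
struct host_queue_t {};

class launcher_base {
  public:
	virtual ~launcher_base() = default;
	virtual void operator()(device_queue_t& q) const = 0;
	virtual void operator()(host_queue_t& q) const = 0;
};

template <typename Fn>
class launcher final : public launcher_base {
  public:
	explicit launcher(Fn fn) : m_fn(std::move(fn)) {}
	void operator()(device_queue_t& q) const override { invoke(q); }
	void operator()(host_queue_t& q) const override { invoke(q); }

  private:
	Fn m_fn;

	// Forward only if the stored functor matches this target; otherwise fail at runtime,
	// mirroring the runtime_error path in command_launcher_storage::invoke above.
	template <typename Queue>
	void invoke(Queue& q) const {
		if constexpr(std::is_invocable_v<const Fn&, Queue&>) {
			m_fn(q);
		} else {
			throw std::runtime_error("launcher invoked with the wrong queue type");
		}
	}
};

template <typename Fn>
std::unique_ptr<launcher_base> make_launcher(Fn fn) {
	return std::make_unique<launcher<Fn>>(std::move(fn));
}

int main() {
	auto l = make_launcher([](device_queue_t& /* q */) { /* submit SYCL kernel here */ });
	device_queue_t dq;
	(*l)(dq); // dispatches to the device overload; a host_queue_t would hit the runtime_error path
}

The if constexpr branch keeps the wrong-target call path compilable while turning it into a runtime error, which is the trade-off that lets a single base class serve both host and device tasks.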
@@ -68,11 +104,13 @@ namespace detail { * * @returns The region obtained by merging the results of all range-mappers for this buffer and mode */ - GridRegion<3> get_requirements_for_access( - buffer_id bid, cl::sycl::access::mode mode, int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const; + GridRegion<3> get_mode_requirements( + const buffer_id bid, const access_mode mode, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const; + + GridBox<3> get_requirements_for_nth_access(const size_t n, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const; private: - std::unordered_multimap> m_map; + std::vector>> m_accesses; }; using reduction_set = std::vector; @@ -126,8 +164,6 @@ namespace detail { const side_effect_map& get_side_effect_map() const { return m_side_effects; } - const command_group_storage_base& get_command_group() const { return *m_cgf; } - const task_geometry& get_geometry() const { return m_geometry; } int get_dimensions() const { return m_geometry.dimensions; } @@ -161,33 +197,40 @@ namespace detail { fence_promise* get_fence_promise() const { return m_fence_promise.get(); } + template + auto launch(Args&&... args) const { + return (*m_launcher)(std::forward(args)...); + } + + void extend_lifetime(std::shared_ptr state) { m_attached_state.emplace_back(std::move(state)); } + static std::unique_ptr make_epoch(task_id tid, detail::epoch_action action) { return std::unique_ptr(new task(tid, task_type::epoch, collective_group_id{}, task_geometry{}, nullptr, {}, {}, {}, {}, action, nullptr)); } - static std::unique_ptr make_host_compute(task_id tid, task_geometry geometry, std::unique_ptr cgf, + static std::unique_ptr make_host_compute(task_id tid, task_geometry geometry, std::unique_ptr launcher, buffer_access_map access_map, side_effect_map side_effect_map, reduction_set reductions) { - return std::unique_ptr(new task(tid, task_type::host_compute, collective_group_id{}, geometry, std::move(cgf), std::move(access_map), + return std::unique_ptr(new task(tid, task_type::host_compute, collective_group_id{}, geometry, std::move(launcher), std::move(access_map), std::move(side_effect_map), std::move(reductions), {}, {}, nullptr)); } - static std::unique_ptr make_device_compute(task_id tid, task_geometry geometry, std::unique_ptr cgf, + static std::unique_ptr make_device_compute(task_id tid, task_geometry geometry, std::unique_ptr launcher, buffer_access_map access_map, reduction_set reductions, std::string debug_name) { - return std::unique_ptr(new task(tid, task_type::device_compute, collective_group_id{}, geometry, std::move(cgf), std::move(access_map), {}, - std::move(reductions), std::move(debug_name), {}, nullptr)); + return std::unique_ptr(new task(tid, task_type::device_compute, collective_group_id{}, geometry, std::move(launcher), std::move(access_map), + {}, std::move(reductions), std::move(debug_name), {}, nullptr)); } static std::unique_ptr make_collective(task_id tid, collective_group_id cgid, size_t num_collective_nodes, - std::unique_ptr cgf, buffer_access_map access_map, side_effect_map side_effect_map) { + std::unique_ptr launcher, buffer_access_map access_map, side_effect_map side_effect_map) { const task_geometry geometry{1, detail::range_cast<3>(range(num_collective_nodes)), {}, {1, 1, 1}}; - return std::unique_ptr( - new task(tid, task_type::collective, cgid, geometry, std::move(cgf), std::move(access_map), std::move(side_effect_map), {}, {}, {}, nullptr)); + return std::unique_ptr(new task( + 
tid, task_type::collective, cgid, geometry, std::move(launcher), std::move(access_map), std::move(side_effect_map), {}, {}, {}, nullptr)); } static std::unique_ptr make_master_node( - task_id tid, std::unique_ptr cgf, buffer_access_map access_map, side_effect_map side_effect_map) { - return std::unique_ptr(new task(tid, task_type::master_node, collective_group_id{}, task_geometry{}, std::move(cgf), std::move(access_map), - std::move(side_effect_map), {}, {}, {}, nullptr)); + task_id tid, std::unique_ptr launcher, buffer_access_map access_map, side_effect_map side_effect_map) { + return std::unique_ptr(new task(tid, task_type::master_node, collective_group_id{}, task_geometry{}, std::move(launcher), + std::move(access_map), std::move(side_effect_map), {}, {}, {}, nullptr)); } static std::unique_ptr make_horizon(task_id tid) { @@ -205,18 +248,19 @@ namespace detail { task_type m_type; collective_group_id m_cgid; task_geometry m_geometry; - std::unique_ptr m_cgf; + std::unique_ptr m_launcher; buffer_access_map m_access_map; detail::side_effect_map m_side_effects; reduction_set m_reductions; std::string m_debug_name; detail::epoch_action m_epoch_action; std::unique_ptr m_fence_promise; + std::vector> m_attached_state; - task(task_id tid, task_type type, collective_group_id cgid, task_geometry geometry, std::unique_ptr cgf, + task(task_id tid, task_type type, collective_group_id cgid, task_geometry geometry, std::unique_ptr launcher, buffer_access_map access_map, detail::side_effect_map side_effects, reduction_set reductions, std::string debug_name, detail::epoch_action epoch_action, std::unique_ptr fence_promise) - : m_tid(tid), m_type(type), m_cgid(cgid), m_geometry(geometry), m_cgf(std::move(cgf)), m_access_map(std::move(access_map)), + : m_tid(tid), m_type(type), m_cgid(cgid), m_geometry(geometry), m_launcher(std::move(launcher)), m_access_map(std::move(access_map)), m_side_effects(std::move(side_effects)), m_reductions(std::move(reductions)), m_debug_name(std::move(debug_name)), m_epoch_action(epoch_action), m_fence_promise(std::move(fence_promise)) { assert(type == task_type::host_compute || type == task_type::device_compute || get_granularity().size() == 1); diff --git a/include/task_manager.h b/include/task_manager.h index 9ed82b0cd..f2041a467 100644 --- a/include/task_manager.h +++ b/include/task_manager.h @@ -67,10 +67,10 @@ namespace detail { auto reservation = m_task_buffer.reserve_task_entry(await_free_task_slot_callback()); const auto tid = reservation.get_tid(); - prepass_handler cgh(tid, std::make_unique>(cgf), m_num_collective_nodes); + handler cgh = make_command_group_handler(tid, m_num_collective_nodes); cgf(cgh); - auto unique_tsk = std::move(cgh).into_task(); + auto unique_tsk = into_task(std::move(cgh)); // Require the collective group before inserting the task into the ring buffer, otherwise the executor will try to schedule the collective host // task on a collective-group thread that does not yet exist. @@ -185,7 +185,7 @@ namespace detail { task_id m_epoch_for_new_tasks{initial_epoch_task}; // We store a map of which task last wrote to a certain region of a buffer. - // NOTE: This represents the state after the latest performed pre-pass. + // NOTE: This represents the state after the latest task created. 
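The task_manager hunk above is the heart of the change: the CGF is invoked exactly once against a plain handler and the populated handler is then converted into a task, so any statement in the CGF body now executes exactly once per submission. A small illustrative sketch of the observable behaviour; it assumes the usual celerity.h umbrella header and the accessor/parallel_for API as used in the examples, and the invocation counter is purely hypothetical.

#include <cassert>
#include <celerity.h>

int main() {
	celerity::distr_queue q;
	celerity::buffer<int, 1> buf{celerity::range<1>{64}};

	int cgf_invocations = 0;
	q.submit([&](celerity::handler& cgh) {
		++cgf_invocations; // with the old pre-pass/live-pass split this could run more than once
		celerity::accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
		cgh.parallel_for<class zero_fill>(buf.get_range(), [=](celerity::item<1> it) { acc[it] = 0; });
	});
	assert(cgf_invocations == 1); // the CGF ran exactly once, synchronously during submit
	return 0;
}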
buffer_writers_map m_buffers_last_writers; std::unordered_map m_last_collective_tasks; diff --git a/include/types.h b/include/types.h index baf75b6a4..3642791d9 100644 --- a/include/types.h +++ b/include/types.h @@ -55,6 +55,7 @@ MAKE_PHANTOM_TYPE(command_id, size_t) MAKE_PHANTOM_TYPE(collective_group_id, size_t) MAKE_PHANTOM_TYPE(reduction_id, size_t) MAKE_PHANTOM_TYPE(host_object_id, size_t) +MAKE_PHANTOM_TYPE(hydration_id, size_t); // declared in this header for include-dependency reasons diff --git a/src/executor.cc b/src/executor.cc index d534fa80a..df3bc75fe 100644 --- a/src/executor.cc +++ b/src/executor.cc @@ -2,6 +2,7 @@ #include +#include "closure_hydrator.h" #include "distr_queue.h" #include "frame.h" #include "log.h" @@ -45,6 +46,7 @@ namespace detail { } void executor::run() { + closure_hydrator::make_available(); bool done = false; std::queue> command_queue; while(!done || !m_jobs.empty()) { @@ -138,6 +140,7 @@ namespace detail { } assert(m_running_device_compute_jobs == 0); + closure_hydrator::teardown(); } bool executor::handle_command(const command_frame& frame) { diff --git a/src/graph_generator.cc b/src/graph_generator.cc index 999ce5d6b..5d5bce51c 100644 --- a/src/graph_generator.cc +++ b/src/graph_generator.cc @@ -213,7 +213,7 @@ namespace detail { for(const buffer_id bid : buffers) { const auto modes = access_map.get_access_modes(bid); for(auto m : modes) { - result[bid][m] = access_map.get_requirements_for_access(bid, m, tsk.get_dimensions(), sr, global_size); + result[bid][m] = access_map.get_mode_requirements(bid, m, tsk.get_dimensions(), sr, global_size); } } return result; diff --git a/src/print_graph.cc b/src/print_graph.cc index 0494a27de..fce8fc19c 100644 --- a/src/print_graph.cc +++ b/src/print_graph.cc @@ -57,7 +57,7 @@ namespace detail { const auto& bam = tsk.get_buffer_access_map(); for(const auto bid : bam.get_accessed_buffers()) { for(const auto mode : bam.get_access_modes(bid)) { - const auto req = bam.get_requirements_for_access(bid, mode, tsk.get_dimensions(), execution_range, tsk.get_global_size()); + const auto req = bam.get_mode_requirements(bid, mode, tsk.get_dimensions(), execution_range, tsk.get_global_size()); const std::string bl = get_buffer_label(bm, bid); // While uncommon, we do support chunks that don't require access to a particular buffer at all. if(!req.empty()) { fmt::format_to(std::back_inserter(label), "
{} {} {}", detail::access::mode_traits::name(mode), bl, req); } diff --git a/src/runtime.cc b/src/runtime.cc index 3fcd08ed6..cd89d27e5 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -290,5 +290,14 @@ namespace detail { if(done) { m_active_flushes.pop_front(); } } + void runtime::test_case_exit() { + assert(m_test_mode && m_test_active); + // We need to delete all tasks manually first, b/c objects that have their lifetime + // extended by tasks (buffers, host objects) will attempt to shut down the runtime. + if(instance != nullptr) { instance->m_task_mngr.reset(); } + instance.reset(); + m_test_active = false; + } + } // namespace detail } // namespace celerity diff --git a/src/task.cc b/src/task.cc index 464b7f599..b552879b1 100644 --- a/src/task.cc +++ b/src/task.cc @@ -7,7 +7,7 @@ namespace detail { std::unordered_set buffer_access_map::get_accessed_buffers() const { std::unordered_set result; - for(auto& [bid, _] : m_map) { + for(const auto& [bid, _] : m_accesses) { result.emplace(bid); } return result; @@ -15,8 +15,8 @@ namespace detail { std::unordered_set buffer_access_map::get_access_modes(buffer_id bid) const { std::unordered_set result; - for(auto [first, last] = m_map.equal_range(bid); first != last; ++first) { - result.insert(first->second->get_access_mode()); + for(const auto& [b, rm] : m_accesses) { + if(b == bid) { result.insert(rm->get_access_mode()); } } return result; } @@ -33,31 +33,32 @@ namespace detail { return subrange<3>{}; } - GridRegion<3> buffer_access_map::get_requirements_for_access( - buffer_id bid, cl::sycl::access::mode mode, int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const { - auto [first, last] = m_map.equal_range(bid); - if(first == m_map.end()) { return {}; } - + GridRegion<3> buffer_access_map::get_mode_requirements( + const buffer_id bid, const access_mode mode, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const { GridRegion<3> result; - for(auto iter = first; iter != last; ++iter) { - auto rm = iter->second.get(); - if(rm->get_access_mode() != mode) continue; - - chunk<3> chnk{sr.offset, sr.range, global_size}; - subrange<3> req; - switch(kernel_dims) { - case 0: req = apply_range_mapper<0>(rm, chunk_cast<0>(chnk)); break; - case 1: req = apply_range_mapper<1>(rm, chunk_cast<1>(chnk)); break; - case 2: req = apply_range_mapper<2>(rm, chunk_cast<2>(chnk)); break; - case 3: req = apply_range_mapper<3>(rm, chunk_cast<3>(chnk)); break; - default: assert(!"Unreachable"); - } - result = GridRegion<3>::merge(result, subrange_to_grid_box(req)); + for(size_t i = 0; i < m_accesses.size(); ++i) { + if(m_accesses[i].first != bid || m_accesses[i].second->get_access_mode() != mode) continue; + result = GridRegion<3>::merge(result, get_requirements_for_nth_access(i, kernel_dims, sr, global_size)); } - return result; } + GridBox<3> buffer_access_map::get_requirements_for_nth_access( + const size_t n, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const { + const auto& [_, rm] = m_accesses[n]; + + chunk<3> chnk{sr.offset, sr.range, global_size}; + subrange<3> req; + switch(kernel_dims) { + case 0: req = apply_range_mapper<0>(rm.get(), chunk_cast<0>(chnk)); break; + case 1: req = apply_range_mapper<1>(rm.get(), chunk_cast<1>(chnk)); break; + case 2: req = apply_range_mapper<2>(rm.get(), chunk_cast<2>(chnk)); break; + case 3: req = apply_range_mapper<3>(rm.get(), chunk_cast<3>(chnk)); break; + default: assert(!"Unreachable"); + } + return subrange_to_grid_box(req); + } + void 
side_effect_map::add_side_effect(const host_object_id hoid, const experimental::side_effect_order order) { // TODO for multiple side effects on the same hoid, find the weakest order satisfying all of them emplace(hoid, order); diff --git a/src/task_manager.cc b/src/task_manager.cc index 2f931106b..4e6030dad 100644 --- a/src/task_manager.cc +++ b/src/task_manager.cc @@ -60,7 +60,7 @@ namespace detail { const subrange<3> full_range{tsk.get_global_offset(), tsk.get_global_size()}; GridRegion<3> result; for(auto m : modes) { - result = GridRegion<3>::merge(result, access_map.get_requirements_for_access(bid, m, tsk.get_dimensions(), full_range, tsk.get_global_size())); + result = GridRegion<3>::merge(result, access_map.get_mode_requirements(bid, m, tsk.get_dimensions(), full_range, tsk.get_global_size())); } return result; } diff --git a/src/worker_job.cc b/src/worker_job.cc index d45d93321..a768af2d6 100644 --- a/src/worker_job.cc +++ b/src/worker_job.cc @@ -3,6 +3,7 @@ #include #include "buffer_manager.h" +#include "closure_hydrator.h" #include "device_queue.h" #include "handler.h" #include "reduction_manager.h" @@ -149,14 +150,20 @@ namespace detail { if(!m_buffer_mngr.try_lock(pkg.cid, tsk->get_buffer_access_map().get_accessed_buffers())) { return false; } - CELERITY_TRACE("Execute live-pass, scheduling host task in thread pool"); + CELERITY_TRACE("Scheduling host task in thread pool"); - // Note that for host tasks, there is no indirection through a queue->submit step like there is for SYCL tasks. The CGF is executed directly, - // which then schedules task in the thread pool through the host_queue. - auto& cgf = tsk->get_command_group(); - live_pass_host_handler cgh(tsk, data.sr, data.initialize_reductions, m_queue); - cgf(cgh); - m_future = cgh.into_future(); + const auto& access_map = tsk->get_buffer_access_map(); + std::vector access_infos; + access_infos.reserve(access_map.get_num_accesses()); + for(size_t i = 0; i < access_map.get_num_accesses(); ++i) { + const auto [bid, mode] = access_map.get_nth_access(i); + const auto sr = grid_box_to_subrange(access_map.get_requirements_for_nth_access(i, tsk->get_dimensions(), data.sr, tsk->get_global_size())); + const auto info = m_buffer_mngr.access_host_buffer(bid, mode, sr); + access_infos.push_back(closure_hydrator::accessor_info{info.ptr, info.backing_buffer_range, info.backing_buffer_offset, sr}); + } + + closure_hydrator::get_instance().arm(target::host_task, std::move(access_infos)); + m_future = tsk->launch(m_queue, data.sr); assert(m_future.valid()); m_submitted = true; @@ -193,12 +200,31 @@ namespace detail { if(!m_buffer_mngr.try_lock(pkg.cid, tsk->get_buffer_access_map().get_accessed_buffers())) { return false; } - CELERITY_TRACE("Execute live-pass, submit kernel to SYCL"); + CELERITY_TRACE("Submit kernel to SYCL"); + + const auto& access_map = tsk->get_buffer_access_map(); + const auto& reductions = tsk->get_reductions(); + std::vector accessor_infos; + std::vector reduction_ptrs; + accessor_infos.reserve(access_map.get_num_accesses()); + reduction_ptrs.reserve(reductions.size()); + + for(size_t i = 0; i < access_map.get_num_accesses(); ++i) { + const auto [bid, mode] = access_map.get_nth_access(i); + const auto sr = grid_box_to_subrange(access_map.get_requirements_for_nth_access(i, tsk->get_dimensions(), data.sr, tsk->get_global_size())); + const auto info = m_buffer_mngr.access_device_buffer(bid, mode, sr); + accessor_infos.push_back(closure_hydrator::accessor_info{info.ptr, info.backing_buffer_range, info.backing_buffer_offset, 
sr}); + } + + for(size_t i = 0; i < reductions.size(); ++i) { + const auto& rd = reductions[i]; + const auto mode = rd.init_from_buffer ? access_mode::read_write : access_mode::discard_write; + const auto info = m_buffer_mngr.access_device_buffer(rd.bid, mode, subrange<3>{{}, range<3>{1, 1, 1}}); + reduction_ptrs.push_back(info.ptr); + } - live_pass_device_handler cgh(tsk, data.sr, data.initialize_reductions, m_queue); - auto& cgf = tsk->get_command_group(); - cgf(cgh); - m_event = cgh.get_submission_event(); + closure_hydrator::get_instance().arm(target::device, std::move(accessor_infos)); + m_event = tsk->launch(m_queue, data.sr, reduction_ptrs, data.initialize_reductions); m_submitted = true; CELERITY_TRACE("Kernel submitted to SYCL"); diff --git a/test/accessor_tests.cc b/test/accessor_tests.cc index 79a68df30..288eda9fa 100644 --- a/test/accessor_tests.cc +++ b/test/accessor_tests.cc @@ -2,12 +2,16 @@ #include #include +#include #include #include "ranges.h" #include "buffer_manager_test_utils.h" +#include "log_test_utils.h" + +// NOTE: There are some additional accessor tests in buffer_manager_tests.cc namespace celerity { namespace detail { @@ -20,12 +24,12 @@ namespace detail { distr_queue q; std::vector mem_a{42}; buffer buf_a(mem_a.data(), range<1>{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { auto a = buf_a.get_access(cgh, fixed<1>({0, 1})); cgh.host_task(on_master_node, [=] { ++a[0]; }); }); int out = 0; - q.submit(celerity::allow_by_ref, [=, &out](handler& cgh) { + q.submit([&](handler& cgh) { auto a = buf_a.get_access(cgh, fixed<1>({0, 1})); cgh.host_task(on_master_node, [=, &out] { out = a[0]; }); }); @@ -38,9 +42,12 @@ namespace detail { using buf0d_t = buffer&; SECTION("device accessors") { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" // This currently throws an error at runtime, because we cannot infer whether the access is a discard_* from the property list parameter. using acc0 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::write_only, celerity::property_list{}}); STATIC_REQUIRE(std::is_same_v, acc0>); +#pragma GCC diagnostic pop using acc1 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::write_only}); STATIC_REQUIRE(std::is_same_v, acc1>); @@ -57,9 +64,12 @@ namespace detail { using acc5 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::read_write, celerity::no_init}); STATIC_REQUIRE(std::is_same_v, acc5>); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" // This currently throws an error at runtime, because we cannot infer whether the access is a discard_* from the property list parameter. using acc6 = decltype(accessor{std::declval(), std::declval(), all(), celerity::write_only, celerity::property_list{}}); STATIC_REQUIRE(std::is_same_v, acc6>); +#pragma GCC diagnostic pop using acc7 = decltype(accessor{std::declval(), std::declval(), all(), celerity::read_only}); STATIC_REQUIRE(std::is_same_v, acc7>); @@ -79,10 +89,13 @@ namespace detail { } SECTION("host accessors") { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" // This currently throws an error at runtime, because we cannot infer whether the access is a discard_* from the property list parameter. 
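The worker_job changes above implement the executor side of the hydration protocol: for each recorded access, the backing pointer and extents are gathered into accessor_infos, the hydrator is armed, and only then is the stored launcher invoked, which copies the kernel closure and thereby hydrates the captured accessors. Below is a self-contained toy model of that copy-based hydration; it is illustrative only and not the actual closure_hydrator or accessor implementation.

#include <cassert>
#include <cstddef>
#include <vector>

// Toy hydrator: holds one pointer per placeholder id, armed per launch.
struct toy_hydrator {
	static inline thread_local std::vector<void*> armed_ptrs; // index = hydration id - 1
	static void arm(std::vector<void*> ptrs) { armed_ptrs = std::move(ptrs); }
};

// Toy accessor: carries only a placeholder id until it is copied while the hydrator is armed.
struct toy_accessor {
	explicit toy_accessor(const size_t hydration_id) : m_hid(hydration_id) {}
	toy_accessor(const toy_accessor& other) : m_hid(other.m_hid), m_ptr(other.m_ptr) {
		if(!toy_hydrator::armed_ptrs.empty()) { m_ptr = toy_hydrator::armed_ptrs[m_hid - 1]; }
	}
	void* get_pointer() const { return m_ptr; }

  private:
	size_t m_hid;
	void* m_ptr = nullptr;
};

int main() {
	int backing = 0;
	toy_accessor acc{1};                                // created inside the CGF: placeholder only
	auto kernel = [acc] { return acc.get_pointer(); };  // closure captures the un-hydrated accessor

	assert(kernel() == nullptr);   // not hydrated yet
	toy_hydrator::arm({&backing}); // executor provides the real allocation
	auto hydrated = kernel;        // copying the closure hydrates the captured accessor
	assert(hydrated() == &backing);
}

The property the real implementation relies on is the same: hydration happens as a side effect of copying the closure while the hydrator is armed, so the launcher stored in the task keeps an un-hydrated closure that can be copied again for the next execution.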
using acc0 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::write_only_host_task, celerity::property_list{}}); STATIC_REQUIRE(std::is_same_v, acc0>); +#pragma GCC diagnostic pop using acc1 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::write_only_host_task}); STATIC_REQUIRE(std::is_same_v, acc1>); @@ -99,10 +112,13 @@ namespace detail { using acc5 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::read_write_host_task, celerity::no_init}); STATIC_REQUIRE(std::is_same_v, acc5>); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" // This currently throws an error at runtime, because we cannot infer whether the access is a discard_* from the property list parameter. using acc6 = decltype(accessor{std::declval(), std::declval(), all(), celerity::write_only_host_task, celerity::property_list{}}); STATIC_REQUIRE(std::is_same_v, acc6>); +#pragma GCC diagnostic pop using acc7 = decltype(accessor{std::declval(), std::declval(), all(), celerity::read_only_host_task}); STATIC_REQUIRE(std::is_same_v, acc7>); @@ -152,37 +168,31 @@ namespace detail { auto& q = accessor_fixture::get_device_queue(); auto sr = subrange<3>({}, range); - live_pass_device_handler cgh(nullptr, sr, true, q); // this kernel initializes the buffer what will be read after. - auto acc_write = - accessor_fixture::template get_device_accessor(cgh, bid, range_cast(range), {}); - cgh.parallel_for>( - range_cast(range), [=](celerity::item item) { acc_write[item] = item.get_linear_id(); }); - cgh.get_submission_event().wait(); + auto acc_write = accessor_fixture::template get_device_accessor(bid, range_cast(range), {}); + test_utils::run_parallel_for>(accessor_fixture::get_device_queue().get_sycl_queue(), + range_cast(range), {}, [=](celerity::item item) { acc_write[item] = item.get_linear_id(); }); SECTION("for device buffers") { - auto acc_read = - accessor_fixture::template get_device_accessor(cgh, bid, range_cast(range), {}); - auto acc = accessor_fixture::template get_device_accessor( - cgh, bid, range_cast(range), {}); - cgh.parallel_for>(range_cast(range), [=](celerity::item item) { - size_t i = item[0]; - size_t j = item[1]; - if constexpr(Dims == 2) { - acc[i][j] = acc_read[i][j]; - } else { - size_t k = item[2]; - acc[i][j][k] = acc_read[i][j][k]; - } - }); - cgh.get_submission_event().wait(); + auto acc_read = accessor_fixture::template get_device_accessor(bid, range_cast(range), {}); + auto acc = accessor_fixture::template get_device_accessor(bid, range_cast(range), {}); + test_utils::run_parallel_for>( + accessor_fixture::get_device_queue().get_sycl_queue(), range_cast(range), {}, [=](celerity::item item) { + size_t i = item[0]; + size_t j = item[1]; + if constexpr(Dims == 2) { + acc[i][j] = acc_read[i][j]; + } else { + size_t k = item[2]; + acc[i][j][k] = acc_read[i][j][k]; + } + }); } SECTION("for host buffers") { - auto acc_read = accessor_fixture::template get_host_accessor(bid, range_cast(range), {}); - auto acc = - accessor_fixture::template get_host_accessor(bid, range_cast(range), {}); + auto acc_read = accessor_fixture::template get_host_accessor(bid, range_cast(range), {}); + auto acc = accessor_fixture::template get_host_accessor(bid, range_cast(range), {}); for(size_t i = 0; i < range[0]; i++) { for(size_t j = 0; j < range[1]; j++) { for(size_t k = 0; k < (Dims == 2 ? 
1 : range[2]); k++) { @@ -245,7 +255,7 @@ namespace detail { CAPTURE(Mode); bool verified = false; buffer verify_buf{&verified, 1}; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { // access with offset == buffer range just to mess with things const auto offset = id_cast<1>(test_buf.get_range()); const auto test_acc = test_buf.get_access(cgh, [=](chunk<1>) { return subrange<1>{offset, 0}; }); @@ -255,7 +265,7 @@ namespace detail { verify_acc[0] = true; }); }); - q.submit(allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { const accessor verify_acc{verify_buf, cgh, all{}, read_only_host_task}; cgh.host_task(on_master_node, [=, &verified] { verified = verify_acc[0]; }); }); @@ -291,7 +301,7 @@ namespace detail { std::vector memory1d(10); buffer buf1d(memory1d.data(), range<1>(10)); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor b{buf1d, cgh, all{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(on_master_node, [=](partition<0> part) { auto aw = b.get_allocation_window(part); @@ -303,7 +313,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor b{buf1d, cgh, one_to_one{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(range<1>(6), id<1>(2), [=](partition<1> part) { auto aw = b.get_allocation_window(part); @@ -319,7 +329,7 @@ namespace detail { std::vector memory2d(10 * 10); buffer buf2d(memory2d.data(), range<2>(10, 10)); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor b{buf2d, cgh, one_to_one{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(range<2>(5, 6), id<2>(1, 2), [=](partition<2> part) { auto aw = b.get_allocation_window(part); @@ -338,7 +348,7 @@ namespace detail { std::vector memory3d(10 * 10 * 10); buffer buf3d(memory3d.data(), range<3>(10, 10, 10)); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor b{buf3d, cgh, one_to_one{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(range<3>(5, 6, 7), id<3>(1, 2, 3), [=](partition<3> part) { auto aw = b.get_allocation_window(part); @@ -369,7 +379,7 @@ namespace detail { buffer buf_1(100); distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0(buf_0, cgh, read_write_host_task); cgh.host_task(on_master_node, [=] { CHECK(acc_0 == value_a); @@ -380,7 +390,7 @@ namespace detail { *acc_0 = value_b; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0(buf_0, cgh, read_only); accessor acc_1(buf_1, cgh, one_to_one(), write_only, no_init); cgh.parallel_for(buf_1.get_range(), [=](item<1> it) { @@ -390,7 +400,7 @@ namespace detail { acc_1[it] = acc_0[id<0>()]; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_1(buf_1, cgh, all(), read_only_host_task); cgh.host_task(on_master_node, [=] { for(size_t i = 0; i < buf_1.get_range().size(); ++i) { @@ -407,7 +417,7 @@ namespace detail { buffer buf_3d({5, 5, 5}); distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0d(buf_0d, cgh, all(), write_only, no_init); accessor acc_1d(buf_1d, cgh, all(), write_only, no_init); accessor acc_2d(buf_2d, cgh, all(), write_only, no_init); @@ -422,7 +432,7 @@ namespace detail { acc_3d[4][4][4] = 4; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0d(buf_0d, cgh, all(), read_write_host_task); accessor acc_1d(buf_1d, cgh, all(), read_write_host_task); accessor acc_2d(buf_2d, cgh, all(), 
read_write_host_task); @@ -438,7 +448,7 @@ namespace detail { acc_3d[4][4][4] += 9; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0d(buf_0d, cgh, all(), read_only_host_task); accessor acc_1d(buf_1d, cgh, all(), read_only_host_task); accessor acc_2d(buf_2d, cgh, all(), read_only_host_task); @@ -462,11 +472,11 @@ namespace detail { buffer buf_1(32); distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_1(buf_1, cgh, one_to_one(), write_only, no_init); cgh.parallel_for(buf_1.get_range(), [=](item<1> it) { acc_1[it] = value_a; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_1(buf_1, cgh, one_to_one(), write_only); local_accessor local_0(cgh); cgh.parallel_for(nd_range(buf_1.get_range(), buf_1.get_range()), [=](nd_item<1> it) { @@ -483,7 +493,7 @@ namespace detail { acc_1[it.get_global_id()] = local_0[id<0>()]; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_1(buf_1, cgh, all(), read_only_host_task); cgh.host_task(on_master_node, [=] { for(size_t i = 0; i < buf_1.get_range().size(); ++i) { @@ -499,7 +509,7 @@ namespace detail { distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor device_acc_0; accessor device_acc_1; local_accessor local_acc_0; @@ -513,7 +523,7 @@ namespace detail { nd_range(1, 1), [=](nd_item<1> /* it */) { (void)device_acc_0, (void)local_acc_0, (void)device_acc_1, (void)local_acc_1; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor host_acc_0; accessor host_acc_1; host_acc_0 = decltype(host_acc_0)(buf_0, cgh, all()); @@ -530,13 +540,106 @@ namespace detail { CHECK(sizeof(accessor) == sizeof(int*)); } - TEST_CASE("0-dimensional local accessor has no overhead over SYCL", "[accessor][!shouldfail]") { - // TODO after multi-pass removal: drop !shouldfail (see TODO in local_accessor definition) + TEST_CASE("0-dimensional local accessor has no overhead over SYCL", "[accessor]") { if(!CELERITY_DETAIL_HAS_NO_UNIQUE_ADDRESS) SKIP("[[no_unique_address]] not available on this compiler"); // this check is not a static_assert because it depends on an (optional) compiler layout optimization CHECK(sizeof(local_accessor) == sizeof(accessor_testspy::declval_sycl_accessor>())); } + TEST_CASE_METHOD(accessor_fixture<0>, "closure_hydrator provides correct pointers to host and device accessors", "[closure_hydrator][accessor]") { + auto& bm = get_buffer_manager(); + auto bid = bm.register_buffer({100, 1, 1}); + auto& q = get_device_queue(); + + SECTION("host accessor") { + auto access_info = bm.access_host_buffer(bid, access_mode::discard_write, {{}, {100}}); + std::vector infos; + infos.push_back({access_info.ptr, access_info.backing_buffer_range, access_info.backing_buffer_offset, subrange<3>{{}, {100, 1, 1}}}); + auto acc = accessor_testspy::make_host_accessor(subrange<1>({}, {100}), hydration_id(1), + detail::id_cast<1>(access_info.backing_buffer_offset), detail::range_cast<1>(access_info.backing_buffer_range), + detail::range_cast<1>(bm.get_buffer_info(bid).range)); + CHECK(accessor_testspy::get_pointer(acc) != access_info.ptr); + closure_hydrator::get_instance().arm(target::host_task, std::move(infos)); + const auto run_check = closure_hydrator::get_instance().hydrate( + [&, hydrated_acc = acc] { CHECK(accessor_testspy::get_pointer(hydrated_acc) == access_info.ptr); }); + run_check(); + } + + SECTION("device accessor") { + auto access_info = bm.access_device_buffer(bid, 
access_mode::discard_write, {{}, {100}}); + std::vector infos; + infos.push_back({access_info.ptr, access_info.backing_buffer_range, access_info.backing_buffer_offset, subrange<3>{{}, {100, 1, 1}}}); + auto acc = accessor_testspy::make_device_accessor( + hydration_id(1), id_cast<1>(access_info.backing_buffer_offset), detail::range_cast<1>(access_info.backing_buffer_range)); + CHECK(accessor_testspy::get_pointer(acc) != access_info.ptr); + accessor hydrated_acc; + closure_hydrator::get_instance().arm(target::device, std::move(infos)); + q.get_sycl_queue().submit([&](sycl::handler& cgh) { + closure_hydrator::get_instance().hydrate(cgh, [&hydrated_acc, acc]() { hydrated_acc = acc; })(/* call to hydrate */); + cgh.single_task([] {}); + }); + CHECK(accessor_testspy::get_pointer(hydrated_acc) == access_info.ptr); + } + } + + TEST_CASE_METHOD(accessor_fixture<0>, "closure_hydrator correctly handles unused and duplicate accessors", "[closure_hydrator][accessor]") { + auto& bm = get_buffer_manager(); + auto& q = get_device_queue(); + + std::vector infos; + hydration_id next_hid = 1; + auto create_accessor = [&](const buffer_id bid) { + auto access_info = bm.access_host_buffer(bid, access_mode::discard_write, {{}, {10}}); + infos.push_back({access_info.ptr, access_info.backing_buffer_range, access_info.backing_buffer_offset, subrange<3>{{}, {10, 1, 1}}}); + auto acc = accessor_testspy::make_host_accessor(subrange<1>({}, {10}), next_hid++, + id_cast<1>(access_info.backing_buffer_offset), detail::range_cast<1>(access_info.backing_buffer_range), + detail::range_cast<1>(bm.get_buffer_info(bid).range)); + return std::pair{acc, access_info.ptr}; + }; + + const auto bid1 = bm.register_buffer({10, 1, 1}); + [[maybe_unused]] const auto [acc1, ptr1] = create_accessor(bid1); + const auto bid2 = bm.register_buffer({20, 1, 1}); + const auto [acc2, ptr2] = create_accessor(bid2); + const auto bid3 = bm.register_buffer({30, 1, 1}); + [[maybe_unused]] const auto [acc3, ptr3] = create_accessor(bid3); + const auto bid4 = bm.register_buffer({40, 1, 1}); + const auto [acc4, ptr4] = create_accessor(bid4); + auto acc5 = acc4; + + auto closure = [acc2 = acc2, acc4 = acc4, acc5 = acc5] { + return std::tuple{accessor_testspy::get_pointer(acc2), accessor_testspy::get_pointer(acc4), accessor_testspy::get_pointer(acc5)}; + }; + closure_hydrator::get_instance().arm(target::host_task, std::move(infos)); + auto hydrated_closure = closure_hydrator::get_instance().hydrate(closure); + CHECK(ptr2 == std::get<0>(hydrated_closure())); + CHECK(ptr4 == std::get<1>(hydrated_closure())); + CHECK(ptr4 == std::get<2>(hydrated_closure())); + } + + TEST_CASE_METHOD(accessor_fixture<0>, "closure_hydrator correctly hydrates local_accessor", "[closure_hydrator][accessor][smoke-test]") { + auto& q = get_device_queue(); + auto local_acc = accessor_testspy::make_local_accessor(range<1>(2)); + size_t* const result = sycl::malloc_device(2, q.get_sycl_queue()); + auto closure = [=](sycl::nd_item<1> itm) { + // We can't really check pointers or anything, so this is a smoke test + local_acc[itm.get_local_id()] = 100 + itm.get_local_id(0) * 10; + sycl::group_barrier(itm.get_group()); + // Write other item's value + result[itm.get_local_id(0)] = local_acc[1 - itm.get_local_id()]; + }; + closure_hydrator::get_instance().arm(target::device, {}); + q.submit([&](sycl::handler& cgh) { + auto hydrated_closure = closure_hydrator::get_instance().hydrate(cgh, closure); + cgh.parallel_for(sycl::nd_range<1>{{2}, {2}}, [=](sycl::nd_item<1> itm) { hydrated_closure(itm); 
}); + }).wait_and_throw(); + size_t result_host[2]; + q.get_sycl_queue().memcpy(&result_host[0], result, 2 * sizeof(size_t)).wait_and_throw(); + CHECK(result_host[0] == 110); + CHECK(result_host[1] == 100); + sycl::free(result, q.get_sycl_queue()); + } + } // namespace detail } // namespace celerity diff --git a/test/buffer_manager_test_utils.h b/test/buffer_manager_test_utils.h index cc12c2b2e..f6c52bfff 100644 --- a/test/buffer_manager_test_utils.h +++ b/test/buffer_manager_test_utils.h @@ -21,11 +21,34 @@ namespace detail { static typename LocalAccessor::sycl_accessor declval_sycl_accessor() { static_assert(constexpr_false, "declval_sycl_accessor cannot be used in an evaluated context"); } + + template + static local_accessor make_local_accessor(Args&&... args) { + return local_accessor{std::forward(args)...}; + } + + template + static DataT* get_pointer(const accessor& acc) { + if constexpr(Tgt == target::device) { + return acc.m_device_ptr; + } else { + return acc.m_host_ptr; + } + } }; } // namespace detail namespace test_utils { + // Convenience function for submitting parallel_for with global offset without having to create a CGF + template + void run_parallel_for(sycl::queue& q, const range& global_range, const id& global_offset, KernelFn fn) { + q.submit([=](sycl::handler& cgh) { + cgh.parallel_for(sycl::range{global_range}, detail::bind_simple_kernel(fn, global_range, global_offset, global_offset)); + }); + q.wait_and_throw(); + } + class buffer_manager_fixture : public device_queue_fixture { public: enum class access_target { host, device }; @@ -152,8 +175,7 @@ namespace test_utils { } template - accessor get_device_accessor( - detail::live_pass_device_handler& cgh, detail::buffer_id bid, const range& range, const id& offset) { + accessor get_device_accessor(detail::buffer_id bid, const range& range, const id& offset) { auto buf_info = m_bm->access_device_buffer(bid, Mode, {offset, range}); return detail::accessor_testspy::make_device_accessor(static_cast(buf_info.ptr), detail::id_cast(buf_info.backing_buffer_offset), detail::range_cast(buf_info.backing_buffer_range)); diff --git a/test/buffer_manager_tests.cc b/test/buffer_manager_tests.cc index ea1acfd72..cf345df7e 100644 --- a/test/buffer_manager_tests.cc +++ b/test/buffer_manager_tests.cc @@ -24,7 +24,7 @@ namespace detail { { celerity::buffer b(range<1>(128)); b_id = celerity::detail::get_buffer_id(b); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{b, cgh, celerity::access::all(), celerity::write_only}; cgh.parallel_for(b.get_range(), [=](celerity::item<1> it) {}); }); @@ -34,11 +34,11 @@ namespace detail { // we need horizon_step_size * 3 + 1 tasks to generate the third horizon, // and one extra task to trigger the clean_up process for(int i = 0; i < (new_horizon_step * 3 + 2); i++) { - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{c, cgh, celerity::access::all(), celerity::write_only}; cgh.parallel_for(c.get_range(), [=](celerity::item<1>) {}); }); - // this sync is inside the loop because otherwise there is a race between the prepass and the executor informing the TDAG + // this sync is inside the loop because otherwise there is a race between this thread and the executor informing the TDAG // of the executed horizons, meaning that task deletion is not guaranteed. 
q.slow_full_sync(); } @@ -817,12 +817,11 @@ namespace detail { const auto range = celerity::range<2>(32, 32); const auto offset = id<2>(32, 0); auto sr = subrange<3>(id_cast<3>(offset), range_cast<3>(range)); - live_pass_device_handler cgh(nullptr, sr, true, dq); - get_device_accessor(cgh, bid, {48, 32}, {16, 0}); - auto acc = get_device_accessor(cgh, bid, {32, 32}, {32, 0}); - cgh.parallel_for(range, offset, [=](id<2> id) { acc[id] = id[0] + id[1]; }); - cgh.get_submission_event().wait(); + get_device_accessor(bid, {48, 32}, {16, 0}); + auto acc = get_device_accessor(bid, {32, 32}, {32, 0}); + + test_utils::run_parallel_for(dq.get_sycl_queue(), range, offset, [=](id<2> id) { acc[id] = id[0] + id[1]; }); auto buf_info = bm.access_host_buffer(bid, access_mode::read, {{32, 0}, {32, 32}}); for(size_t i = 32; i < 64; ++i) { @@ -863,33 +862,32 @@ namespace detail { SECTION("when using device buffers") { auto range = celerity::range<1>(32); auto sr = subrange<3>({}, range_cast<3>(range)); - live_pass_device_handler cgh(nullptr, sr, true, dq); // For device accessors we test this both on host and device // Copy ctor - auto device_acc_a = get_device_accessor(cgh, bid_a, {32}, {0}); + auto device_acc_a = get_device_accessor(bid_a, {32}, {0}); decltype(device_acc_a) device_acc_a1(device_acc_a); // Move ctor - auto device_acc_b = get_device_accessor(cgh, bid_b, {32}, {0}); + auto device_acc_b = get_device_accessor(bid_b, {32}, {0}); decltype(device_acc_b) device_acc_b1(std::move(device_acc_b)); // Copy assignment - auto device_acc_c = get_device_accessor(cgh, bid_c, {32}, {0}); - auto device_acc_c1 = get_device_accessor(cgh, bid_a, {32}, {0}); + auto device_acc_c = get_device_accessor(bid_c, {32}, {0}); + auto device_acc_c1 = get_device_accessor(bid_a, {32}, {0}); device_acc_c1 = device_acc_c; // Move assignment - auto device_acc_d = get_device_accessor(cgh, bid_d, {32}, {0}); - auto device_acc_d1 = get_device_accessor(cgh, bid_a, {32}, {0}); + auto device_acc_d = get_device_accessor(bid_d, {32}, {0}); + auto device_acc_d1 = get_device_accessor(bid_a, {32}, {0}); device_acc_d1 = std::move(device_acc_d); // Hidden friends (equality operators) REQUIRE(device_acc_a == device_acc_a1); REQUIRE(device_acc_a1 != device_acc_b1); - cgh.parallel_for(range, [=](id<1> id) { + test_utils::run_parallel_for(dq.get_sycl_queue(), range, {}, [=](id<1> id) { // Copy ctor decltype(device_acc_a1) device_acc_a2(device_acc_a1); device_acc_a2[id] = 1 * id[0]; @@ -911,8 +909,6 @@ namespace detail { // Hidden friends (equality operators) are only required to be defined on the host }); - cgh.get_submission_event().wait(); - auto host_acc_a = get_host_accessor(bid_a, {32}, {0}); auto host_acc_b = get_host_accessor(bid_b, {32}, {0}); auto host_acc_c = get_host_accessor(bid_c, {32}, {0}); @@ -1017,28 +1013,37 @@ namespace detail { const std::string error_msg = "Buffer cannot be accessed with expected stride"; + // TODO: Use single lambda once https://github.com/KhronosGroup/SYCL-Docs/pull/351 is merged and implemented + auto get_pointer_1d = [this](const buffer_id bid, const range<1>& range, const id<1>& offset) { + get_host_accessor(bid, range, offset).get_pointer(); + }; + + auto get_pointer_2d = [this](const buffer_id bid, const range<2>& range, const id<2>& offset) { + get_host_accessor(bid, range, offset).get_pointer(); + }; + // This is not allowed, as the backing buffer hasn't been allocated from offset 0, which means the pointer would point to offset 32. 
- REQUIRE_THROWS_WITH((get_host_accessor(bid_a, {32}, {32}).get_pointer()), error_msg); + REQUIRE_THROWS_WITH(get_pointer_1d(bid_a, {32}, {32}), error_msg); // This is fine, as the backing buffer has been resized to start from 0 now. - REQUIRE_NOTHROW(get_host_accessor(bid_a, {64}, {0}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_1d(bid_a, {64}, {0})); // This is now also okay, as the backing buffer starts at 0, and the pointer points to offset 0. // (Same semantics as SYCL accessor with offset, i.e., UB outside of requested range). - REQUIRE_NOTHROW(get_host_accessor(bid_a, {32}, {32}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_1d(bid_a, {32}, {32})); // In 2D (and 3D) it's trickier, as the stride of the backing buffer must also match what the user expects. // This is not allowed, even though the offset is 0. - REQUIRE_THROWS_WITH((get_host_accessor(bid_b, {64, 64}, {0, 0}).get_pointer()), error_msg); + REQUIRE_THROWS_WITH(get_pointer_2d(bid_b, {64, 64}, {0, 0}), error_msg); // This is allowed, as we request the full buffer. - REQUIRE_NOTHROW(get_host_accessor(bid_b, {128, 128}, {0, 0}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_2d(bid_b, {128, 128}, {0, 0})); // This is now allowed, as the backing buffer has the expected stride. - REQUIRE_NOTHROW(get_host_accessor(bid_b, {64, 64}, {0, 0}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_2d(bid_b, {64, 64}, {0, 0})); // Passing an offset is now also possible. - REQUIRE_NOTHROW(get_host_accessor(bid_b, {64, 64}, {32, 32}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_2d(bid_b, {64, 64}, {32, 32})); } TEST_CASE_METHOD(test_utils::buffer_manager_fixture, "empty access ranges do not inflate backing buffer allocations", "[buffer_manager]") { diff --git a/test/graph_compaction_tests.cc b/test/graph_compaction_tests.cc index d34dc8c14..698ebe810 100644 --- a/test/graph_compaction_tests.cc +++ b/test/graph_compaction_tests.cc @@ -205,7 +205,7 @@ namespace detail { auto write_b_cmd = ctx.get_command_graph().get(*cmds.cbegin()); auto write_b_dependencies = write_b_cmd->get_dependencies(); CHECK(!write_b_dependencies.empty()); - CHECK(write_b_dependencies.front().kind == dependency_kind::anti_dep); + CHECK(std::any_of(write_b_dependencies.begin(), write_b_dependencies.end(), [](auto& dep) { return dep.kind == dependency_kind::anti_dep; })); test_utils::maybe_print_graphs(ctx); } diff --git a/test/integration/backend.cc b/test/integration/backend.cc index 81141e606..c9d2ad02d 100644 --- a/test/integration/backend.cc +++ b/test/integration/backend.cc @@ -14,7 +14,7 @@ void test_copy(celerity::distr_queue& q) { celerity::buffer buf(celerity::detail::range_cast(celerity::range<3>{5, 7, 9})); // Initialize on device - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{buf, cgh, celerity::access::one_to_one<>{}, celerity::write_only, celerity::no_init}; cgh.parallel_for>(buf.get_range(), [=](celerity::item itm) { acc[itm] = itm.get_linear_id(); }); }); @@ -22,7 +22,7 @@ void test_copy(celerity::distr_queue& q) { // Check and modify partially on host const auto sr = celerity::detail::subrange_cast(celerity::subrange<3>{{1, 2, 3}, {3, 4, 5}}); const auto sr3 = celerity::detail::subrange_cast<3>(sr); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{buf, cgh, celerity::access::fixed{sr}, celerity::read_write_host_task}; cgh.host_task(celerity::on_master_node, [=]() { for(size_t k = 0; k < sr3.range[0]; ++k) { @@ -39,13 +39,13 @@ void 
test_copy(celerity::distr_queue& q) { }); // Modify everything on device - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{buf, cgh, celerity::access::one_to_one<>{}, celerity::read_write}; cgh.parallel_for>(buf.get_range(), [=](celerity::item itm) { acc[itm] += 1; }); }); // Check everything on host - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::on_master_node, [=]() { const auto r3 = celerity::detail::range_cast<3>(buf.get_range()); diff --git a/test/print_graph_tests.cc b/test/print_graph_tests.cc index c287f554b..d40b1fabb 100644 --- a/test/print_graph_tests.cc +++ b/test/print_graph_tests.cc @@ -75,8 +75,8 @@ TEST_CASE("command graph printing is unchanged", "[print_graph][command-graph]") // replace the `expected` value with the new dot graph. const auto expected = "digraph G{label=\"Command Graph\" subgraph cluster_2{label=<T2 (master-node host)>;color=darkgray;9[label=execution [[0,0,0] - [0,0,0]]
read B0 {[[0,0,0] - [1,1,1]]}
read_write B0 {[[0,0,0] - " - "[1,1,1]]}
write " + "N0
execution [[0,0,0] - [0,0,0]]
write B0 {[[0,0,0] - [1,1,1]]}
read_write B0 {[[0,0,0] - " + "[1,1,1]]}
read " "B0 {[[0,0,0] - [1,1,1]]}> fontcolor=black shape=box];}subgraph cluster_1{label=<T1 \"task_reduction_8\" " "(device-compute)>;color=darkgray;5[label=execution [[0,0,0] - [1,1,1]]
(R1) discard_write B0 {[[0,0,0] - " "[1,1,1]]}> fontcolor=black shape=box];6[label=execution [[1,0,0] - [2,1,1]]
(R1) discard_write B0 {[[0,0,0] - " @@ -107,7 +107,7 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "Buffer debug names show up in the celerity::debug::set_buffer_name(buff_a, buff_name); CHECK(celerity::debug::get_buffer_name(buff_a) == buff_name); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { celerity::accessor acc{buff_a, cgh, celerity::access::all{}, celerity::write_only}; cgh.parallel_for(range, [=](item<1> item) {}); }); diff --git a/test/runtime_deprecation_tests.cc b/test/runtime_deprecation_tests.cc index 6fbc478de..e2c1bbd3a 100644 --- a/test/runtime_deprecation_tests.cc +++ b/test/runtime_deprecation_tests.cc @@ -134,5 +134,32 @@ namespace detail { }); } + TEST_CASE_METHOD(test_utils::runtime_fixture, + "distr_queue::submit(allow_by_ref_t, ...) and creation of accessors/side-effects/reductions from const buffers/host-objects continues to work", + "[handler][deprecated]") { + distr_queue q; + buffer buf{32}; + buffer reduction_buf{1}; + experimental::host_object ho; + int my_int = 33; + q.submit(allow_by_ref, [= /* capture buffer/host-object by value */, &my_int](handler& cgh) { + accessor acc{buf, cgh, celerity::access::all{}, celerity::write_only_host_task}; + experimental::side_effect se{ho, cgh}; + cgh.host_task(on_master_node, [=, &my_int] { + (void)acc; + (void)se; + my_int = 42; + }); + }); + q.submit([= /* capture by value */](handler& cgh) { + accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::read_only}; +#if CELERITY_FEATURE_SCALAR_REDUCTIONS + auto red = reduction(reduction_buf, cgh, std::plus{}); +#endif + cgh.parallel_for(range<1>{32}, [=](item<1>) { (void)acc; }); + }); + SUCCEED(); + } + } // namespace detail } // namespace celerity diff --git a/test/runtime_tests.cc b/test/runtime_tests.cc index f8d653c0b..dae7dd28b 100644 --- a/test/runtime_tests.cc +++ b/test/runtime_tests.cc @@ -93,7 +93,7 @@ namespace detail { buffer buf_a{range<2>{32, 64}}; auto& tm = runtime::get_instance().get_task_manager(); const auto tid = test_utils::add_compute_task( - tm, [buf_a /* capture by value */](handler& cgh) { buf_a.get_access(cgh, one_to_one{}); }, buf_a.get_range()); + tm, [&](handler& cgh) { buf_a.get_access(cgh, one_to_one{}); }, buf_a.get_range()); const auto tsk = tm.get_task(tid); const auto bufs = tsk->get_buffer_access_map().get_accessed_buffers(); REQUIRE(bufs.size() == 1); @@ -267,10 +267,10 @@ namespace detail { REQUIRE(std::find(bufs.cbegin(), bufs.cend(), buf_b.get_id()) != bufs.cend()); REQUIRE(bam.get_access_modes(buf_a.get_id()).count(cl::sycl::access::mode::read) == 1); REQUIRE(bam.get_access_modes(buf_b.get_id()).count(cl::sycl::access::mode::discard_read_write) == 1); - const auto reqs_a = bam.get_requirements_for_access( + const auto reqs_a = bam.get_mode_requirements( buf_a.get_id(), cl::sycl::access::mode::read, tsk->get_dimensions(), {tsk->get_global_offset(), tsk->get_global_size()}, tsk->get_global_size()); REQUIRE(reqs_a == subrange_to_grid_box(subrange<3>({32, 24, 0}, {32, 128, 1}))); - const auto reqs_b = bam.get_requirements_for_access(buf_b.get_id(), cl::sycl::access::mode::discard_read_write, tsk->get_dimensions(), + const auto reqs_b = bam.get_mode_requirements(buf_b.get_id(), cl::sycl::access::mode::discard_read_write, tsk->get_dimensions(), {tsk->get_global_offset(), tsk->get_global_size()}, tsk->get_global_size()); REQUIRE(reqs_b == subrange_to_grid_box(subrange<3>({}, {5, 18, 74}))); } @@ -279,24 +279,16 @@ namespace detail { buffer_access_map bam; bam.add_access(0, std::make_unique>>(subrange<2>{{3, 0}, 
{10, 20}}, cl::sycl::access::mode::read, range<2>{30, 30})); bam.add_access(0, std::make_unique>>(subrange<2>{{10, 0}, {7, 20}}, cl::sycl::access::mode::read, range<2>{30, 30})); - const auto req = bam.get_requirements_for_access(0, cl::sycl::access::mode::read, 2, subrange<3>({0, 0, 0}, {100, 100, 1}), {100, 100, 1}); + const auto req = bam.get_mode_requirements(0, cl::sycl::access::mode::read, 2, subrange<3>({0, 0, 0}, {100, 100, 1}), {100, 100, 1}); REQUIRE(req == subrange_to_grid_box(subrange<3>({3, 0, 0}, {14, 20, 1}))); } TEST_CASE("tasks gracefully handle get_requirements() calls for buffers they don't access", "[task]") { buffer_access_map bam; - const auto req = bam.get_requirements_for_access(0, cl::sycl::access::mode::read, 3, subrange<3>({0, 0, 0}, {100, 1, 1}), {100, 1, 1}); + const auto req = bam.get_mode_requirements(0, cl::sycl::access::mode::read, 3, subrange<3>({0, 0, 0}, {100, 1, 1}), {100, 1, 1}); REQUIRE(req == subrange_to_grid_box(subrange<3>({0, 0, 0}, {0, 0, 0}))); } - TEST_CASE("safe command group functions must not capture by reference", "[lifetime][dx]") { - int value = 123; - const auto unsafe = [&]() { return value + 1; }; - REQUIRE_FALSE(is_safe_cgf); - const auto safe = [=]() { return value + 1; }; - REQUIRE(is_safe_cgf); - } - namespace foo { class MySecondKernel; } @@ -321,12 +313,12 @@ namespace detail { buffer buff(N); std::vector host_buff(N); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { auto b = buff.get_access(cgh, one_to_one{}); cgh.parallel_for(range<1>(N), [=](celerity::item<1> item) { b[item] = item.get_linear_id(); }); }); - q.submit(allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { auto b = buff.get_access(cgh, celerity::access::fixed<1>{{{}, buff.get_range()}}); cgh.host_task(on_master_node, [=, &host_buff] { std::this_thread::sleep_for(std::chrono::milliseconds(10)); // give the synchronization more time to fail @@ -461,37 +453,37 @@ namespace detail { experimental::collective_group primary_group; experimental::collective_group secondary_group; - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective, [&](experimental::collective_partition part) { default1_thread = std::this_thread::get_id(); default1_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective(primary_group), [&](experimental::collective_partition part) { primary1_thread = std::this_thread::get_id(); primary1_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective(secondary_group), [&](experimental::collective_partition part) { secondary1_thread = std::this_thread::get_id(); secondary1_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective, [&](experimental::collective_partition part) { default2_thread = std::this_thread::get_id(); default2_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective(primary_group), [&](experimental::collective_partition part) { primary2_thread = std::this_thread::get_id(); primary2_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + 
q.submit([&](handler& cgh) { cgh.host_task(experimental::collective(secondary_group), [&](experimental::collective_partition part) { secondary2_thread = std::this_thread::get_id(); secondary2_comm = part.get_collective_mpi_comm(); @@ -582,31 +574,31 @@ namespace detail { distr_queue q; buffer buf{{10, 10}}; - CHECK_THROWS_WITH(q.submit([=](handler& cgh) { + CHECK_THROWS_WITH(q.submit([&](handler& cgh) { buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<1>{10}, [=](celerity::item<1>) {}); }), "Invalid range mapper dimensionality: 1-dimensional kernel submitted with a requirement whose range mapper is neither invocable for chunk<1> nor " "(chunk<1>, range<2>) to produce subrange<2>"); - CHECK_NOTHROW(q.submit([=](handler& cgh) { + CHECK_NOTHROW(q.submit([&](handler& cgh) { buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<2>{10, 10}, [=](celerity::item<2>) {}); })); - CHECK_THROWS_WITH(q.submit([=](handler& cgh) { + CHECK_THROWS_WITH(q.submit([&](handler& cgh) { buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<3>{10, 10, 10}, [=](celerity::item<3>) {}); }), "Invalid range mapper dimensionality: 3-dimensional kernel submitted with a requirement whose range mapper is neither invocable for chunk<3> nor " "(chunk<3>, range<2>) to produce subrange<2>"); - CHECK_NOTHROW(q.submit([=](handler& cgh) { + CHECK_NOTHROW(q.submit([&](handler& cgh) { buf.get_access(cgh, all{}); cgh.parallel_for(range<3>{10, 10, 10}, [=](celerity::item<3>) {}); })); - CHECK_NOTHROW(q.submit([=](handler& cgh) { + CHECK_NOTHROW(q.submit([&](handler& cgh) { buf.get_access(cgh, all{}); cgh.parallel_for(range<3>{10, 10, 10}, [=](celerity::item<3>) {}); })); @@ -626,7 +618,7 @@ namespace detail { const auto global_offset = detail::id_cast(id<3>{4, 5, 6}); buffer linear_id{{n, Dims + 1}}; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor a{linear_id, cgh, celerity::access::all{}, write_only, no_init}; // all RM is sane because runtime_tests runs single-node cgh.parallel_for>(detail::range_cast(range<3>{n, 1, 1}), global_offset, [=](celerity::item item) { auto i = (item.get_id() - item.get_offset())[0]; @@ -636,7 +628,7 @@ namespace detail { a[i][Dims] = item.get_linear_id(); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor a{linear_id, cgh, celerity::access::all{}, read_only_host_task}; cgh.host_task(on_master_node, [=] { for(int i = 0; i < n; ++i) { @@ -737,7 +729,7 @@ namespace detail { // Note: We assume a local range size of 32 here, this should be supported by most devices. - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { local_accessor la{32, cgh}; accessor ga{out, cgh, celerity::access::one_to_one{}, write_only}; cgh.parallel_for(celerity::nd_range<1>{64, 32}, [=](nd_item<1> item) { @@ -747,7 +739,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor ga{out, cgh, celerity::access::all{}, read_only_host_task}; cgh.host_task(on_master_node, [=] { for(size_t i = 0; i < 64; ++i) { @@ -762,7 +754,7 @@ namespace detail { // Note: We assume a local range size of 16 here, this should be supported by most devices. 
buffer b{range<1>{1}}; - distr_queue{}.submit([=](handler& cgh) { + distr_queue{}.submit([&](handler& cgh) { cgh.parallel_for(celerity::nd_range{range<2>{8, 8}, range<2>{4, 4}}, reduction(b, cgh, cl::sycl::plus<>{}), [](nd_item<2> item, auto& sum) { sum += item.get_global_linear_id(); }); }); @@ -783,11 +775,11 @@ namespace detail { q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [](nd_item<1> item) {}); }); #if CELERITY_FEATURE_SCALAR_REDUCTIONS buffer b{{1}}; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for( range<1>{64}, reduction(b, cgh, cl::sycl::plus{}), [=](item<1> item, auto& r) { r += static_cast(item.get_linear_id()); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, reduction(b, cgh, cl::sycl::plus{}), [=](nd_item<1> item, auto& r) { r += static_cast(item.get_global_linear_id()); }); }); @@ -797,11 +789,11 @@ namespace detail { q.submit([=](handler& cgh) { cgh.parallel_for(range<1>{64}, [=](item<1> item) {}); }); q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [=](nd_item<1> item) {}); }); #if CELERITY_FEATURE_SCALAR_REDUCTIONS - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for( range<1>{64}, reduction(b, cgh, cl::sycl::plus{}), [=](item<1> item, auto& r) { r += static_cast(item.get_linear_id()); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, reduction(b, cgh, cl::sycl::plus{}), [=](nd_item<1> item, auto& r) { r += static_cast(item.get_global_linear_id()); }); }); @@ -821,7 +813,7 @@ namespace detail { constexpr int extents = 16; buffer buf_a(extents); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{buf_a, cgh, celerity::access::all{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(on_master_node, [] {}); }); @@ -831,7 +823,7 @@ namespace detail { constexpr int task_limit = 15; for(int i = 0; i < chain_length; ++i) { - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{buf_a, cgh, celerity::access::all{}, celerity::read_write_host_task}; cgh.host_task(on_master_node, [] {}); }); @@ -847,6 +839,74 @@ namespace detail { } } + TEST_CASE_METHOD(test_utils::runtime_fixture, "tasks extend the lifetime of buffers and host objects", "[task_manager]") { + std::weak_ptr buffer_state_1; + std::weak_ptr buffer_state_2; + std::weak_ptr buffer_state_3; + std::weak_ptr host_object_state; + + distr_queue q; + auto& tm = runtime::get_instance().get_task_manager(); + tm.set_horizon_step(2); + + std::promise wait_for_this; + + const size_t have_reduction = CELERITY_FEATURE_SCALAR_REDUCTIONS; + + { + // Buffers and host object go out of scope before tasks has completed + buffer buf_1{32}; + buffer buf_2{32}; + buffer buf_3{1}; + experimental::host_object ho; + buffer_state_1 = get_lifetime_extending_state(buf_1); + buffer_state_2 = get_lifetime_extending_state(buf_2); + buffer_state_3 = get_lifetime_extending_state(buf_3); + host_object_state = get_lifetime_extending_state(ho); + q.submit([&](handler& cgh) { + accessor acc{buf_1, cgh, celerity::access::all{}, celerity::write_only_host_task}; + experimental::side_effect se{ho, cgh}; + cgh.host_task(on_master_node, [=, &wait_for_this] { + (void)acc; + (void)se; + wait_for_this.get_future().wait(); + }); + }); + q.submit([&](handler& cgh) { + accessor acc_1{buf_1, cgh, celerity::access::one_to_one{}, celerity::read_only}; + accessor 
acc_2{buf_2, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; +#if CELERITY_FEATURE_SCALAR_REDUCTIONS + auto red = reduction(buf_3, cgh, std::plus<>()); + cgh.parallel_for(range<1>{32}, red, [=](item<1>, auto&) { +#else + cgh.parallel_for(range<1>{32}, [=](item<1>) { +#endif + (void)acc_1; + (void)acc_2; + }); + }); + CHECK(buffer_state_1.use_count() == 3); + CHECK(buffer_state_2.use_count() == 2); + CHECK(buffer_state_3.use_count() == 1 + have_reduction); + CHECK(host_object_state.use_count() == 2); + } + + CHECK(buffer_state_1.use_count() == 2); + CHECK(buffer_state_2.use_count() == 1); + CHECK(buffer_state_3.use_count() == have_reduction); + CHECK(host_object_state.use_count() == 1); + wait_for_this.set_value(); + // Now trigger deletion of the task (which should then also delete the buffer and host object). + q.slow_full_sync(); + // We currently only delete tasks when submitting new tasks (and not when creating epochs), so we have to submit a no-op here. + q.submit([](handler& cgh) { cgh.host_task(on_master_node, [] {}); }); + q.slow_full_sync(); + CHECK(buffer_state_1.use_count() == 0); + CHECK(buffer_state_2.use_count() == 0); + CHECK(buffer_state_3.use_count() == 0); + CHECK(host_object_state.use_count() == 0); + } + #ifndef __APPLE__ class restore_process_affinity_fixture { restore_process_affinity_fixture(const restore_process_affinity_fixture&) = delete; @@ -914,7 +974,7 @@ namespace detail { experimental::host_object ref_ho{std::ref(exterior)}; experimental::host_object void_ho; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect append_owned{owned_ho, cgh}; experimental::side_effect append_ref{ref_ho, cgh}; experimental::side_effect track_void{void_ho, cgh}; @@ -924,7 +984,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect append_owned{owned_ho, cgh}; experimental::side_effect append_ref{ref_ho, cgh}; experimental::side_effect track_void{void_ho, cgh}; @@ -934,7 +994,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect check_owned{owned_ho, cgh}; cgh.host_task(on_master_node, [=] { CHECK(*check_owned == std::vector{1, 2}); }); }); @@ -1069,13 +1129,13 @@ namespace detail { experimental::host_object ho{1}; distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect e(ho, cgh); cgh.host_task(on_master_node, [=] { *e = 2; }); }); auto v2 = experimental::fence(q, ho); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect e(ho, cgh); cgh.host_task(on_master_node, [=] { *e = 3; }); }); @@ -1089,7 +1149,7 @@ namespace detail { buffer buf(range<2>(4, 4)); distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc(buf, cgh, all{}, write_only, no_init); cgh.parallel_for(buf.get_range(), [=](celerity::item<2> item) { acc[item] = static_cast(item.get_linear_id()); }); }); @@ -1114,7 +1174,7 @@ namespace detail { buffer buf; distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc(buf, cgh, write_only, no_init); cgh.parallel_for(buf.get_range(), [=](celerity::item<0> item) { *acc = 42; }); }); @@ -1133,20 +1193,20 @@ namespace detail { distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_a(buf_a, cgh, write_only, no_init); cgh.parallel_for(range<0>(), [=](item<0> /* it */) { *acc_a = value_b; }); }); - 
q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_b(buf_b, cgh, write_only, no_init); cgh.parallel_for(nd_range<0>(), [=](nd_item<0> /* it */) { *acc_b = value_b; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_c(buf_c, cgh, write_only_host_task, no_init); cgh.host_task(range<0>(), [=](partition<0> /* part */) { *acc_c = value_b; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_a(buf_a, cgh, read_only_host_task); accessor acc_b(buf_b, cgh, read_only_host_task); accessor acc_c(buf_c, cgh, read_only_host_task); diff --git a/test/system/distr_tests.cc b/test/system/distr_tests.cc index f38bd8b68..27a937ada 100644 --- a/test/system/distr_tests.cc +++ b/test/system/distr_tests.cc @@ -29,7 +29,7 @@ namespace detail { distr_queue q; const auto initialize_to_identity = cl::sycl::property::reduction::initialize_to_identity{}; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { auto sum_r = reduction(sum_buf, cgh, cl::sycl::plus{}, initialize_to_identity); auto max_r = reduction(max_buf, cgh, size_t{0}, unknown_identity_maximum{}, initialize_to_identity); cgh.parallel_for(range{N}, id{1}, sum_r, max_r, [=](celerity::item<1> item, auto& sum, auto& max) { @@ -38,7 +38,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor sum_acc{sum_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; accessor max_acc{max_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { @@ -62,12 +62,12 @@ namespace detail { const int N = 1000; const int init = 42; buffer sum(&init, range{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for( range{N}, reduction(sum, cgh, cl::sycl::plus{} /* don't initialize to identity */), [=](celerity::item<1> item, auto& sum) { sum += 1; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { CHECK(acc[0] == N + init); }); }); @@ -83,17 +83,17 @@ namespace detail { const int N = 1000; buffer sum(range{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(range{N}, reduction(sum, cgh, cl::sycl::plus{}, cl::sycl::property::reduction::initialize_to_identity{}), [=](celerity::item<1> item, auto& sum) { sum += 1; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(range{N}, reduction(sum, cgh, cl::sycl::plus{} /* include previous reduction result */), [=](celerity::item<1> item, auto& sum) { sum += 2; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { CHECK(acc[0] == 3 * N); }); }); @@ -110,19 +110,19 @@ namespace detail { const int N = 1000; buffer sum(range{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(range{N}, reduction(sum, cgh, cl::sycl::plus{}, cl::sycl::property::reduction::initialize_to_identity{}), [=](celerity::item<1> item, auto& sum) { sum += static_cast(item.get_linear_id()); }); }); const int expected = (N - 1) * N / 2; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { CHECK(acc[0] == expected); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& 
cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(experimental::collective, [=](experimental::collective_partition p) { INFO("Node " << p.get_node_index()); @@ -146,14 +146,14 @@ namespace detail { { distr_queue q; buffer sum(range{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(range{100}, reduction(sum, cgh, cl::sycl::plus{}, cl::sycl::property::reduction::initialize_to_identity{}), [](celerity::item<1> item, auto& sum) {}); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; - cgh.host_task(on_master_node, [] {}); + cgh.host_task(on_master_node, [=] { (void)acc; }); }); } // shutdown runtime and print graph @@ -211,7 +211,7 @@ namespace detail { buffer geo(global_range); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor g{geo, cgh, celerity::access::one_to_one{}, write_only, no_init}; cgh.parallel_for>(celerity::nd_range{global_range, local_range}, /* global_offset,*/ [=](nd_item item) { auto group = item.get_group(); @@ -224,7 +224,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor g{geo, cgh, celerity::access::all{}, read_only_host_task}; cgh.host_task(on_master_node, [=] { for(size_t global_linear_id = 0; global_linear_id < global_range.size(); ++global_linear_id) { @@ -268,29 +268,29 @@ namespace detail { constexpr int N = 1000; buffer buff_a(range<1>{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor write_a{buff_a, cgh, celerity::access::all{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); buffer buff_b(range<1>{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor write_b{buff_b, cgh, celerity::access::all{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor read_write_a{buff_a, cgh, celerity::access::all{}, celerity::read_write}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor read_write_a{buff_a, cgh, celerity::access::all{}, celerity::read_write}; accessor read_write_b{buff_b, cgh, celerity::access::all{}, celerity::read_write}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor write_a{buff_a, cgh, celerity::access::all{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); @@ -336,7 +336,7 @@ namespace detail { return celerity::access::one_to_one{}(chnk); }; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{buf, cgh, chunk_check_rm, write_only, no_init}; // The kernel has a size of 1 in dimension 0, so it will not be split into // more than one chunk (assuming current naive split behavior). 
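The hunks in this file, like the rest of the patch, switch command group functions from [=] to [&] captures and drop allow_by_ref from submissions. A minimal sketch of the resulting submission pattern follows; the buffer contents, kernel name and header path are illustrative assumptions and not taken from this patch:

#include <celerity.h>

int main() {
	celerity::distr_queue q;
	celerity::buffer<int, 1> buf(64); // hypothetical buffer, for illustration only

	// The buffer is captured into the CGF by reference; only the accessor
	// (a plain value type) is captured into the kernel lambda itself.
	q.submit([&](celerity::handler& cgh) {
		celerity::accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
		cgh.parallel_for<class write_linear_ids>(buf.get_range(), [=](celerity::item<1> it) { acc[it] = static_cast<int>(it.get_linear_id()); });
	});

	// Host tasks can capture stack variables by reference without allow_by_ref,
	// mirroring the [=, &host_buff] and [=, &my_int] captures in the updated tests.
	int sum = 0;
	q.submit([&](celerity::handler& cgh) {
		celerity::accessor acc{buf, cgh, celerity::access::all{}, celerity::read_only_host_task};
		cgh.host_task(celerity::on_master_node, [=, &sum] {
			for(size_t i = 0; i < 64; ++i) { sum += acc[i]; }
		});
	});

	q.slow_full_sync(); // wait for the host task to complete before reading sum
	return 0;
}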
@@ -349,11 +349,11 @@ namespace detail { experimental::host_object obj; distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect eff{obj, cgh}; cgh.host_task(experimental::collective, [=](experimental::collective_partition p) { *eff = static_cast(p.get_node_index()); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{buf, cgh, celerity::access::all{}, write_only_host_task, no_init}; cgh.host_task(on_master_node, [=] { acc[{1, 2, 3}] = 42; }); }); diff --git a/test/task_graph_tests.cc b/test/task_graph_tests.cc index 9c45d7025..44acdd3bf 100644 --- a/test/task_graph_tests.cc +++ b/test/task_graph_tests.cc @@ -1,4 +1,3 @@ -#include "allscale/api/user/data/grid.h" #include "task.h" #include "task_manager.h" #include "types.h" diff --git a/test/task_ring_buffer_tests.cc b/test/task_ring_buffer_tests.cc index 6ed569090..b0a2dd6a2 100644 --- a/test/task_ring_buffer_tests.cc +++ b/test/task_ring_buffer_tests.cc @@ -24,7 +24,7 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "freeing task ring buffer capacity celerity::buffer dependency{1}; for(size_t i = 0; i < task_ringbuffer_size + 10; ++i) { - q.submit(celerity::allow_by_ref, [=, &reached_ringbuffer_capacity](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{dependency, cgh, celerity::access::all{}, celerity::read_write_host_task}; cgh.host_task(celerity::on_master_node, [=, &reached_ringbuffer_capacity] { while(!reached_ringbuffer_capacity.load()) diff --git a/test/test_main.cc b/test/test_main.cc index 853e3c924..e9ea739a2 100644 --- a/test/test_main.cc +++ b/test/test_main.cc @@ -26,6 +26,9 @@ int main(int argc, char* argv[]) { struct global_setup_and_teardown : Catch::EventListenerBase { using EventListenerBase::EventListenerBase; + + void testRunStarting(const Catch::TestRunInfo& /* info */) override { celerity::detail::closure_hydrator::make_available(); } + void testCasePartialEnded(const Catch::TestCaseStats&, uint64_t) override { // Reset REQUIRE_LOOP after each test case, section or generator value. 
celerity::test_utils::require_loop_assertion_registry::get_instance().reset(); diff --git a/test/test_utils.h b/test/test_utils.h index 0ebdff599..8a3737d67 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -158,10 +158,7 @@ namespace test_utils { public: template void get_access(handler& cgh, Functor rmfn) { - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = dynamic_cast(cgh); // No live pass in tests - prepass_cgh.add_requirement(m_id, std::make_unique>(rmfn, Mode, m_size)); - } + (void)detail::add_requirement(cgh, m_id, std::make_unique>(rmfn, Mode, m_size)); } detail::buffer_id get_id() const { return m_id; } @@ -179,12 +176,7 @@ namespace test_utils { class mock_host_object { public: - void add_side_effect(handler& cgh, const experimental::side_effect_order order) { - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = static_cast(cgh); - prepass_cgh.add_requirement(m_id, order); - } - } + void add_side_effect(handler& cgh, const experimental::side_effect_order order) { (void)detail::add_requirement(cgh, m_id, order); } detail::host_object_id get_id() const { return m_id; } @@ -412,7 +404,7 @@ namespace test_utils { template void add_reduction(handler& cgh, mock_reduction_factory& mrf, const mock_buffer& vars, bool include_current_buffer_value) { - static_cast(cgh).add_reduction(mrf.create_reduction(vars.get_id(), include_current_buffer_value)); + detail::add_reduction(cgh, mrf.create_reduction(vars.get_id(), include_current_buffer_value)); } // This fixture (or a subclass) must be used by all tests that transitively use MPI.