From 0a743c7bec9091f2c85eb87bedfca7104e33db9a Mon Sep 17 00:00:00 2001
From: Philip Salzmann
Date: Tue, 11 Apr 2023 14:39:42 +0200
Subject: [PATCH] Remove multi-pass mechanism (invoke CGFs only once)

- Instead of storing the entire CGF, we now only store the inner "command function" lambda
- Accessors captured into command function closures are "hydrated" before launching the kernel
- Buffers and host objects can (and should) now be captured by reference into CGFs; added deprecation warnings
- Accessors, side-effects and reductions may now be created from non-const buffers
- Deprecate allow_by_ref
- Introduce a new mechanism to tie buffer and host object lifetimes to tasks (required as we no longer store captured copies inside the CGF)

Other changes:
- Fix a bug in test "horizons correctly deal with antidependencies", which relied on a fixed order of dependencies
- Change output of "command graph printing is unchanged" smoke test (again due to the change in ordering)
---
 examples/convolution/convolution.cc | 6 +-
 examples/distr_io/distr_io.cc | 12 +-
 examples/matmul/matmul.cc | 6 +-
 examples/reduction/reduction.cc | 8 +-
 examples/syncing/syncing.cc | 4 +-
 examples/wave_sim/wave_sim.cc | 12 +-
 include/accessor.h | 262 ++++++-----
 include/buffer.h | 37 +-
 include/closure_hydrator.h | 157 +++++++
 include/distr_queue.h | 27 +-
 include/handler.h | 666 +++++++++++++---------------
 include/host_object.h | 25 +-
 include/lifetime_extending_state.h | 43 ++
 include/runtime.h | 6 +-
 include/side_effect.h | 36 +-
 include/task.h | 100 +++--
 include/task_manager.h | 6 +-
 include/types.h | 1 +
 src/executor.cc | 3 +
 src/graph_generator.cc | 2 +-
 src/print_graph.cc | 2 +-
 src/runtime.cc | 9 +
 src/task.cc | 47 +-
 src/task_manager.cc | 2 +-
 src/worker_job.cc | 50 ++-
 test/accessor_tests.cc | 193 ++++++--
 test/buffer_manager_test_utils.h | 26 +-
 test/buffer_manager_tests.cc | 55 +--
 test/graph_compaction_tests.cc | 2 +-
 test/integration/backend.cc | 8 +-
 test/print_graph_tests.cc | 6 +-
 test/runtime_deprecation_tests.cc | 27 ++
 test/runtime_tests.cc | 156 +++++--
 test/system/distr_tests.cc | 46 +-
 test/task_graph_tests.cc | 1 -
 test/task_ring_buffer_tests.cc | 2 +-
 test/test_main.cc | 3 +
 test/test_utils.h | 14 +-
 38 files changed, 1309 insertions(+), 759 deletions(-)
 create mode 100644 include/closure_hydrator.h
 create mode 100644 include/lifetime_extending_state.h
diff --git a/examples/convolution/convolution.cc b/examples/convolution/convolution.cc index 4461ba5e1..9b3972a23 100644 --- a/examples/convolution/convolution.cc +++ b/examples/convolution/convolution.cc @@ -60,7 +60,7 @@ int main(int argc, char* argv[]) { celerity::buffer gaussian_mat_buf(gaussian_matrix.data(), celerity::range<2>(filter_size, filter_size)); // Do a gaussian blur - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor in{image_input_buf, cgh, celerity::access::neighborhood{filter_size / 2, filter_size / 2}, celerity::read_only}; celerity::accessor gauss{gaussian_mat_buf, cgh, celerity::access::all{}, celerity::read_only}; celerity::accessor out{image_tmp_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) { celerity::buffer image_output_buf(celerity::range<2>(image_height, image_width)); // Now apply a sharpening kernel - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor in{image_tmp_buf, cgh, celerity::access::neighborhood{1, 1}, celerity::read_only};
celerity::accessor out{image_output_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; @@ -107,7 +107,7 @@ int main(int argc, char* argv[]) { }); }); - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor out{image_output_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::on_master_node, [=] { diff --git a/examples/distr_io/distr_io.cc b/examples/distr_io/distr_io.cc index e56c6a81a..1560ce2f7 100644 --- a/examples/distr_io/distr_io.cc +++ b/examples/distr_io/distr_io.cc @@ -27,8 +27,8 @@ static std::pair allocation_window_to_dataspace(const celerity::bu } -static void read_hdf5_file(celerity::distr_queue& q, const celerity::buffer& buffer, const char* file_name) { - q.submit([=](celerity::handler& cgh) { +static void read_hdf5_file(celerity::distr_queue& q, celerity::buffer& buffer, const char* file_name) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{buffer, cgh, celerity::experimental::access::even_split<2>{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(celerity::experimental::collective, [=](celerity::experimental::collective_partition part) { auto plist = H5Pcreate(H5P_FILE_ACCESS); @@ -54,8 +54,8 @@ static void read_hdf5_file(celerity::distr_queue& q, const celerity::buffer& buffer, const char* file_name) { - q.submit([=](celerity::handler& cgh) { +static void write_hdf5_file(celerity::distr_queue& q, celerity::buffer& buffer, const char* file_name) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{buffer, cgh, celerity::experimental::access::even_split<2>{}, celerity::read_only_host_task}; cgh.host_task(celerity::experimental::collective, [=](celerity::experimental::collective_partition part) { auto plist = H5Pcreate(H5P_FILE_ACCESS); @@ -116,7 +116,7 @@ int main(int argc, char* argv[]) { read_hdf5_file(q, in, argv[2]); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{in, cgh, celerity::access::one_to_one{}, celerity::read_only}; celerity::accessor b{out, cgh, transposed, celerity::write_only, celerity::no_init}; cgh.parallel_for(celerity::range<2>{N, N}, [=](celerity::item<2> item) { @@ -139,7 +139,7 @@ int main(int argc, char* argv[]) { read_hdf5_file(q, left, argv[2]); read_hdf5_file(q, right, argv[3]); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{left, cgh, celerity::access::all{}, celerity::read_only_host_task}; celerity::accessor b{right, cgh, celerity::access::all{}, celerity::read_only_host_task}; celerity::accessor e{equal, cgh, celerity::write_only_host_task, celerity::no_init}; diff --git a/examples/matmul/matmul.cc b/examples/matmul/matmul.cc index dce991534..a8a5eb52d 100644 --- a/examples/matmul/matmul.cc +++ b/examples/matmul/matmul.cc @@ -6,7 +6,7 @@ const size_t MAT_SIZE = 1024; template void set_identity(celerity::distr_queue queue, celerity::buffer mat) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor dw{mat, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(mat.get_range(), [=](celerity::item<2> item) { dw[item] = item[0] == item[1]; }); }); @@ -14,7 +14,7 @@ void set_identity(celerity::distr_queue queue, celerity::buffer mat) { template void multiply(celerity::distr_queue queue, celerity::buffer mat_a, celerity::buffer mat_b, celerity::buffer mat_c) { - 
queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor a{mat_a, cgh, celerity::access::slice<2>(1), celerity::read_only}; celerity::accessor b{mat_b, cgh, celerity::access::slice<2>(0), celerity::read_only}; celerity::accessor c{mat_c, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; @@ -48,7 +48,7 @@ void multiply(celerity::distr_queue queue, celerity::buffer mat_a, celerit // TODO this should really reduce into a buffer on the device, but not all backends currently support reductions template void verify(celerity::distr_queue& queue, celerity::buffer mat_c, celerity::experimental::host_object passed_obj) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor c{mat_c, cgh, celerity::access::one_to_one{}, celerity::read_only_host_task}; celerity::experimental::side_effect passed{passed_obj, cgh}; diff --git a/examples/reduction/reduction.cc b/examples/reduction/reduction.cc index 6671463f0..28266b85c 100644 --- a/examples/reduction/reduction.cc +++ b/examples/reduction/reduction.cc @@ -73,7 +73,7 @@ int main(int argc, char* argv[]) { celerity::buffer lab_buf{image_size}; celerity::buffer minmax_buf{celerity::range{1}}; - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::one_to_one{}, celerity::read_only}; celerity::accessor rgb_acc{lab_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; auto minmax_r = celerity::reduction(minmax_buf, cgh, minmax_identity, minmax, celerity::property::reduction::initialize_to_identity{}); @@ -86,12 +86,12 @@ int main(int argc, char* argv[]) { }); }); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor minmax_acc{minmax_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::on_master_node, [=] { printf("Before contrast stretch: min = %f, max = %f\n", minmax_acc[0][0], minmax_acc[0][1]); }); }); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor rgb_acc{lab_buf, cgh, celerity::access::one_to_one{}, celerity::read_only}; celerity::accessor minmax_acc{minmax_buf, cgh, celerity::access::all{}, celerity::read_only}; celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) { }); }); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::on_master_node, [=] { stbi_write_jpg("output.jpg", image_width, image_height, 4, srgb_255_acc.get_pointer(), 90); }); }); diff --git a/examples/syncing/syncing.cc b/examples/syncing/syncing.cc index 4ddb2ccad..84f085cb5 100644 --- a/examples/syncing/syncing.cc +++ b/examples/syncing/syncing.cc @@ -9,14 +9,14 @@ int main(int argc, char* argv[]) { celerity::buffer buf(buf_size); // Initialize buffer in a distributed device kernel - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor b{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(buf.get_range(), [=](celerity::item<1> item) { b[item] = item.get_linear_id(); }); }); // Process values on the host std::vector 
host_buf(buf_size); - queue.submit(celerity::allow_by_ref, [=, &host_buf](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor b{buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::experimental::collective, [=, &host_buf](celerity::experimental::collective_partition) { std::this_thread::sleep_for(std::chrono::milliseconds(100)); // give the synchronization more time to fail diff --git a/examples/wave_sim/wave_sim.cc b/examples/wave_sim/wave_sim.cc index 6be99ab1a..49356ef47 100644 --- a/examples/wave_sim/wave_sim.cc +++ b/examples/wave_sim/wave_sim.cc @@ -6,7 +6,7 @@ #include void setup_wave(celerity::distr_queue& queue, celerity::buffer u, sycl::float2 center, float amplitude, sycl::float2 sigma) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor dw_u{u, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(u.get_range(), [=, c = center, a = amplitude, s = sigma](celerity::item<2> item) { const float dx = item[1] - c.x(); @@ -17,7 +17,7 @@ void setup_wave(celerity::distr_queue& queue, celerity::buffer u, sycl } void zero(celerity::distr_queue& queue, celerity::buffer buf) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor dw_buf{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(buf.get_range(), [=](celerity::item<2> item) { dw_buf[item] = 0.f; }); }); @@ -37,7 +37,7 @@ struct update_config { template void step(celerity::distr_queue& queue, celerity::buffer up, celerity::buffer u, float dt, sycl::float2 delta) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor rw_up{up, cgh, celerity::access::one_to_one{}, celerity::read_write}; celerity::accessor r_u{u, cgh, celerity::access::neighborhood{1, 1}, celerity::read_only}; @@ -64,7 +64,7 @@ void update(celerity::distr_queue& queue, celerity::buffer up, celerit } void stream_open(celerity::distr_queue& queue, size_t N, size_t num_samples, celerity::experimental::host_object os) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::experimental::side_effect os_eff{os, cgh}; cgh.host_task(celerity::on_master_node, [=] { os_eff->open("wave_sim_result.bin", std::ios_base::out | std::ios_base::binary); @@ -77,7 +77,7 @@ void stream_open(celerity::distr_queue& queue, size_t N, size_t num_samples, cel template void stream_append(celerity::distr_queue& queue, celerity::buffer up, celerity::experimental::host_object os) { const auto range = up.get_range(); - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::accessor up_r{up, cgh, celerity::access::all{}, celerity::read_only_host_task}; celerity::experimental::side_effect os_eff{os, cgh}; cgh.host_task(celerity::on_master_node, [=] { os_eff->write(reinterpret_cast(up_r.get_pointer()), range.size() * sizeof(T)); }); @@ -85,7 +85,7 @@ void stream_append(celerity::distr_queue& queue, celerity::buffer up, cele } void stream_close(celerity::distr_queue& queue, celerity::experimental::host_object os) { - queue.submit([=](celerity::handler& cgh) { + queue.submit([&](celerity::handler& cgh) { celerity::experimental::side_effect os_eff{os, cgh}; cgh.host_task(celerity::on_master_node, [=] { os_eff->close(); }); }); diff --git a/include/accessor.h b/include/accessor.h index 
70766d2c5..ba42eef0e 100644 --- a/include/accessor.h +++ b/include/accessor.h @@ -7,6 +7,7 @@ #include "access_modes.h" #include "buffer.h" #include "buffer_storage.h" +#include "closure_hydrator.h" #include "handler.h" #include "sycl_wrappers.h" @@ -153,6 +154,8 @@ class buffer_allocation_window { friend class accessor; }; +#define CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR [[deprecated("Creating accessor from const buffer is deprecated, capture buffer by reference instead")]] + /** * Celerity wrapper around SYCL accessors. * @@ -171,47 +174,70 @@ class accessor : public detail::accessor_base accessor() noexcept = default; template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn) : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} + accessor(buffer& buff, handler& cgh, const Functor& rmfn) : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) + accessor(buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, + accessor(buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, const property::no_init& /* no_init */) : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} - template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, - const property_list& /* prop_list */) { - static_assert(detail::constexpr_false, - "Currently it is not accepted to pass a property list to an accessor constructor. Please use the property celerity::no_init " - "as a last argument in the constructor"); - } - template = 0> - accessor(const buffer& buff, handler& cgh) : accessor(buff, cgh, access::all()) {} + accessor(buffer& buff, handler& cgh) : accessor(buff, cgh, access::all()) {} template = 0> - accessor(const buffer& buff, handler& cgh, const detail::access_tag tag) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag) : accessor(buff, cgh, access::all(), tag) {} template = 0> - accessor(const buffer& buff, handler& cgh, const detail::access_tag tag, const property::no_init& no_init) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag, const property::no_init& no_init) : accessor(buff, cgh, access::all(), tag, no_init) {} template = 0> - accessor( - const buffer& buff, handler& cgh, const detail::access_tag tag, const property_list& prop_list) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag, const property_list& prop_list) : accessor(buff, cgh, access::all(), tag, prop_list) {} + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn) + : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor( + const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) + : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn, + const detail::access_tag /* tag */, const property::no_init& /* no_init */) + : accessor(ctor_internal_tag(), buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn, + const detail::access_tag /* tag */, const 
property_list& /* prop_list */) { + static_assert(detail::constexpr_false, + "Currently it is not accepted to pass a property list to an accessor constructor. Please use the property celerity::no_init " + "as a last argument in the constructor"); + } + // explicitly defaulted because we define operator=(value_type) for Dims == 0 - accessor(const accessor&) = default; accessor(accessor&&) noexcept = default; - accessor& operator=(const accessor&) = default; accessor& operator=(accessor&&) noexcept = default; +#if !defined(__SYCL_DEVICE_ONLY__) + accessor(const accessor& other) { copy_and_hydrate(other); } + + accessor& operator=(const accessor& other) { + if(this != &other) { copy_and_hydrate(other); } + return *this; + } +#else + accessor(const accessor&) = default; + accessor& operator=(const accessor&) = default; +#endif + // SYCL allows assigning values to accessors directly in the 0-dimensional case template = 0> @@ -284,38 +310,45 @@ class accessor : public detail::accessor_base CELERITY_DETAIL_NO_UNIQUE_ADDRESS id m_index_offset; CELERITY_DETAIL_NO_UNIQUE_ADDRESS range m_buffer_range = detail::zero_range; + template + accessor(const ctor_internal_tag /* tag */, const buffer& buff, handler& cgh, const Functor& rmfn) { + using range_mapper = detail::range_mapper>; // decay function type to function pointer + const auto hid = detail::add_requirement(cgh, detail::get_buffer_id(buff), std::make_unique(rmfn, Mode, buff.get_range())); + detail::extend_lifetime(cgh, std::move(detail::get_lifetime_extending_state(buff))); + m_device_ptr = detail::embed_hydration_id(hid); + } + // Constructor for tests, called through accessor_testspy. - accessor(DataT* ptr, id index_offset, range buffer_range) : m_device_ptr(ptr), m_index_offset(index_offset), m_buffer_range(buffer_range) { + accessor(DataT* const ptr, const id& index_offset, const range& buffer_range) + : m_device_ptr(ptr), m_index_offset(index_offset), m_buffer_range(buffer_range) { +#if defined(__SYCL_DEVICE_ONLY__) #if CELERITY_WORKAROUND_HIPSYCL // hipSYCL does not yet implement is_device_copyable_v static_assert(std::is_trivially_copyable_v); #else static_assert(sycl::is_device_copyable_v); +#endif #endif } - template - accessor(const ctor_internal_tag /* tag */, const buffer& buff, handler& cgh, const Functor& rmfn) { - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = dynamic_cast(cgh); - using range_mapper = detail::range_mapper>; // decay function type to function pointer - prepass_cgh.add_requirement(detail::get_buffer_id(buff), std::make_unique(rmfn, Mode, buff.get_range())); - } else { - if(detail::get_handler_execution_target(cgh) != detail::execution_target::device) { - throw std::runtime_error( - "Calling accessor constructor with device target is only allowed in parallel_for tasks." - "If you want to access this buffer from within a host task, please specialize the call using one of the *_host_task tags"); + // Constructor for tests, called through accessor_testspy. 
+ accessor(const detail::hydration_id hid, const id& index_offset, const range& buffer_range) + : accessor(detail::embed_hydration_id(hid), index_offset, buffer_range) {} + + void copy_and_hydrate(const accessor& other) { + m_device_ptr = other.m_device_ptr; + m_index_offset = other.m_index_offset; + m_buffer_range = other.m_buffer_range; + +#if !defined(__SYCL_DEVICE_ONLY__) + if(detail::is_embedded_hydration_id(m_device_ptr)) { + if(detail::closure_hydrator::is_available() && detail::closure_hydrator::get_instance().is_hydrating()) { + const auto info = detail::closure_hydrator::get_instance().get_accessor_info(detail::extract_hydration_id(m_device_ptr)); + m_device_ptr = static_cast(info.ptr); + m_index_offset = detail::id_cast(info.buffer_offset); + m_buffer_range = detail::range_cast(info.buffer_range); } - - auto& live_cgh = dynamic_cast(cgh); - // It's difficult to figure out which stored range mapper corresponds to this constructor call, which is why we just call the raw mapper manually. - const auto mapped_sr = live_cgh.apply_range_mapper(rmfn, buff.get_range()); - auto access_info = - detail::runtime::get_instance().get_buffer_manager().access_device_buffer(detail::get_buffer_id(buff), Mode, mapped_sr); - - m_device_ptr = static_cast(access_info.ptr); - m_index_offset = detail::id_cast(access_info.backing_buffer_offset); - m_buffer_range = detail::range_cast(access_info.backing_buffer_range); } +#endif } size_t get_linear_offset(const id& index) const { return detail::get_linear_index(m_buffer_range, index - m_index_offset); } @@ -325,82 +358,81 @@ template class accessor : public detail::accessor_base { friend struct detail::accessor_testspy; + struct ctor_internal_tag {}; + public: static_assert(Mode != access_mode::atomic, "access_mode::atomic is not supported."); accessor() noexcept = default; template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn) { - static_assert(!std::is_same_v>, "The accessor constructor overload for master-access tasks (now called 'host tasks') has " - "been removed with Celerity 0.2.0. Please provide a range mapper instead."); - - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = dynamic_cast(cgh); - prepass_cgh.add_requirement(detail::get_buffer_id(buff), std::make_unique>(rmfn, Mode, buff.get_range())); - } else { - if(detail::get_handler_execution_target(cgh) != detail::execution_target::host) { - throw std::runtime_error( - "Calling accessor constructor with host_buffer target is only allowed in host tasks." - "If you want to access this buffer from within a parallel_for task, please specialize the call using one of the non host tags"); - } - auto& live_cgh = dynamic_cast(cgh); - // It's difficult to figure out which stored range mapper corresponds to this constructor call, which is why we just call the raw mapper - // manually. 
- const auto sr = live_cgh.apply_range_mapper(rmfn, buff.get_range()); - auto access_info = detail::runtime::get_instance().get_buffer_manager().access_host_buffer(detail::get_buffer_id(buff), Mode, sr); - - m_mapped_subrange = sr; - m_index_offset = detail::id_cast(access_info.backing_buffer_offset); - m_buffer_range = detail::range_cast(access_info.backing_buffer_range); - m_virtual_buffer_range = buff.get_range(); - m_host_ptr = static_cast(access_info.ptr); - } - } + accessor(buffer& buff, handler& cgh, const Functor& rmfn) : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) - : accessor(buff, cgh, rmfn) {} + accessor(buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} /** * TODO: As of ComputeCpp 2.5.0 they do not support no_init prop, hence this constructor is needed along with discard deduction guide. * but once they do this should be replace for a constructor that takes a prop list as an argument. */ template > - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, + accessor(buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, const property::no_init& /* no_init */) - : accessor(buff, cgh, rmfn) {} - - template - accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */, - const property_list& /* prop_list */) { - static_assert(detail::constexpr_false, - "Currently it is not accepted to pass a property list to an accessor constructor. Please use the property celerity::no_init " - "as a last argument in the constructor"); - } + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} template = 0> - accessor(const buffer& buff, handler& cgh) : accessor(buff, cgh, access::all()) {} + accessor(buffer& buff, handler& cgh) : accessor(buff, cgh, access::all()) {} template = 0> - accessor(const buffer& buff, handler& cgh, const detail::access_tag tag) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag) : accessor(buff, cgh, access::all(), tag) {} template = 0> - accessor(const buffer& buff, handler& cgh, const detail::access_tag tag, const property::no_init& no_init) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag, const property::no_init& no_init) : accessor(buff, cgh, access::all(), tag, no_init) {} template = 0> - accessor( - const buffer& buff, handler& cgh, const detail::access_tag tag, const property_list& prop_list) + accessor(buffer& buff, handler& cgh, const detail::access_tag tag, const property_list& prop_list) : accessor(buff, cgh, access::all(), tag, prop_list) {} + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn) + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor( + const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag /* tag */) + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} + + /** + * TODO: As of ComputeCpp 2.5.0 they do not support no_init prop, hence this constructor is needed along with discard deduction guide. + * but once they do this should be replace for a constructor that takes a prop list as an argument. 
+ */ + template > + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn, + const detail::access_tag /* tag */, const property::no_init& /* no_init */) + : accessor(ctor_internal_tag{}, buff, cgh, rmfn) {} + + template + CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR accessor(const buffer& buff, handler& cgh, const Functor& rmfn, + const detail::access_tag /* tag */, const property_list& /* prop_list */) { + static_assert(detail::constexpr_false, + "Currently it is not accepted to pass a property list to an accessor constructor. Please use the property celerity::no_init " + "as a last argument in the constructor"); + } + // explicitly defaulted because we define operator=(value_type) for Dims == 0 - accessor(const accessor&) = default; accessor(accessor&&) noexcept = default; - accessor& operator=(const accessor&) = default; accessor& operator=(accessor&&) noexcept = default; + accessor(const accessor& other) { copy_and_hydrate(other); } + + accessor& operator=(const accessor& other) { + if(this != &other) { copy_and_hydrate(other); } + return *this; + } + // SYCL allows assigning values to accessors directly in the 0-dimensional case template = 0> @@ -560,15 +592,49 @@ class accessor : public detail::accessor_b // m_host_ptr must be defined *last* for it to overlap with the sequence of range and id members in the 0-dimensional case CELERITY_DETAIL_NO_UNIQUE_ADDRESS DataT* m_host_ptr = nullptr; + template + accessor(ctor_internal_tag /* tag */, const buffer& buff, handler& cgh, const Functor& rmfn) : m_virtual_buffer_range(buff.get_range()) { + using range_mapper = detail::range_mapper>; // decay function type to function pointer + const auto hid = detail::add_requirement(cgh, detail::get_buffer_id(buff), std::make_unique(rmfn, Mode, buff.get_range())); + detail::extend_lifetime(cgh, std::move(detail::get_lifetime_extending_state(buff))); + m_host_ptr = detail::embed_hydration_id(hid); + } + // Constructor for tests, called through accessor_testspy. - accessor(subrange mapped_subrange, DataT* ptr, id backing_buffer_offset, range backing_buffer_range, range virtual_buffer_range) + accessor(const subrange mapped_subrange, DataT* const ptr, const id& backing_buffer_offset, const range& backing_buffer_range, + const range& virtual_buffer_range) : m_mapped_subrange(mapped_subrange), m_index_offset(backing_buffer_offset), m_buffer_range(backing_buffer_range), m_virtual_buffer_range(virtual_buffer_range), m_host_ptr(ptr) {} + // Constructor for tests, called through accessor_testspy. 
+ accessor(const subrange& mapped_subrange, const detail::hydration_id hid, const id& backing_buffer_offset, + const range& backing_buffer_range, range virtual_buffer_range) + : accessor(mapped_subrange, detail::embed_hydration_id(hid), backing_buffer_offset, backing_buffer_range, virtual_buffer_range) {} + + void copy_and_hydrate(const accessor& other) { + m_mapped_subrange = other.m_mapped_subrange; + m_host_ptr = other.m_host_ptr; + m_index_offset = other.m_index_offset; + m_buffer_range = other.m_buffer_range; + m_virtual_buffer_range = other.m_virtual_buffer_range; + + if(detail::is_embedded_hydration_id(m_host_ptr)) { + if(detail::closure_hydrator::is_available() && detail::closure_hydrator::get_instance().is_hydrating()) { + const auto info = detail::closure_hydrator::get_instance().get_accessor_info(detail::extract_hydration_id(m_host_ptr)); + m_host_ptr = static_cast(info.ptr); + m_index_offset = detail::id_cast(info.buffer_offset); + m_buffer_range = detail::range_cast(info.buffer_range); + m_mapped_subrange = detail::subrange_cast(info.accessor_sr); + } + } + } + size_t get_linear_offset(const id& index) const { return detail::get_linear_index(m_buffer_range, index - m_index_offset); } }; +#undef CELERITY_DETAIL_ACCESSOR_DEPRECATED_CTOR +// TODO: Make buffer non-const once corresponding (deprecated!) constructor overloads are removed template accessor(const buffer& buff, handler& cgh, const Functor& rmfn, const detail::access_tag tag) -> accessor; @@ -597,6 +663,7 @@ class local_accessor { friend struct detail::accessor_testspy; static_assert(Dims <= 3); + friend struct detail::accessor_testspy; private: constexpr static int sycl_dims = std::max(1, Dims); @@ -618,17 +685,15 @@ class local_accessor { template > local_accessor(handler& cgh) : local_accessor(range<0>(), cgh) {} -#if !defined(__SYCL_DEVICE_ONLY__) && !defined(SYCL_DEVICE_ONLY) - local_accessor(const range& allocation_size, handler& cgh) : m_sycl_acc{}, m_allocation_size(allocation_size) { - if(!detail::is_prepass_handler(cgh)) { - auto& device_handler = dynamic_cast(cgh); - m_eventual_sycl_cgh = device_handler.get_eventual_sycl_cgh(); - } - } +#if !defined(__SYCL_DEVICE_ONLY__) + local_accessor(const range& allocation_size, handler& cgh) : m_sycl_acc{}, m_allocation_size(allocation_size) {} local_accessor(const local_accessor& other) - : m_sycl_acc(other.sycl_cgh() ? sycl_accessor{other.sycl_allocation_size(), *other.sycl_cgh()} : other.m_sycl_acc), - m_allocation_size(other.m_allocation_size), m_eventual_sycl_cgh(other.sycl_cgh() ? nullptr : other.m_eventual_sycl_cgh) {} + : m_sycl_acc( + detail::closure_hydrator::is_available() && detail::closure_hydrator::get_instance().is_hydrating() && other.sycl_allocation_size().size() > 0 + ? sycl_accessor{other.sycl_allocation_size(), detail::closure_hydrator::get_instance().get_sycl_handler()} + : other.m_sycl_acc), + m_allocation_size(other.m_allocation_size) {} #else local_accessor(const range& allocation_size, handler& cgh); local_accessor(const local_accessor&) = default; @@ -696,12 +761,11 @@ class local_accessor { private: sycl_accessor m_sycl_acc; CELERITY_DETAIL_NO_UNIQUE_ADDRESS range m_allocation_size; - cl::sycl::handler* const* m_eventual_sycl_cgh = nullptr; - // TODO after multi-pass removal: verify that sizeof(celerity::local_accessor) == sizeof(sycl::local_accessor) in accessor_tests (currently [!shouldfail]) - - cl::sycl::handler* sycl_cgh() const { return m_eventual_sycl_cgh != nullptr ? 
*m_eventual_sycl_cgh : nullptr; } sycl::range sycl_allocation_size() const { return sycl::range(detail::range_cast(m_allocation_size)); } + + // Constructor for tests, called through accessor_testspy. + explicit local_accessor(const range& allocation_size) : m_sycl_acc{}, m_allocation_size(allocation_size) {} }; } // namespace celerity diff --git a/include/buffer.h b/include/buffer.h index a9ed29862..38f933a40 100644 --- a/include/buffer.h +++ b/include/buffer.h @@ -6,6 +6,7 @@ #include #include "buffer_manager.h" +#include "lifetime_extending_state.h" #include "range_mapper.h" #include "ranges.h" #include "runtime.h" @@ -36,17 +37,14 @@ template class accessor; template -class buffer { +class buffer final : public detail::lifetime_extending_state_wrapper { public: static_assert(Dims <= 3); template > buffer() : buffer(nullptr, {}) {} - explicit buffer(const DataT* host_ptr, range range) { - if(!detail::runtime::is_initialized()) { detail::runtime::init(nullptr, nullptr); } - m_impl = std::make_shared(range, host_ptr); - } + explicit buffer(const DataT* host_ptr, range range) : m_impl(std::make_shared(range, host_ptr)) {} explicit buffer(range range) : buffer(nullptr, range) {} @@ -62,35 +60,52 @@ class buffer { ~buffer() {} template 0), int> = 0> - accessor get_access(handler& cgh, Functor rmfn) const { + accessor get_access(handler& cgh, Functor rmfn) { return get_access(cgh, rmfn); } template = 0> - accessor get_access(handler& cgh) const { + accessor get_access(handler& cgh) { return get_access(cgh); } template 0), int> = 0> - accessor get_access(handler& cgh, Functor rmfn) const { + accessor get_access(handler& cgh, Functor rmfn) { return accessor(*this, cgh, rmfn); } template = 0> - accessor get_access(handler& cgh) const { + accessor get_access(handler& cgh) { return accessor(*this, cgh); } + template 0), int> = 0> + [[deprecated("Calling get_access on a const buffer is deprecated")]] accessor get_access( + handler& cgh, Functor rmfn) const { + return get_access(cgh, rmfn); + } + + template 0), int> = 0> + [[deprecated("Calling get_access on a const buffer is deprecated")]] accessor get_access(handler& cgh, Functor rmfn) const { + return accessor(*this, cgh, rmfn); + } + const range& get_range() const { return m_impl->range; } + protected: + std::shared_ptr get_lifetime_extending_state() const override { return m_impl; } + private: - struct impl { + struct impl final : public detail::lifetime_extending_state { impl(range rng, const DataT* host_init_ptr) : range(rng) { + if(!detail::runtime::is_initialized()) { detail::runtime::init(nullptr, nullptr); } id = detail::runtime::get_instance().get_buffer_manager().register_buffer(detail::range_cast<3>(range), host_init_ptr); } impl(const impl&) = delete; impl(impl&&) = delete; - ~impl() noexcept { detail::runtime::get_instance().get_buffer_manager().unregister_buffer(id); } + impl& operator=(const impl&) = delete; + impl& operator=(impl&&) = delete; + ~impl() override { detail::runtime::get_instance().get_buffer_manager().unregister_buffer(id); } detail::buffer_id id; celerity::range range; std::string debug_name; diff --git a/include/closure_hydrator.h b/include/closure_hydrator.h new file mode 100644 index 000000000..554ebc7a0 --- /dev/null +++ b/include/closure_hydrator.h @@ -0,0 +1,157 @@ +#pragma once + +#include +#include +#include + +#include "log.h" +#include "ranges.h" +#include "sycl_wrappers.h" +#include "types.h" + +namespace celerity::detail { + +// To avoid additional register pressure, we embed hydration IDs into pointers 
for +// accessors, with the assumption that a real pointer will never be in the +// range [0, max_hydration_id]. Embedding / extracting are currently no-ops +// and the associated helper functions only exist for documentation purposes. +// This number puts an effective limit on the number of task objects (accessors +// etc.) that can be captured into a command function. +constexpr size_t max_hydration_id = 128; + +template +constexpr bool can_embed_hydration_id = std::bool_constant::value; + +template +T embed_hydration_id(const hydration_id hid) { + static_assert(can_embed_hydration_id); + assert(hid > 0); // Has to be greater than zero so nullptr is not considered an embedded id + assert(hid <= max_hydration_id); + T result; + std::memcpy(&result, &hid, sizeof(hid)); + return result; +} + +template +hydration_id extract_hydration_id(const T value) { + static_assert(can_embed_hydration_id); + hydration_id result; + std::memcpy(&result, &value, sizeof(value)); + return result; +} + +template +bool is_embedded_hydration_id(const T value) { + static_assert(can_embed_hydration_id); + const auto hid = extract_hydration_id(value); + return hid > 0 && hid <= max_hydration_id; +} + +/** + * The closure hydrator is used to inject information into objects (currently host/device and local accessors) that have been captured into a lambda closure. + * We abuse the copy constructor of the captured objects to modify them while the containing closure is being copied by the hydrate() function. + * Accessors request their corresponding information by means of per-closure unique "hydration ids" that are assigned upon accessor creation. + * + * The hydrator is implemented as a thread-local singleton that needs to be explicitly enabled per-thread. This is because kernel command function + * closures may be copied any number of times after having been passed to SYCL, which should not trigger the hydration mechanism. + */ +class closure_hydrator { + public: + struct accessor_info { + void* ptr; + range<3> buffer_range; + id<3> buffer_offset; + subrange<3> accessor_sr; + }; + + closure_hydrator(const closure_hydrator&) = delete; + closure_hydrator(closure_hydrator&&) = delete; + closure_hydrator& operator=(const closure_hydrator&) = delete; + closure_hydrator& operator=(closure_hydrator&&) = delete; + ~closure_hydrator() = default; + + static void make_available() { + assert(m_instance == nullptr); + m_instance = std::unique_ptr(new closure_hydrator()); + } + + static bool is_available() { return m_instance != nullptr; } + + static void teardown() { m_instance.reset(); } + + static closure_hydrator& get_instance() { + assert(m_instance != nullptr); + return *m_instance; + } + + /** + * Puts the hydrator into the "armed" state, after which hydrate() can be called to hydrate kernel functions. + * + * accessor_infos must contain one entry for each hydration id that has been assigned to accessors in the + * closure that is to be hydrated, in matching order. + */ + void arm(const target tgt, std::vector accessor_infos) { + assert(!m_armed_for.has_value()); + assert(accessor_infos.size() < max_hydration_id); + m_armed_for = tgt; + m_accessor_infos = std::move(accessor_infos); + } + + /** + * Hydrates the provided closure by copying it in a context where calls to get_accessor_info and get_sycl_handler are allowed. 
+ */ + template = 0> + [[nodiscard]] auto hydrate(sycl::handler& cgh, const Closure& closure) { + return hydrate(target::device, &cgh, closure); + } + + /** + * Hydrates the provided closure by copying it in a context where calls to get_accessor_info are allowed. + */ + template = 0> + [[nodiscard]] auto hydrate(const Closure& closure) { + return hydrate(target::host_task, nullptr, closure); + } + + bool is_hydrating() const { return m_is_hydrating; } + + template + accessor_info get_accessor_info(const hydration_id hid) { + assert(m_armed_for.has_value() && *m_armed_for == Tgt); + assert(!m_accessor_infos.empty()); + assert(hid > 0); + assert(hid <= m_accessor_infos.size()); + return m_accessor_infos[hid - 1]; + } + + sycl::handler& get_sycl_handler() { + assert(m_sycl_cgh != nullptr); + return *m_sycl_cgh; + } + + private: + inline static thread_local std::unique_ptr m_instance; // NOLINT(cppcoreguidelines-avoid-non-const-global-variables) + std::vector m_accessor_infos; + std::optional m_armed_for = std::nullopt; + bool m_is_hydrating = false; + sycl::handler* m_sycl_cgh = nullptr; + + closure_hydrator() = default; + + template + [[nodiscard]] auto hydrate(const target tgt, sycl::handler* cgh, const Closure& closure) { + static_assert(std::is_copy_constructible_v>); + assert(m_armed_for.has_value() && *m_armed_for == tgt); + assert(tgt == target::host_task || cgh != nullptr); + m_sycl_cgh = cgh; + m_is_hydrating = true; + Closure hydrated{closure}; + m_is_hydrating = false; + m_sycl_cgh = nullptr; + m_accessor_infos.clear(); + m_armed_for = std::nullopt; + return hydrated; + } +}; + +}; // namespace celerity::detail \ No newline at end of file diff --git a/include/distr_queue.h b/include/distr_queue.h index 53d8992e5..7a00e8f65 100644 --- a/include/distr_queue.h +++ b/include/distr_queue.h @@ -15,14 +15,14 @@ namespace detail { ~distr_queue_tracker() { runtime::get_instance().shutdown(); } }; - template - constexpr bool is_safe_cgf = std::is_standard_layout::value; - } // namespace detail -struct allow_by_ref_t {}; +struct [[deprecated("This tag type is no longer required to capture by reference")]] allow_by_ref_t{}; +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" inline constexpr allow_by_ref_t allow_by_ref{}; +#pragma GCC diagnostic pop class distr_queue { public: @@ -51,28 +51,19 @@ class distr_queue { * Submits a command group to the queue. * * Invoke via `q.submit(celerity::allow_by_ref, [&](celerity::handler &cgh) {...})`. - * - * With this overload, CGF may capture by-reference. This may lead to lifetime issues with asynchronous execution, so using the `submit(cgf)` overload is - * preferred in the common case. */ template - void submit(allow_by_ref_t, CGF cgf) { // NOLINT(readability-convert-member-functions-to-static) - // (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.) - detail::runtime::get_instance().get_task_manager().submit_command_group(std::move(cgf)); + [[deprecated("This overload is no longer required to capture by reference")]] void submit(allow_by_ref_t /* tag */, CGF cgf) { + submit(std::move(cgf)); } /** * Submits a command group to the queue. - * - * CGF must not capture by reference. This is a conservative safety check to avoid lifetime issues when command groups are executed asynchronously. - * - * If you know what you are doing, you can use the `allow_by_ref` overload of `submit` to bypass this check. 
*/ template - void submit(CGF cgf) { - static_assert(detail::is_safe_cgf, "The provided command group function is not multi-pass execution safe. Please make sure to only capture by " - "value. If you know what you're doing, use submit(celerity::allow_by_ref, ...)."); - submit(allow_by_ref, std::move(cgf)); + void submit(CGF cgf) { // NOLINT(readability-convert-member-functions-to-static) + // (Note while this function could be made static, it must not be! Otherwise we can't be sure the runtime has been initialized.) + detail::runtime::get_instance().get_task_manager().submit_command_group(std::move(cgf)); } /** diff --git a/include/handler.h b/include/handler.h index 2ed53e644..7a472a585 100644 --- a/include/handler.h +++ b/include/handler.h @@ -1,13 +1,16 @@ #pragma once +#include #include #include #include +#include #include #include #include "buffer.h" +#include "closure_hydrator.h" #include "device_queue.h" #include "host_queue.h" #include "item.h" @@ -30,10 +33,13 @@ class handler; namespace detail { class device_queue; class task_manager; - class prepass_handler; - inline bool is_prepass_handler(const handler& cgh); - inline execution_target get_handler_execution_target(const handler& cgh); + handler make_command_group_handler(const task_id tid, const size_t num_collective_nodes); + std::unique_ptr into_task(handler&& cgh); + hydration_id add_requirement(handler& cgh, const buffer_id bid, std::unique_ptr rm); + void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order); + void add_reduction(handler& cgh, const reduction_info& rinfo); + void extend_lifetime(handler& cgh, std::shared_ptr state); template std::string kernel_debug_name() { @@ -171,10 +177,132 @@ namespace experimental { inline constexpr collective_tag_factory collective; } // namespace experimental +namespace detail { + template + inline void invoke_kernel(const Kernel& kernel, const sycl::id& s_id, const range& global_range, const id& global_offset, + const id& chunk_offset, Reducers&... reducers) { + kernel(make_item(id_cast(id(s_id)) + chunk_offset, global_offset, global_range), reducers...); + } + + template + inline void invoke_kernel(const Kernel& kernel, const cl::sycl::nd_item& s_item, const range& global_range, + const id& global_offset, const id& chunk_offset, const range& group_range, const id& group_offset, Reducers&... reducers) { + kernel(make_nd_item(s_item, global_range, global_offset, chunk_offset, group_range, group_offset), reducers...); + } + + template + auto bind_simple_kernel(const Kernel& kernel, const range& global_range, const id& global_offset, const id& chunk_offset) { + return [=](auto s_item_or_id, auto&... reducers) { + constexpr int sycl_dims = std::max(1, Dims); + static_assert(std::is_invocable_v, decltype(reducers)...>, + "Kernel function must be invocable with celerity::item and as many reducer objects as reductions passed to parallel_for"); + if constexpr(CELERITY_WORKAROUND(DPCPP) && std::is_same_v, decltype(s_item_or_id)>) { + // CELERITY_WORKAROUND_LESS_OR_EQUAL: DPC++ passes a sycl::id instead of a sycl::item to kernels alongside reductions + invoke_kernel(kernel, s_item_or_id, global_range, global_offset, chunk_offset, reducers...); + } else { + // Explicit item constructor: ComputeCpp does not pass a sycl::item, but an implicitly convertible sycl::item_base (?) 
which does not have + // `sycl::id<> get_id()` + invoke_kernel(kernel, cl::sycl::item{s_item_or_id}.get_id(), global_range, global_offset, chunk_offset, reducers...); + } + }; + } + + template + auto bind_nd_range_kernel(const Kernel& kernel, const range& global_range, const id& global_offset, const id chunk_offset, + const range& group_range, const id& group_offset) { + return [=](sycl::nd_item s_item, auto&... reducers) { + static_assert(std::is_invocable_v, decltype(reducers)...>, + "Kernel function must be invocable with celerity::nd_item or and as many reducer objects as reductions passed to parallel_for"); + invoke_kernel(kernel, s_item, global_range, global_offset, chunk_offset, group_range, group_offset, reducers...); + }; + } + + template + inline void invoke_sycl_parallel_for(cl::sycl::handler& cgh, Params&&... args) { + static_assert(CELERITY_FEATURE_UNNAMED_KERNELS || !is_unnamed_kernel, + "Your SYCL implementation does not support unnamed kernels, add a kernel name template parameter to this parallel_for invocation"); + if constexpr(detail::is_unnamed_kernel) { +#if CELERITY_FEATURE_UNNAMED_KERNELS // see static_assert above + cgh.parallel_for(std::forward(args)...); +#endif + } else { + cgh.parallel_for>(std::forward(args)...); + } + } + + template + class reduction_descriptor; + + template + auto make_sycl_reduction(const reduction_descriptor& d, void* ptr, const bool is_initializer) { +#if !CELERITY_FEATURE_SCALAR_REDUCTIONS + static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); +#else + cl::sycl::property_list props; + if(!d.m_include_current_buffer_value || !is_initializer) { props = {cl::sycl::property::reduction::initialize_to_identity{}}; } + if constexpr(WithExplicitIdentity) { + return sycl::reduction(static_cast(ptr), d.m_identity, d.m_op, props); + } else { + return sycl::reduction(static_cast(ptr), d.m_op, props); + } +#endif + } + + template + class reduction_descriptor { + public: + reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT /* identity */, bool include_current_buffer_value) + : m_bid(bid), m_op(combiner), m_include_current_buffer_value(include_current_buffer_value) {} + + private: + friend auto make_sycl_reduction(const reduction_descriptor&, void*, const bool); + + buffer_id m_bid; + BinaryOperation m_op; + bool m_include_current_buffer_value; + }; + + template + class reduction_descriptor { + public: + reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT identity, bool include_current_buffer_value) + : m_bid(bid), m_op(combiner), m_identity(identity), m_include_current_buffer_value(include_current_buffer_value) {} + + private: + friend auto make_sycl_reduction(const reduction_descriptor&, void*, const bool); + + buffer_id m_bid; + BinaryOperation m_op; + DataT m_identity{}; + bool m_include_current_buffer_value; + }; + + template + auto make_reduction(const buffer& vars, handler& cgh, BinaryOperation op, DataT identity, const cl::sycl::property_list& prop_list) { +#if !CELERITY_FEATURE_SCALAR_REDUCTIONS + static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); +#else + if(vars.get_range().size() != 1) { + // Like SYCL 2020, Celerity only supports reductions to unit-sized buffers. This allows us to avoid tracking different parts of the buffer + // as distributed_state and pending_reduction_state. 
+ throw std::runtime_error("Only unit-sized buffers can be reduction targets"); + } + + const auto bid = detail::get_buffer_id(vars); + const auto include_current_buffer_value = !prop_list.has_property(); + + const auto rid = detail::runtime::get_instance().get_reduction_manager().create_reduction(bid, op, identity); + add_reduction(cgh, reduction_info{rid, bid, include_current_buffer_value}); + extend_lifetime(cgh, std::move(get_lifetime_extending_state(vars))); + + return detail::reduction_descriptor{bid, op, identity, include_current_buffer_value}; +#endif + } + +} // namespace detail + class handler { public: - virtual ~handler() = default; - template void parallel_for(range global_range, ReductionsAndKernel... reductions_and_kernel) { static_assert(sizeof...(reductions_and_kernel) > 0, "No kernel given"); @@ -208,7 +336,10 @@ class handler { * may require different lifetimes for captures. See `celerity::allow_by_ref` for more information on this topic. */ template - void host_task(on_master_node_tag, Functor kernel); + void host_task(on_master_node_tag /* tag */, Functor kernel) { + auto launcher = make_host_task_launcher<0, false>(detail::zero_range, 0, kernel); + create_master_node_task(std::move(launcher)); + } /** * Schedules `kernel` to be executed collectively on all nodes participating in the specified collective group. Call via @@ -224,7 +355,11 @@ class handler { * and all host kernel invocations are executed in a single thread on each host. */ template - void host_task(experimental::collective_tag tag, Functor kernel); + void host_task(experimental::collective_tag tag, Functor kernel) { + // FIXME: We should not have to know how the global range is determined for collective tasks to create the launcher + auto launcher = make_host_task_launcher<1, true>(range<3>{m_num_collective_nodes, 1, 1}, tag.m_cgid, kernel); + create_collective_task(tag.m_cgid, std::move(launcher)); + } /** * Schedules a distributed execution of `kernel` by splitting the iteration space in a runtime-defined manner. The kernel is assumed to be invocable @@ -239,7 +374,11 @@ class handler { * another node. If you need guarantees about execution order */ template - void host_task(range global_range, id global_offset, Functor kernel); + void host_task(range global_range, id global_offset, Functor kernel) { + const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), {1, 1, 1}}; + auto launcher = make_host_task_launcher(detail::range_cast<3>(global_range), 0, kernel); + create_host_compute_task(geometry, std::move(launcher)); + } /** * Like `host_task(range global_range, id global_offset, Functor kernel)`, but with a `global_offset` of zero. 
@@ -249,17 +388,25 @@ class handler { host_task(global_range, {}, task); } - protected: - friend bool detail::is_prepass_handler(const handler& cgh); - friend detail::execution_target detail::get_handler_execution_target(const handler& cgh); - - handler() = default; - - virtual bool is_prepass() const = 0; - - virtual const detail::task& get_task() const = 0; - private: + friend handler detail::make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes); + friend std::unique_ptr detail::into_task(handler&& cgh); + friend detail::hydration_id detail::add_requirement(handler& cgh, const detail::buffer_id bid, std::unique_ptr rm); + friend void detail::add_requirement(handler& cgh, const detail::host_object_id hoid, const experimental::side_effect_order order); + friend void detail::add_reduction(handler& cgh, const detail::reduction_info& rinfo); + friend void detail::extend_lifetime(handler& cgh, std::shared_ptr state); + + detail::task_id m_tid; + detail::buffer_access_map m_access_map; + detail::side_effect_map m_side_effects; + detail::reduction_set m_reductions; + std::unique_ptr m_task = nullptr; + size_t m_num_collective_nodes; + detail::hydration_id m_next_accessor_hydration_id = 1; + std::vector> m_attached_state; + + handler(detail::task_id tid, size_t num_collective_nodes) : m_tid(tid), m_num_collective_nodes(num_collective_nodes) {} + template void parallel_for_reductions_and_kernel(range global_range, id global_offset, typename detail::kernel_flavor_traits::local_size_type local_size, std::index_sequence indices, @@ -272,399 +419,212 @@ class handler { template void parallel_for_kernel_and_reductions(range global_range, id global_offset, - typename detail::kernel_flavor_traits::local_size_type local_range, Kernel& kernel, Reductions&... reductions); -}; - -namespace detail { - - inline bool is_prepass_handler(const handler& cgh) { return cgh.is_prepass(); } - inline execution_target get_handler_execution_target(const handler& cgh) { return cgh.get_task().get_execution_target(); } - - class prepass_handler final : public handler { - public: - explicit prepass_handler(task_id tid, std::unique_ptr cgf, size_t num_collective_nodes) - : m_tid(tid), m_cgf(std::move(cgf)), m_num_collective_nodes(num_collective_nodes) {} - - void add_requirement(buffer_id bid, std::unique_ptr rm) { - assert(m_task == nullptr); - m_access_map.add_access(bid, std::move(rm)); - } - - void add_requirement(const host_object_id hoid, const experimental::side_effect_order order) { - assert(m_task == nullptr); - m_side_effects.add_side_effect(hoid, order); - } - - void add_reduction(const reduction_info& rinfo) { m_reductions.push_back(rinfo); } - - void create_host_compute_task(task_geometry geometry) { - assert(m_task == nullptr); - if(geometry.global_size.size() == 0) { - // TODO this can be easily supported by not creating a task in case the execution range is empty - throw std::runtime_error{"The execution range of distributed host tasks must have at least one item"}; - } - m_task = - detail::task::make_host_compute(m_tid, geometry, std::move(m_cgf), std::move(m_access_map), std::move(m_side_effects), std::move(m_reductions)); + typename detail::kernel_flavor_traits::local_size_type local_range, Kernel& kernel, Reductions&... 
reductions) { + if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) { + static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); } - void create_device_compute_task(task_geometry geometry, std::string debug_name) { - assert(m_task == nullptr); - if(geometry.global_size.size() == 0) { - // TODO unless reductions are involved, this can be easily supported by not creating a task in case the execution range is empty. - // Edge case: If the task includes reductions that specify property::reduction::initialize_to_identity, we need to create a task that sets - // the buffer state to an empty pending_reduction_state in the graph_generator. This will cause a trivial reduction_command to be generated on - // each node that reads from the reduction output buffer, initializing it to the identity value locally. - throw std::runtime_error{"The execution range of device tasks must have at least one item"}; + range<3> granularity = {1, 1, 1}; + if constexpr(detail::kernel_flavor_traits::has_local_size) { + for(int d = 0; d < Dims; ++d) { + granularity[d] = local_range[d]; } - if(!m_side_effects.empty()) { throw std::runtime_error{"Side effects cannot be used in device kernels"}; } - m_task = - detail::task::make_device_compute(m_tid, geometry, std::move(m_cgf), std::move(m_access_map), std::move(m_reductions), std::move(debug_name)); } + const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), granularity}; + auto launcher = make_device_kernel_launcher( + global_range, global_offset, local_range, kernel, std::index_sequence_for(), reductions...); + create_device_compute_task(geometry, detail::kernel_debug_name(), std::move(launcher)); + } - void create_collective_task(collective_group_id cgid) { - assert(m_task == nullptr); - m_task = detail::task::make_collective(m_tid, cgid, m_num_collective_nodes, std::move(m_cgf), std::move(m_access_map), std::move(m_side_effects)); - } + [[nodiscard]] detail::hydration_id add_requirement(const detail::buffer_id bid, std::unique_ptr rm) { + assert(m_task == nullptr); + m_access_map.add_access(bid, std::move(rm)); + return m_next_accessor_hydration_id++; + } - void create_master_node_task() { - assert(m_task == nullptr); - m_task = detail::task::make_master_node(m_tid, std::move(m_cgf), std::move(m_access_map), std::move(m_side_effects)); - } + void add_requirement(const detail::host_object_id hoid, const experimental::side_effect_order order) { + assert(m_task == nullptr); + m_side_effects.add_side_effect(hoid, order); + } - std::unique_ptr into_task() && { return std::move(m_task); } + void add_reduction(const detail::reduction_info& rinfo) { + assert(m_task == nullptr); + m_reductions.push_back(rinfo); + } - protected: - bool is_prepass() const override { return true; } + void extend_lifetime(std::shared_ptr state) { m_attached_state.emplace_back(std::move(state)); } - const class task& get_task() const override { - assert(m_task != nullptr); - return *m_task; + void create_host_compute_task(detail::task_geometry geometry, std::unique_ptr launcher) { + assert(m_task == nullptr); + if(geometry.global_size.size() == 0) { + // TODO this can be easily supported by not creating a task in case the execution range is empty + throw std::runtime_error{"The execution range of distributed host tasks must have at least one item"}; } + m_task = + detail::task::make_host_compute(m_tid, geometry, std::move(launcher), std::move(m_access_map), 
std::move(m_side_effects), std::move(m_reductions)); + } - private: - task_id m_tid; - std::unique_ptr m_cgf; - buffer_access_map m_access_map; - side_effect_map m_side_effects; - reduction_set m_reductions; - std::unique_ptr m_task = nullptr; - size_t m_num_collective_nodes; - }; - - class live_pass_handler : public handler { - public: - bool is_prepass() const final { return false; } - - const class task& get_task() const final { return *m_task; } - - template - subrange apply_range_mapper(RangeMapper rm, const range& buffer_range) const { - return invoke_range_mapper(m_task->get_dimensions(), rm, chunk{m_sr.offset, m_sr.range, m_task->get_global_size()}, buffer_range); + void create_device_compute_task(detail::task_geometry geometry, std::string debug_name, std::unique_ptr launcher) { + assert(m_task == nullptr); + if(geometry.global_size.size() == 0) { + // TODO unless reductions are involved, this can be easily supported by not creating a task in case the execution range is empty. + // Edge case: If the task includes reductions that specify property::reduction::initialize_to_identity, we need to create a task that sets + // the buffer state to an empty pending_reduction_state in the graph_generator. This will cause a trivial reduction_command to be generated on + // each node that reads from the reduction output buffer, initializing it to the identity value locally. + throw std::runtime_error{"The execution range of device tasks must have at least one item"}; } + if(!m_side_effects.empty()) { throw std::runtime_error{"Side effects cannot be used in device kernels"}; } + m_task = + detail::task::make_device_compute(m_tid, geometry, std::move(launcher), std::move(m_access_map), std::move(m_reductions), std::move(debug_name)); + } - subrange<3> get_iteration_range() { return m_sr; } - - bool is_reduction_initializer() const { return m_initialize_reductions; } - - protected: - live_pass_handler(const class task* task, subrange<3> sr, bool initialize_reductions) - : m_task(task), m_sr(sr), m_initialize_reductions(initialize_reductions) {} + void create_collective_task(detail::collective_group_id cgid, std::unique_ptr launcher) { + assert(m_task == nullptr); + m_task = detail::task::make_collective(m_tid, cgid, m_num_collective_nodes, std::move(launcher), std::move(m_access_map), std::move(m_side_effects)); + } - const class task* m_task = nullptr; + void create_master_node_task(std::unique_ptr launcher) { + assert(m_task == nullptr); + m_task = detail::task::make_master_node(m_tid, std::move(launcher), std::move(m_access_map), std::move(m_side_effects)); + } - // The subrange, when combined with the tasks global size, defines the chunk this handler executes. - subrange<3> m_sr; + template + auto make_device_kernel_launcher(const range& global_range, const id& global_offset, + typename detail::kernel_flavor_traits::local_size_type local_range, Kernel kernel, + std::index_sequence /* indices */, Reductions... 
reductions) { + static_assert(std::is_copy_constructible_v>, "Kernel functor must be copyable"); // Required for hydration + + auto fn = [=](detail::device_queue& q, const subrange<3> execution_sr, const std::vector& reduction_ptrs, const bool is_reduction_initializer) { + return q.submit([&](sycl::handler& cgh) { + constexpr int sycl_dims = std::max(1, Dims); + // Copy once to hydrate accessors + auto hydrated_kernel = detail::closure_hydrator::get_instance().hydrate(cgh, kernel); + if constexpr(std::is_same_v) { + const auto sycl_global_range = sycl::range(detail::range_cast(execution_sr.range)); + detail::invoke_sycl_parallel_for(cgh, sycl_global_range, + detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices], is_reduction_initializer)..., + detail::bind_simple_kernel(hydrated_kernel, global_range, global_offset, detail::id_cast(execution_sr.offset))); + } else if constexpr(std::is_same_v) { + const auto sycl_global_range = sycl::range(detail::range_cast(execution_sr.range)); + const auto sycl_local_range = sycl::range(detail::range_cast(local_range)); + detail::invoke_sycl_parallel_for(cgh, cl::sycl::nd_range{sycl_global_range, sycl_local_range}, + detail::make_sycl_reduction(reductions, reduction_ptrs[ReductionIndices], is_reduction_initializer)..., + detail::bind_nd_range_kernel(hydrated_kernel, global_range, global_offset, detail::id_cast(execution_sr.offset), + global_range / local_range, detail::id_cast(execution_sr.offset) / local_range)); + } else { + static_assert(detail::constexpr_false); + } + }); + }; - bool m_initialize_reductions; - }; + return std::make_unique>(std::move(fn)); + } - class live_pass_host_handler final : public live_pass_handler { - public: - live_pass_host_handler(const class task* task, subrange<3> sr, bool initialize_reductions, host_queue& queue) - : live_pass_handler(task, sr, initialize_reductions), m_queue(&queue) {} + template + auto make_host_task_launcher(const range<3>& global_range, const detail::collective_group_id cgid, Kernel kernel) { + static_assert(std::is_copy_constructible_v>, "Kernel functor must be copyable"); // Required for hydration + static_assert(Dims >= 0); - template - void schedule(Kernel kernel) { - static_assert(Dims >= 0); - m_future = m_queue->submit(m_task->get_collective_group_id(), [kernel, global_size = m_task->get_global_size(), sr = m_sr](MPI_Comm) { + auto fn = [kernel, cgid, global_range](detail::host_queue& q, const subrange<3>& execution_sr) { + auto hydrated_kernel = detail::closure_hydrator::get_instance().hydrate(kernel); + return q.submit(cgid, [hydrated_kernel, global_range, execution_sr](MPI_Comm comm) { + (void)global_range; if constexpr(Dims > 0) { - const auto part = make_partition(range_cast(global_size), subrange_cast(sr)); - kernel(part); + if constexpr(Collective) { + static_assert(Dims == 1); + const auto part = detail::make_collective_partition(detail::range_cast<1>(global_range), detail::subrange_cast<1>(execution_sr), comm); + hydrated_kernel(part); + } else { + const auto part = detail::make_partition(detail::range_cast(global_range), detail::subrange_cast(execution_sr)); + hydrated_kernel(part); + } } else if constexpr(std::is_invocable_v&>) { - (void)sr; - const auto part = make_0d_partition(); - kernel(part); + (void)execution_sr; + const auto part = detail::make_0d_partition(); + hydrated_kernel(part); } else { - (void)sr; - kernel(); + (void)execution_sr; + hydrated_kernel(); } }); - } - - template - void schedule_collective(Kernel kernel) { - m_future = 
m_queue->submit(m_task->get_collective_group_id(), [kernel, global_size = m_task->get_global_size(), sr = m_sr](MPI_Comm comm) { - const auto part = make_collective_partition(range_cast<1>(global_size), subrange_cast<1>(sr), comm); - kernel(part); - }); - } - - std::future into_future() { return std::move(m_future); } - - private: - host_queue* m_queue; - std::future m_future; - }; - - template - inline void invoke_kernel(const Kernel& kernel, const sycl::id& s_id, const range& global_range, const id& global_offset, - const id& chunk_offset, Reducers&... reducers) { - kernel(make_item(id_cast(id(s_id)) + chunk_offset, global_offset, global_range), reducers...); - } - - template - inline void invoke_kernel(const Kernel& kernel, const cl::sycl::nd_item& s_item, const range& global_range, - const id& global_offset, const id& chunk_offset, const range& group_range, const id& group_offset, Reducers&... reducers) { - kernel(make_nd_item(s_item, global_range, global_offset, chunk_offset, group_range, group_offset), reducers...); - } - - template - auto bind_simple_kernel(const Kernel& kernel, const range& global_range, const id& global_offset, const id& chunk_offset) { - // The current mechanism for hydrating the SYCL placeholder accessors inside Celerity accessors requires that the kernel functor - // capturing those accessors is copied at least once during submission (see also live_pass_device_handler::submit_to_sycl). - // As of SYCL 2020 kernel functors are passed as const references, so we explicitly capture by value here. - return [=](auto s_item_or_id, auto&... reducers) { - constexpr int sycl_dims = std::max(1, Dims); - static_assert(std::is_invocable_v, decltype(reducers)...>, - "Kernel function must be invocable with celerity::item and as many reducer objects as reductions passed to parallel_for"); - if constexpr(CELERITY_WORKAROUND(DPCPP) && std::is_same_v, decltype(s_item_or_id)>) { - // CELERITY_WORKAROUND_LESS_OR_EQUAL: DPC++ passes a sycl::id instead of a sycl::item to kernels alongside reductions - invoke_kernel(kernel, s_item_or_id, global_range, global_offset, chunk_offset, reducers...); - } else { - // Explicit item constructor: ComputeCpp does not pass a sycl::item, but an implicitly convertible sycl::item_base (?) which does not have - // `sycl::id<> get_id()` - invoke_kernel(kernel, cl::sycl::item{s_item_or_id}.get_id(), global_range, global_offset, chunk_offset, reducers...); - } }; - } - template - auto bind_nd_range_kernel(const Kernel& kernel, const range& global_range, const id& global_offset, const id chunk_offset, - const range& group_range, const id& group_offset) { - return [=](sycl::nd_item s_item, auto&... reducers) { - static_assert(std::is_invocable_v, decltype(reducers)...>, - "Kernel function must be invocable with celerity::nd_item or and as many reducer objects as reductions passed to parallel_for"); - invoke_kernel(kernel, s_item, global_range, global_offset, chunk_offset, group_range, group_offset, reducers...); - }; + return std::make_unique>(std::move(fn)); } - template - inline void invoke_sycl_parallel_for(cl::sycl::handler& cgh, Params&&... 
args) { - static_assert(CELERITY_FEATURE_UNNAMED_KERNELS || !is_unnamed_kernel, - "Your SYCL implementation does not support unnamed kernels, add a kernel name template parameter to this parallel_for invocation"); - if constexpr(detail::is_unnamed_kernel) { -#if CELERITY_FEATURE_UNNAMED_KERNELS // see static_assert above - cgh.parallel_for(std::forward(args)...); -#endif - } else { - cgh.parallel_for>(std::forward(args)...); + std::unique_ptr into_task() && { + assert(m_task != nullptr); + for(auto state : m_attached_state) { + m_task->extend_lifetime(std::move(state)); } + return std::move(m_task); } +}; - class live_pass_device_handler final : public live_pass_handler { - public: - live_pass_device_handler(const class task* task, subrange<3> sr, bool initialize_reductions, device_queue& d_queue) - : live_pass_handler(task, sr, initialize_reductions), m_d_queue(&d_queue) {} - - template - void submit_to_sycl(CGF&& cgf) { - m_event = m_d_queue->submit([&](cl::sycl::handler& cgh) { - this->m_eventual_cgh = &cgh; - std::forward(cgf)(cgh); - this->m_eventual_cgh = nullptr; - }); - } +namespace detail { - cl::sycl::event get_submission_event() const { return m_event; } + inline handler make_command_group_handler(const detail::task_id tid, const size_t num_collective_nodes) { return handler(tid, num_collective_nodes); } - cl::sycl::handler* const* get_eventual_sycl_cgh() const { return &m_eventual_cgh; } + inline std::unique_ptr into_task(handler&& cgh) { return std::move(cgh).into_task(); } - private: - device_queue* m_d_queue; - cl::sycl::handler* m_eventual_cgh = nullptr; - cl::sycl::event m_event; - }; + [[nodiscard]] inline hydration_id add_requirement(handler& cgh, const buffer_id bid, std::unique_ptr rm) { + return cgh.add_requirement(bid, std::move(rm)); + } - template - class reduction_descriptor; + inline void add_requirement(handler& cgh, const host_object_id hoid, const experimental::side_effect_order order) { + return cgh.add_requirement(hoid, order); + } - template - auto make_sycl_reduction(const reduction_descriptor& d) { + inline void add_reduction(handler& cgh, const detail::reduction_info& rinfo) { return cgh.add_reduction(rinfo); } + + inline void extend_lifetime(handler& cgh, std::shared_ptr state) { cgh.extend_lifetime(std::move(state)); } + + // TODO: The _impl functions in detail only exist during the grace period for deprecated reductions on const buffers; move outside again afterwards. + template + auto reduction_impl(const buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { #if !CELERITY_FEATURE_SCALAR_REDUCTIONS static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); #else - cl::sycl::property_list props; - if(!d.m_include_current_buffer_value) { props = {cl::sycl::property::reduction::initialize_to_identity{}}; } - if constexpr(WithExplicitIdentity) { - return sycl::reduction(d.m_device_ptr, d.m_identity, d.m_op, props); - } else { - return sycl::reduction(d.m_device_ptr, d.m_op, props); - } + static_assert(cl::sycl::has_known_identity_v, + "Celerity does not currently support reductions without an identity. 
Either specialize " + "cl::sycl::known_identity or use the reduction() overload taking an identity at runtime"); + return detail::make_reduction(vars, cgh, combiner, cl::sycl::known_identity_v, prop_list); #endif } template - class reduction_descriptor { - public: - reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT /* identity */, bool include_current_buffer_value, DataT* device_ptr) - : m_bid(bid), m_op(combiner), m_include_current_buffer_value(include_current_buffer_value), m_device_ptr(device_ptr) {} - - private: - friend auto make_sycl_reduction(const reduction_descriptor&); - - buffer_id m_bid; - BinaryOperation m_op; - bool m_include_current_buffer_value; - DataT* m_device_ptr; - }; - - template - class reduction_descriptor { - public: - reduction_descriptor(buffer_id bid, BinaryOperation combiner, DataT identity, bool include_current_buffer_value, DataT* device_ptr) - : m_bid(bid), m_op(combiner), m_identity(identity), m_include_current_buffer_value(include_current_buffer_value), m_device_ptr(device_ptr) {} - - private: - friend auto make_sycl_reduction(const reduction_descriptor&); - - buffer_id m_bid; - BinaryOperation m_op; - DataT m_identity{}; - bool m_include_current_buffer_value; - DataT* m_device_ptr; - }; - - template - auto make_reduction(const buffer& vars, handler& cgh, BinaryOperation op, DataT identity, const cl::sycl::property_list& prop_list) { + auto reduction_impl( + const buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { #if !CELERITY_FEATURE_SCALAR_REDUCTIONS static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); #else - if(vars.get_range().size() != 1) { - // Like SYCL 2020, Celerity only supports reductions to unit-sized buffers. This allows us to avoid tracking different parts of the buffer - // as distributed_state and pending_reduction_state. - throw std::runtime_error("Only unit-sized buffers can be reduction targets"); - } - - auto bid = detail::get_buffer_id(vars); - auto include_current_buffer_value = !prop_list.has_property(); - DataT* device_ptr = nullptr; - - if(detail::is_prepass_handler(cgh)) { - auto rid = detail::runtime::get_instance().get_reduction_manager().create_reduction(bid, op, identity); - static_cast(cgh).add_reduction(reduction_info{rid, bid, include_current_buffer_value}); - } else { - auto& device_handler = static_cast(cgh); - include_current_buffer_value &= device_handler.is_reduction_initializer(); - - auto mode = access_mode::discard_write; - if(include_current_buffer_value) { mode = access_mode::read_write; } - device_ptr = static_cast( - runtime::get_instance().get_buffer_manager().access_device_buffer(bid, mode, subrange_cast(subrange<3>{{}, {1, 1, 1}})).ptr); - } - return detail::reduction_descriptor{bid, op, identity, include_current_buffer_value, device_ptr}; + static_assert(!cl::sycl::has_known_identity_v, "Identity is known to SYCL, remove the identity parameter from reduction()"); + return detail::make_reduction(vars, cgh, combiner, identity, prop_list); #endif } } // namespace detail -template -void handler::parallel_for_kernel_and_reductions(range global_range, id global_offset, - typename detail::kernel_flavor_traits::local_size_type local_range, Kernel& kernel, Reductions&... 
reductions) { - if(is_prepass()) { - range<3> granularity = {1, 1, 1}; - if constexpr(detail::kernel_flavor_traits::has_local_size) { - for(int d = 0; d < Dims; ++d) { - granularity[d] = local_range[d]; - } - } - const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), granularity}; - return dynamic_cast(*this).create_device_compute_task(geometry, detail::kernel_debug_name()); - } - - auto& device_handler = dynamic_cast(*this); - const auto sr = device_handler.get_iteration_range(); - auto chunk_range = detail::range_cast(sr.range); - auto chunk_offset = detail::id_cast(sr.offset); - - device_handler.submit_to_sycl([&](cl::sycl::handler& cgh) { - constexpr int sycl_dims = std::max(1, Dims); - - if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) { - static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); - } else if constexpr(std::is_same_v) { - const auto sycl_global_range = sycl::range(detail::range_cast(chunk_range)); - detail::invoke_sycl_parallel_for(cgh, sycl_global_range, detail::make_sycl_reduction(reductions)..., - detail::bind_simple_kernel(kernel, global_range, global_offset, chunk_offset)); - } else if constexpr(std::is_same_v) { - const auto sycl_global_range = sycl::range(detail::range_cast(chunk_range)); - const auto sycl_local_range = sycl::range(detail::range_cast(local_range)); - detail::invoke_sycl_parallel_for(cgh, cl::sycl::nd_range(sycl_global_range, sycl_local_range), - detail::make_sycl_reduction(reductions)..., - detail::bind_nd_range_kernel(kernel, global_range, global_offset, chunk_offset, global_range / local_range, chunk_offset / local_range)); - } else { - static_assert(detail::constexpr_false); - } - }); -} - -template -void handler::host_task(on_master_node_tag, Functor kernel) { - if(is_prepass()) { - dynamic_cast(*this).create_master_node_task(); - } else { - dynamic_cast(*this).schedule<0>(kernel); - } -} - -template -void handler::host_task(experimental::collective_tag tag, Functor kernel) { - if(is_prepass()) { - dynamic_cast(*this).create_collective_task(tag.m_cgid); - } else { - dynamic_cast(*this).schedule_collective(kernel); - } +template +auto reduction(buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { + return detail::reduction_impl(vars, cgh, combiner, prop_list); } -template -void handler::host_task(range global_range, id global_offset, Functor kernel) { - if(is_prepass()) { - const detail::task_geometry geometry{Dims, detail::range_cast<3>(global_range), detail::id_cast<3>(global_offset), {1, 1, 1}}; - dynamic_cast(*this).create_host_compute_task(geometry); - } else { - dynamic_cast(*this).schedule(kernel); - } +template +auto reduction(buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { + return detail::reduction_impl(vars, cgh, identity, combiner, prop_list); } template -auto reduction(const buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { -#if !CELERITY_FEATURE_SCALAR_REDUCTIONS - static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); -#else - static_assert(cl::sycl::has_known_identity_v, - "Celerity does not currently support reductions without an identity. 
Either specialize " - "cl::sycl::known_identity or use the reduction() overload taking an identity at runtime"); - return detail::make_reduction(vars, cgh, combiner, cl::sycl::known_identity_v, prop_list); -#endif +[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction( + const buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { + return detail::reduction_impl(vars, cgh, combiner, prop_list); } template -auto reduction(const buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { -#if !CELERITY_FEATURE_SCALAR_REDUCTIONS - static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); -#else - static_assert(!cl::sycl::has_known_identity_v, "Identity is known to SYCL, remove the identity parameter from reduction()"); - return detail::make_reduction(vars, cgh, combiner, identity, prop_list); -#endif +[[deprecated("Creating reduction from const buffer is deprecated, capture buffer by reference instead")]] auto reduction( + const buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { + return detail::reduction_impl(vars, cgh, identity, combiner, prop_list); } } // namespace celerity diff --git a/include/host_object.h b/include/host_object.h index bf2fcc219..518abc3dd 100644 --- a/include/host_object.h +++ b/include/host_object.h @@ -6,9 +6,9 @@ #include #include +#include "lifetime_extending_state.h" #include "runtime.h" - namespace celerity::experimental { template @@ -45,7 +45,7 @@ class host_object_manager { }; // Base for `state` structs in all host_object specializations: registers and unregisters host_objects with the host_object_manager. -struct host_object_tracker { +struct host_object_tracker : public lifetime_extending_state { detail::host_object_id id{}; host_object_tracker() { @@ -53,8 +53,10 @@ struct host_object_tracker { id = detail::runtime::get_instance().get_host_object_manager().create_host_object(); } + host_object_tracker(const host_object_tracker&) = delete; host_object_tracker(host_object_tracker&&) = delete; host_object_tracker& operator=(host_object_tracker&&) = delete; + host_object_tracker& operator=(const host_object_tracker&) = delete; ~host_object_tracker() { detail::runtime::get_instance().get_host_object_manager().destroy_host_object(id); } }; @@ -96,7 +98,7 @@ namespace celerity::experimental { * - `host_object` does not carry internal state and can be used to track access to global variables or functions like `printf()`. */ template -class host_object { +class host_object final : public detail::lifetime_extending_state_wrapper { static_assert(std::is_object_v); // disallow host_object and host_object public: @@ -113,6 +115,9 @@ class host_object { explicit host_object(const std::in_place_t /* tag */, CtorParams&&... 
ctor_args) // requiring std::in_place avoids overriding copy and move constructors : m_shared_state(std::make_shared(std::in_place, std::forward(ctor_args)...)) {} + protected: + std::shared_ptr get_lifetime_extending_state() const override { return m_shared_state; } + private: template friend detail::host_object_id detail::get_host_object_id(const experimental::host_object& ho); @@ -134,7 +139,7 @@ class host_object { }; template -class host_object { +class host_object final : public detail::lifetime_extending_state_wrapper { public: using instance_type = T; @@ -142,6 +147,9 @@ class host_object { explicit host_object(const std::reference_wrapper ref) : m_shared_state(std::make_shared(ref.get())) {} + protected: + std::shared_ptr get_lifetime_extending_state() const override { return m_shared_state; } + private: template friend detail::host_object_id detail::get_host_object_id(const experimental::host_object& ho); @@ -149,7 +157,7 @@ class host_object { template friend typename experimental::host_object::instance_type& detail::get_host_object_instance(const experimental::host_object& ho); - struct state : detail::host_object_tracker { + struct state final : detail::host_object_tracker { T& instance; explicit state(T& instance) : instance{instance} {} @@ -162,17 +170,20 @@ class host_object { }; template <> -class host_object { +class host_object final : public detail::lifetime_extending_state_wrapper { public: using instance_type = void; explicit host_object() : m_shared_state(std::make_shared()) {} + protected: + std::shared_ptr get_lifetime_extending_state() const override { return m_shared_state; } + private: template friend detail::host_object_id detail::get_host_object_id(const experimental::host_object& ho); - struct state : detail::host_object_tracker {}; + struct state final : detail::host_object_tracker {}; detail::host_object_id get_id() const { return m_shared_state->id; } diff --git a/include/lifetime_extending_state.h b/include/lifetime_extending_state.h new file mode 100644 index 000000000..59b0163ae --- /dev/null +++ b/include/lifetime_extending_state.h @@ -0,0 +1,43 @@ +#pragma once + +#include + +namespace celerity::detail { + +/** + * Helper type for creating objects with reference semantics, whose lifetime can be extended by tasks. + */ +class lifetime_extending_state { + public: + lifetime_extending_state() = default; + lifetime_extending_state(const lifetime_extending_state&) = delete; + lifetime_extending_state(lifetime_extending_state&&) = delete; + lifetime_extending_state& operator=(const lifetime_extending_state&) = delete; + lifetime_extending_state& operator=(lifetime_extending_state&&) = delete; + + virtual ~lifetime_extending_state() = default; +}; + +/** + * Wrapper type that allows to retrieve the contained lifetime extending state (creation and storage of which is to be implemented by sub-classes). 
+ */ +class lifetime_extending_state_wrapper { + public: + lifetime_extending_state_wrapper() = default; + lifetime_extending_state_wrapper(const lifetime_extending_state_wrapper&) = default; + lifetime_extending_state_wrapper(lifetime_extending_state_wrapper&&) noexcept = default; + lifetime_extending_state_wrapper& operator=(const lifetime_extending_state_wrapper&) = default; + lifetime_extending_state_wrapper& operator=(lifetime_extending_state_wrapper&&) noexcept = default; + + virtual ~lifetime_extending_state_wrapper() = default; + + protected: + friend std::shared_ptr get_lifetime_extending_state(const lifetime_extending_state_wrapper& wrapper); + virtual std::shared_ptr get_lifetime_extending_state() const = 0; +}; + +inline std::shared_ptr get_lifetime_extending_state(const lifetime_extending_state_wrapper& wrapper) { + return wrapper.get_lifetime_extending_state(); +} + +} // namespace celerity::detail \ No newline at end of file diff --git a/include/runtime.h b/include/runtime.h index d948b7657..079aac8eb 100644 --- a/include/runtime.h +++ b/include/runtime.h @@ -167,11 +167,7 @@ namespace detail { } // Deletes the runtime instance, which happens only in tests. Called from runtime_fixture. - static void test_case_exit() { - assert(m_test_mode && m_test_active); - instance.reset(); - m_test_active = false; - } + static void test_case_exit(); private: inline static bool m_test_mode = false; diff --git a/include/side_effect.h b/include/side_effect.h index 39394d12b..06a46a617 100644 --- a/include/side_effect.h +++ b/include/side_effect.h @@ -14,29 +14,47 @@ namespace celerity::experimental { */ template class side_effect { + struct ctor_internal_tag {}; + public: using instance_type = typename host_object::instance_type; constexpr static inline side_effect_order order = Order; - explicit side_effect(const host_object& object, handler& cgh) : m_object(object) { - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = static_cast(cgh); - prepass_cgh.add_requirement(detail::get_host_object_id(object), order); - } - } + explicit side_effect(host_object& object, handler& cgh) : side_effect(ctor_internal_tag{}, object, cgh) {} + + [[deprecated("Creating side_effect from const host_object is deprecated, capture host_object by reference instead")]] explicit side_effect( + const host_object& object, handler& cgh) + : side_effect(ctor_internal_tag{}, object, cgh) {} template std::enable_if_t, instance_type>& operator*() const { - return detail::get_host_object_instance(m_object); + return *m_instance; } template std::enable_if_t, instance_type>* operator->() const { - return &detail::get_host_object_instance(m_object); + return m_instance; } private: - host_object m_object; + instance_type* m_instance; + + side_effect(ctor_internal_tag /* tag */, const host_object& object, handler& cgh) : m_instance{&detail::get_host_object_instance(object)} { + detail::add_requirement(cgh, detail::get_host_object_id(object), order); + detail::extend_lifetime(cgh, detail::get_lifetime_extending_state(object)); + } +}; + +template +class side_effect { + public: + using instance_type = typename host_object::instance_type; + constexpr static inline side_effect_order order = Order; + + explicit side_effect(const host_object& object, handler& cgh) { + detail::add_requirement(cgh, detail::get_host_object_id(object), order); + detail::extend_lifetime(cgh, detail::get_lifetime_extending_state(object)); + } }; template diff --git a/include/task.h b/include/task.h index 4a2db827b..1b4aa9fbe 100644 --- 
a/include/task.h +++ b/include/task.h @@ -3,10 +3,14 @@ #include #include #include +#include #include +#include "device_queue.h" #include "grid.h" +#include "host_queue.h" #include "intrusive_graph.h" +#include "lifetime_extending_state.h" #include "range_mapper.h" #include "types.h" @@ -38,26 +42,58 @@ namespace detail { shutdown, }; - struct command_group_storage_base { - virtual void operator()(handler& cgh) const = 0; - - virtual ~command_group_storage_base() = default; + class command_launcher_storage_base { + public: + command_launcher_storage_base() = default; + command_launcher_storage_base(const command_launcher_storage_base&) = delete; + command_launcher_storage_base(command_launcher_storage_base&&) = default; + command_launcher_storage_base& operator=(const command_launcher_storage_base&) = delete; + command_launcher_storage_base& operator=(command_launcher_storage_base&&) = default; + virtual ~command_launcher_storage_base() = default; + + virtual sycl::event operator()( + device_queue& q, const subrange<3> execution_sr, const std::vector& reduction_ptrs, const bool is_reduction_initializer) const = 0; + virtual std::future operator()(host_queue& q, const subrange<3>& execution_sr) const = 0; }; template - struct command_group_storage : command_group_storage_base { - Functor fun; + class command_launcher_storage : public command_launcher_storage_base { + public: + command_launcher_storage(Functor&& fun) : m_fun(std::move(fun)) {} + + sycl::event operator()( + device_queue& q, const subrange<3> execution_sr, const std::vector& reduction_ptrs, const bool is_reduction_initializer) const override { + return invoke(q, execution_sr, reduction_ptrs, is_reduction_initializer); + } + + std::future operator()(host_queue& q, const subrange<3>& execution_sr) const override { + return invoke>(q, execution_sr); + } - command_group_storage(Functor fun) : fun(fun) {} - void operator()(handler& cgh) const override { fun(cgh); } + private: + Functor m_fun; + + template + Ret invoke(Args&&... args) const { + if constexpr(std::is_invocable_v) { + return m_fun(args...); + } else { + throw std::runtime_error("Cannot launch command function with provided arguments"); + } + } }; class buffer_access_map { public: - void add_access(buffer_id bid, std::unique_ptr&& rm) { m_map.emplace(bid, std::move(rm)); } + void add_access(buffer_id bid, std::unique_ptr&& rm) { m_accesses.emplace_back(bid, std::move(rm)); } std::unordered_set get_accessed_buffers() const; std::unordered_set get_access_modes(buffer_id bid) const; + size_t get_num_accesses() const { return m_accesses.size(); } + std::pair get_nth_access(const size_t n) const { + const auto& [bid, rm] = m_accesses[n]; + return {bid, rm->get_access_mode()}; + } /** * @brief Computes the combined access-region for a given buffer, mode and subrange. 
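The command_launcher_storage hunk above replaces the stored CGF with a type-erased launcher: one base class exposes a call operator per execution target (device queue, host queue), and the templated wrapper forwards to the stored functor only when it is actually invocable with that target's arguments. A minimal, self-contained sketch of the same dispatch idea follows; the queue types and names are simplified placeholders, not the Celerity classes.

#include <memory>
#include <stdexcept>
#include <type_traits>
#include <utility>

// Hypothetical stand-ins for Celerity's device_queue / host_queue.
struct device_queue_t {};
struct host_queue_t {};

class launcher_base {
  public:
	virtual ~launcher_base() = default;
	virtual void operator()(device_queue_t& q) const = 0;
	virtual void operator()(host_queue_t& q) const = 0;
};

template <typename Fn>
class launcher final : public launcher_base {
  public:
	explicit launcher(Fn fn) : m_fn(std::move(fn)) {}
	void operator()(device_queue_t& q) const override { invoke(q); }
	void operator()(host_queue_t& q) const override { invoke(q); }

  private:
	Fn m_fn;

	// Forward only if the stored functor matches this target; otherwise fail at runtime,
	// mirroring the runtime_error path in command_launcher_storage::invoke above.
	template <typename Queue>
	void invoke(Queue& q) const {
		if constexpr(std::is_invocable_v<const Fn&, Queue&>) {
			m_fn(q);
		} else {
			throw std::runtime_error("launcher invoked with the wrong queue type");
		}
	}
};

template <typename Fn>
std::unique_ptr<launcher_base> make_launcher(Fn fn) {
	return std::make_unique<launcher<Fn>>(std::move(fn));
}

int main() {
	auto l = make_launcher([](device_queue_t& /* q */) { /* submit SYCL kernel here */ });
	device_queue_t dq;
	(*l)(dq); // dispatches to the device overload; a host_queue_t would hit the runtime_error path
}

The if constexpr branch keeps the wrong-target call path compilable while turning it into a runtime error, which is the trade-off that lets a single base class serve both host and device tasks.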
@@ -68,11 +104,13 @@ namespace detail { * * @returns The region obtained by merging the results of all range-mappers for this buffer and mode */ - GridRegion<3> get_requirements_for_access( - buffer_id bid, cl::sycl::access::mode mode, int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const; + GridRegion<3> get_mode_requirements( + const buffer_id bid, const access_mode mode, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const; + + GridBox<3> get_requirements_for_nth_access(const size_t n, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const; private: - std::unordered_multimap> m_map; + std::vector>> m_accesses; }; using reduction_set = std::vector; @@ -126,8 +164,6 @@ namespace detail { const side_effect_map& get_side_effect_map() const { return m_side_effects; } - const command_group_storage_base& get_command_group() const { return *m_cgf; } - const task_geometry& get_geometry() const { return m_geometry; } int get_dimensions() const { return m_geometry.dimensions; } @@ -161,33 +197,40 @@ namespace detail { fence_promise* get_fence_promise() const { return m_fence_promise.get(); } + template + auto launch(Args&&... args) const { + return (*m_launcher)(std::forward(args)...); + } + + void extend_lifetime(std::shared_ptr state) { m_attached_state.emplace_back(std::move(state)); } + static std::unique_ptr make_epoch(task_id tid, detail::epoch_action action) { return std::unique_ptr(new task(tid, task_type::epoch, collective_group_id{}, task_geometry{}, nullptr, {}, {}, {}, {}, action, nullptr)); } - static std::unique_ptr make_host_compute(task_id tid, task_geometry geometry, std::unique_ptr cgf, + static std::unique_ptr make_host_compute(task_id tid, task_geometry geometry, std::unique_ptr launcher, buffer_access_map access_map, side_effect_map side_effect_map, reduction_set reductions) { - return std::unique_ptr(new task(tid, task_type::host_compute, collective_group_id{}, geometry, std::move(cgf), std::move(access_map), + return std::unique_ptr(new task(tid, task_type::host_compute, collective_group_id{}, geometry, std::move(launcher), std::move(access_map), std::move(side_effect_map), std::move(reductions), {}, {}, nullptr)); } - static std::unique_ptr make_device_compute(task_id tid, task_geometry geometry, std::unique_ptr cgf, + static std::unique_ptr make_device_compute(task_id tid, task_geometry geometry, std::unique_ptr launcher, buffer_access_map access_map, reduction_set reductions, std::string debug_name) { - return std::unique_ptr(new task(tid, task_type::device_compute, collective_group_id{}, geometry, std::move(cgf), std::move(access_map), {}, - std::move(reductions), std::move(debug_name), {}, nullptr)); + return std::unique_ptr(new task(tid, task_type::device_compute, collective_group_id{}, geometry, std::move(launcher), std::move(access_map), + {}, std::move(reductions), std::move(debug_name), {}, nullptr)); } static std::unique_ptr make_collective(task_id tid, collective_group_id cgid, size_t num_collective_nodes, - std::unique_ptr cgf, buffer_access_map access_map, side_effect_map side_effect_map) { + std::unique_ptr launcher, buffer_access_map access_map, side_effect_map side_effect_map) { const task_geometry geometry{1, detail::range_cast<3>(range(num_collective_nodes)), {}, {1, 1, 1}}; - return std::unique_ptr( - new task(tid, task_type::collective, cgid, geometry, std::move(cgf), std::move(access_map), std::move(side_effect_map), {}, {}, {}, nullptr)); + return std::unique_ptr(new task( + 
tid, task_type::collective, cgid, geometry, std::move(launcher), std::move(access_map), std::move(side_effect_map), {}, {}, {}, nullptr)); } static std::unique_ptr make_master_node( - task_id tid, std::unique_ptr cgf, buffer_access_map access_map, side_effect_map side_effect_map) { - return std::unique_ptr(new task(tid, task_type::master_node, collective_group_id{}, task_geometry{}, std::move(cgf), std::move(access_map), - std::move(side_effect_map), {}, {}, {}, nullptr)); + task_id tid, std::unique_ptr launcher, buffer_access_map access_map, side_effect_map side_effect_map) { + return std::unique_ptr(new task(tid, task_type::master_node, collective_group_id{}, task_geometry{}, std::move(launcher), + std::move(access_map), std::move(side_effect_map), {}, {}, {}, nullptr)); } static std::unique_ptr make_horizon(task_id tid) { @@ -205,18 +248,19 @@ namespace detail { task_type m_type; collective_group_id m_cgid; task_geometry m_geometry; - std::unique_ptr m_cgf; + std::unique_ptr m_launcher; buffer_access_map m_access_map; detail::side_effect_map m_side_effects; reduction_set m_reductions; std::string m_debug_name; detail::epoch_action m_epoch_action; std::unique_ptr m_fence_promise; + std::vector> m_attached_state; - task(task_id tid, task_type type, collective_group_id cgid, task_geometry geometry, std::unique_ptr cgf, + task(task_id tid, task_type type, collective_group_id cgid, task_geometry geometry, std::unique_ptr launcher, buffer_access_map access_map, detail::side_effect_map side_effects, reduction_set reductions, std::string debug_name, detail::epoch_action epoch_action, std::unique_ptr fence_promise) - : m_tid(tid), m_type(type), m_cgid(cgid), m_geometry(geometry), m_cgf(std::move(cgf)), m_access_map(std::move(access_map)), + : m_tid(tid), m_type(type), m_cgid(cgid), m_geometry(geometry), m_launcher(std::move(launcher)), m_access_map(std::move(access_map)), m_side_effects(std::move(side_effects)), m_reductions(std::move(reductions)), m_debug_name(std::move(debug_name)), m_epoch_action(epoch_action), m_fence_promise(std::move(fence_promise)) { assert(type == task_type::host_compute || type == task_type::device_compute || get_granularity().size() == 1); diff --git a/include/task_manager.h b/include/task_manager.h index 9ed82b0cd..f2041a467 100644 --- a/include/task_manager.h +++ b/include/task_manager.h @@ -67,10 +67,10 @@ namespace detail { auto reservation = m_task_buffer.reserve_task_entry(await_free_task_slot_callback()); const auto tid = reservation.get_tid(); - prepass_handler cgh(tid, std::make_unique>(cgf), m_num_collective_nodes); + handler cgh = make_command_group_handler(tid, m_num_collective_nodes); cgf(cgh); - auto unique_tsk = std::move(cgh).into_task(); + auto unique_tsk = into_task(std::move(cgh)); // Require the collective group before inserting the task into the ring buffer, otherwise the executor will try to schedule the collective host // task on a collective-group thread that does not yet exist. @@ -185,7 +185,7 @@ namespace detail { task_id m_epoch_for_new_tasks{initial_epoch_task}; // We store a map of which task last wrote to a certain region of a buffer. - // NOTE: This represents the state after the latest performed pre-pass. + // NOTE: This represents the state after the latest task created. 
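The task_manager hunk above is the heart of the change: the CGF is invoked exactly once against a plain handler and the populated handler is then converted into a task, so any statement in the CGF body now executes exactly once per submission. A small illustrative sketch of the observable behaviour; it assumes the usual celerity.h umbrella header and the accessor/parallel_for API as used in the examples, and the invocation counter is purely hypothetical.

#include <cassert>
#include <celerity.h>

int main() {
	celerity::distr_queue q;
	celerity::buffer<int, 1> buf{celerity::range<1>{64}};

	int cgf_invocations = 0;
	q.submit([&](celerity::handler& cgh) {
		++cgf_invocations; // with the old pre-pass/live-pass split this could run more than once
		celerity::accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
		cgh.parallel_for<class zero_fill>(buf.get_range(), [=](celerity::item<1> it) { acc[it] = 0; });
	});
	assert(cgf_invocations == 1); // the CGF ran exactly once, synchronously during submit
	return 0;
}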
buffer_writers_map m_buffers_last_writers; std::unordered_map m_last_collective_tasks; diff --git a/include/types.h b/include/types.h index baf75b6a4..3642791d9 100644 --- a/include/types.h +++ b/include/types.h @@ -55,6 +55,7 @@ MAKE_PHANTOM_TYPE(command_id, size_t) MAKE_PHANTOM_TYPE(collective_group_id, size_t) MAKE_PHANTOM_TYPE(reduction_id, size_t) MAKE_PHANTOM_TYPE(host_object_id, size_t) +MAKE_PHANTOM_TYPE(hydration_id, size_t); // declared in this header for include-dependency reasons diff --git a/src/executor.cc b/src/executor.cc index d534fa80a..df3bc75fe 100644 --- a/src/executor.cc +++ b/src/executor.cc @@ -2,6 +2,7 @@ #include +#include "closure_hydrator.h" #include "distr_queue.h" #include "frame.h" #include "log.h" @@ -45,6 +46,7 @@ namespace detail { } void executor::run() { + closure_hydrator::make_available(); bool done = false; std::queue> command_queue; while(!done || !m_jobs.empty()) { @@ -138,6 +140,7 @@ namespace detail { } assert(m_running_device_compute_jobs == 0); + closure_hydrator::teardown(); } bool executor::handle_command(const command_frame& frame) { diff --git a/src/graph_generator.cc b/src/graph_generator.cc index 999ce5d6b..5d5bce51c 100644 --- a/src/graph_generator.cc +++ b/src/graph_generator.cc @@ -213,7 +213,7 @@ namespace detail { for(const buffer_id bid : buffers) { const auto modes = access_map.get_access_modes(bid); for(auto m : modes) { - result[bid][m] = access_map.get_requirements_for_access(bid, m, tsk.get_dimensions(), sr, global_size); + result[bid][m] = access_map.get_mode_requirements(bid, m, tsk.get_dimensions(), sr, global_size); } } return result; diff --git a/src/print_graph.cc b/src/print_graph.cc index 0494a27de..fce8fc19c 100644 --- a/src/print_graph.cc +++ b/src/print_graph.cc @@ -57,7 +57,7 @@ namespace detail { const auto& bam = tsk.get_buffer_access_map(); for(const auto bid : bam.get_accessed_buffers()) { for(const auto mode : bam.get_access_modes(bid)) { - const auto req = bam.get_requirements_for_access(bid, mode, tsk.get_dimensions(), execution_range, tsk.get_global_size()); + const auto req = bam.get_mode_requirements(bid, mode, tsk.get_dimensions(), execution_range, tsk.get_global_size()); const std::string bl = get_buffer_label(bm, bid); // While uncommon, we do support chunks that don't require access to a particular buffer at all. if(!req.empty()) { fmt::format_to(std::back_inserter(label), "
{} {} {}", detail::access::mode_traits::name(mode), bl, req); } diff --git a/src/runtime.cc b/src/runtime.cc index 3fcd08ed6..cd89d27e5 100644 --- a/src/runtime.cc +++ b/src/runtime.cc @@ -290,5 +290,14 @@ namespace detail { if(done) { m_active_flushes.pop_front(); } } + void runtime::test_case_exit() { + assert(m_test_mode && m_test_active); + // We need to delete all tasks manually first, b/c objects that have their lifetime + // extended by tasks (buffers, host objects) will attempt to shut down the runtime. + if(instance != nullptr) { instance->m_task_mngr.reset(); } + instance.reset(); + m_test_active = false; + } + } // namespace detail } // namespace celerity diff --git a/src/task.cc b/src/task.cc index 464b7f599..b552879b1 100644 --- a/src/task.cc +++ b/src/task.cc @@ -7,7 +7,7 @@ namespace detail { std::unordered_set buffer_access_map::get_accessed_buffers() const { std::unordered_set result; - for(auto& [bid, _] : m_map) { + for(const auto& [bid, _] : m_accesses) { result.emplace(bid); } return result; @@ -15,8 +15,8 @@ namespace detail { std::unordered_set buffer_access_map::get_access_modes(buffer_id bid) const { std::unordered_set result; - for(auto [first, last] = m_map.equal_range(bid); first != last; ++first) { - result.insert(first->second->get_access_mode()); + for(const auto& [b, rm] : m_accesses) { + if(b == bid) { result.insert(rm->get_access_mode()); } } return result; } @@ -33,31 +33,32 @@ namespace detail { return subrange<3>{}; } - GridRegion<3> buffer_access_map::get_requirements_for_access( - buffer_id bid, cl::sycl::access::mode mode, int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const { - auto [first, last] = m_map.equal_range(bid); - if(first == m_map.end()) { return {}; } - + GridRegion<3> buffer_access_map::get_mode_requirements( + const buffer_id bid, const access_mode mode, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const { GridRegion<3> result; - for(auto iter = first; iter != last; ++iter) { - auto rm = iter->second.get(); - if(rm->get_access_mode() != mode) continue; - - chunk<3> chnk{sr.offset, sr.range, global_size}; - subrange<3> req; - switch(kernel_dims) { - case 0: req = apply_range_mapper<0>(rm, chunk_cast<0>(chnk)); break; - case 1: req = apply_range_mapper<1>(rm, chunk_cast<1>(chnk)); break; - case 2: req = apply_range_mapper<2>(rm, chunk_cast<2>(chnk)); break; - case 3: req = apply_range_mapper<3>(rm, chunk_cast<3>(chnk)); break; - default: assert(!"Unreachable"); - } - result = GridRegion<3>::merge(result, subrange_to_grid_box(req)); + for(size_t i = 0; i < m_accesses.size(); ++i) { + if(m_accesses[i].first != bid || m_accesses[i].second->get_access_mode() != mode) continue; + result = GridRegion<3>::merge(result, get_requirements_for_nth_access(i, kernel_dims, sr, global_size)); } - return result; } + GridBox<3> buffer_access_map::get_requirements_for_nth_access( + const size_t n, const int kernel_dims, const subrange<3>& sr, const range<3>& global_size) const { + const auto& [_, rm] = m_accesses[n]; + + chunk<3> chnk{sr.offset, sr.range, global_size}; + subrange<3> req; + switch(kernel_dims) { + case 0: req = apply_range_mapper<0>(rm.get(), chunk_cast<0>(chnk)); break; + case 1: req = apply_range_mapper<1>(rm.get(), chunk_cast<1>(chnk)); break; + case 2: req = apply_range_mapper<2>(rm.get(), chunk_cast<2>(chnk)); break; + case 3: req = apply_range_mapper<3>(rm.get(), chunk_cast<3>(chnk)); break; + default: assert(!"Unreachable"); + } + return subrange_to_grid_box(req); + } + void 
side_effect_map::add_side_effect(const host_object_id hoid, const experimental::side_effect_order order) { // TODO for multiple side effects on the same hoid, find the weakest order satisfying all of them emplace(hoid, order); diff --git a/src/task_manager.cc b/src/task_manager.cc index 2f931106b..4e6030dad 100644 --- a/src/task_manager.cc +++ b/src/task_manager.cc @@ -60,7 +60,7 @@ namespace detail { const subrange<3> full_range{tsk.get_global_offset(), tsk.get_global_size()}; GridRegion<3> result; for(auto m : modes) { - result = GridRegion<3>::merge(result, access_map.get_requirements_for_access(bid, m, tsk.get_dimensions(), full_range, tsk.get_global_size())); + result = GridRegion<3>::merge(result, access_map.get_mode_requirements(bid, m, tsk.get_dimensions(), full_range, tsk.get_global_size())); } return result; } diff --git a/src/worker_job.cc b/src/worker_job.cc index d45d93321..a768af2d6 100644 --- a/src/worker_job.cc +++ b/src/worker_job.cc @@ -3,6 +3,7 @@ #include #include "buffer_manager.h" +#include "closure_hydrator.h" #include "device_queue.h" #include "handler.h" #include "reduction_manager.h" @@ -149,14 +150,20 @@ namespace detail { if(!m_buffer_mngr.try_lock(pkg.cid, tsk->get_buffer_access_map().get_accessed_buffers())) { return false; } - CELERITY_TRACE("Execute live-pass, scheduling host task in thread pool"); + CELERITY_TRACE("Scheduling host task in thread pool"); - // Note that for host tasks, there is no indirection through a queue->submit step like there is for SYCL tasks. The CGF is executed directly, - // which then schedules task in the thread pool through the host_queue. - auto& cgf = tsk->get_command_group(); - live_pass_host_handler cgh(tsk, data.sr, data.initialize_reductions, m_queue); - cgf(cgh); - m_future = cgh.into_future(); + const auto& access_map = tsk->get_buffer_access_map(); + std::vector access_infos; + access_infos.reserve(access_map.get_num_accesses()); + for(size_t i = 0; i < access_map.get_num_accesses(); ++i) { + const auto [bid, mode] = access_map.get_nth_access(i); + const auto sr = grid_box_to_subrange(access_map.get_requirements_for_nth_access(i, tsk->get_dimensions(), data.sr, tsk->get_global_size())); + const auto info = m_buffer_mngr.access_host_buffer(bid, mode, sr); + access_infos.push_back(closure_hydrator::accessor_info{info.ptr, info.backing_buffer_range, info.backing_buffer_offset, sr}); + } + + closure_hydrator::get_instance().arm(target::host_task, std::move(access_infos)); + m_future = tsk->launch(m_queue, data.sr); assert(m_future.valid()); m_submitted = true; @@ -193,12 +200,31 @@ namespace detail { if(!m_buffer_mngr.try_lock(pkg.cid, tsk->get_buffer_access_map().get_accessed_buffers())) { return false; } - CELERITY_TRACE("Execute live-pass, submit kernel to SYCL"); + CELERITY_TRACE("Submit kernel to SYCL"); + + const auto& access_map = tsk->get_buffer_access_map(); + const auto& reductions = tsk->get_reductions(); + std::vector accessor_infos; + std::vector reduction_ptrs; + accessor_infos.reserve(access_map.get_num_accesses()); + reduction_ptrs.reserve(reductions.size()); + + for(size_t i = 0; i < access_map.get_num_accesses(); ++i) { + const auto [bid, mode] = access_map.get_nth_access(i); + const auto sr = grid_box_to_subrange(access_map.get_requirements_for_nth_access(i, tsk->get_dimensions(), data.sr, tsk->get_global_size())); + const auto info = m_buffer_mngr.access_device_buffer(bid, mode, sr); + accessor_infos.push_back(closure_hydrator::accessor_info{info.ptr, info.backing_buffer_range, info.backing_buffer_offset, 
sr}); + } + + for(size_t i = 0; i < reductions.size(); ++i) { + const auto& rd = reductions[i]; + const auto mode = rd.init_from_buffer ? access_mode::read_write : access_mode::discard_write; + const auto info = m_buffer_mngr.access_device_buffer(rd.bid, mode, subrange<3>{{}, range<3>{1, 1, 1}}); + reduction_ptrs.push_back(info.ptr); + } - live_pass_device_handler cgh(tsk, data.sr, data.initialize_reductions, m_queue); - auto& cgf = tsk->get_command_group(); - cgf(cgh); - m_event = cgh.get_submission_event(); + closure_hydrator::get_instance().arm(target::device, std::move(accessor_infos)); + m_event = tsk->launch(m_queue, data.sr, reduction_ptrs, data.initialize_reductions); m_submitted = true; CELERITY_TRACE("Kernel submitted to SYCL"); diff --git a/test/accessor_tests.cc b/test/accessor_tests.cc index 79a68df30..288eda9fa 100644 --- a/test/accessor_tests.cc +++ b/test/accessor_tests.cc @@ -2,12 +2,16 @@ #include #include +#include #include #include "ranges.h" #include "buffer_manager_test_utils.h" +#include "log_test_utils.h" + +// NOTE: There are some additional accessor tests in buffer_manager_tests.cc namespace celerity { namespace detail { @@ -20,12 +24,12 @@ namespace detail { distr_queue q; std::vector mem_a{42}; buffer buf_a(mem_a.data(), range<1>{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { auto a = buf_a.get_access(cgh, fixed<1>({0, 1})); cgh.host_task(on_master_node, [=] { ++a[0]; }); }); int out = 0; - q.submit(celerity::allow_by_ref, [=, &out](handler& cgh) { + q.submit([&](handler& cgh) { auto a = buf_a.get_access(cgh, fixed<1>({0, 1})); cgh.host_task(on_master_node, [=, &out] { out = a[0]; }); }); @@ -38,9 +42,12 @@ namespace detail { using buf0d_t = buffer&; SECTION("device accessors") { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" // This currently throws an error at runtime, because we cannot infer whether the access is a discard_* from the property list parameter. using acc0 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::write_only, celerity::property_list{}}); STATIC_REQUIRE(std::is_same_v, acc0>); +#pragma GCC diagnostic pop using acc1 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::write_only}); STATIC_REQUIRE(std::is_same_v, acc1>); @@ -57,9 +64,12 @@ namespace detail { using acc5 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::read_write, celerity::no_init}); STATIC_REQUIRE(std::is_same_v, acc5>); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" // This currently throws an error at runtime, because we cannot infer whether the access is a discard_* from the property list parameter. using acc6 = decltype(accessor{std::declval(), std::declval(), all(), celerity::write_only, celerity::property_list{}}); STATIC_REQUIRE(std::is_same_v, acc6>); +#pragma GCC diagnostic pop using acc7 = decltype(accessor{std::declval(), std::declval(), all(), celerity::read_only}); STATIC_REQUIRE(std::is_same_v, acc7>); @@ -79,10 +89,13 @@ namespace detail { } SECTION("host accessors") { +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" // This currently throws an error at runtime, because we cannot infer whether the access is a discard_* from the property list parameter. 
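The worker_job changes above implement the executor side of the hydration protocol: for each recorded access, the backing pointer and extents are gathered into accessor_infos, the hydrator is armed, and only then is the stored launcher invoked, which copies the kernel closure and thereby hydrates the captured accessors. Below is a self-contained toy model of that copy-based hydration; it is illustrative only and not the actual closure_hydrator or accessor implementation.

#include <cassert>
#include <cstddef>
#include <vector>

// Toy hydrator: holds one pointer per placeholder id, armed per launch.
struct toy_hydrator {
	static inline thread_local std::vector<void*> armed_ptrs; // index = hydration id - 1
	static void arm(std::vector<void*> ptrs) { armed_ptrs = std::move(ptrs); }
};

// Toy accessor: carries only a placeholder id until it is copied while the hydrator is armed.
struct toy_accessor {
	explicit toy_accessor(const size_t hydration_id) : m_hid(hydration_id) {}
	toy_accessor(const toy_accessor& other) : m_hid(other.m_hid), m_ptr(other.m_ptr) {
		if(!toy_hydrator::armed_ptrs.empty()) { m_ptr = toy_hydrator::armed_ptrs[m_hid - 1]; }
	}
	void* get_pointer() const { return m_ptr; }

  private:
	size_t m_hid;
	void* m_ptr = nullptr;
};

int main() {
	int backing = 0;
	toy_accessor acc{1};                                // created inside the CGF: placeholder only
	auto kernel = [acc] { return acc.get_pointer(); };  // closure captures the un-hydrated accessor

	assert(kernel() == nullptr);   // not hydrated yet
	toy_hydrator::arm({&backing}); // executor provides the real allocation
	auto hydrated = kernel;        // copying the closure hydrates the captured accessor
	assert(hydrated() == &backing);
}

The property the real implementation relies on is the same: hydration happens as a side effect of copying the closure while the hydrator is armed, so the launcher stored in the task keeps an un-hydrated closure that can be copied again for the next execution.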
using acc0 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::write_only_host_task, celerity::property_list{}}); STATIC_REQUIRE(std::is_same_v, acc0>); +#pragma GCC diagnostic pop using acc1 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::write_only_host_task}); STATIC_REQUIRE(std::is_same_v, acc1>); @@ -99,10 +112,13 @@ namespace detail { using acc5 = decltype(accessor{std::declval(), std::declval(), one_to_one{}, celerity::read_write_host_task, celerity::no_init}); STATIC_REQUIRE(std::is_same_v, acc5>); +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" // This currently throws an error at runtime, because we cannot infer whether the access is a discard_* from the property list parameter. using acc6 = decltype(accessor{std::declval(), std::declval(), all(), celerity::write_only_host_task, celerity::property_list{}}); STATIC_REQUIRE(std::is_same_v, acc6>); +#pragma GCC diagnostic pop using acc7 = decltype(accessor{std::declval(), std::declval(), all(), celerity::read_only_host_task}); STATIC_REQUIRE(std::is_same_v, acc7>); @@ -152,37 +168,31 @@ namespace detail { auto& q = accessor_fixture::get_device_queue(); auto sr = subrange<3>({}, range); - live_pass_device_handler cgh(nullptr, sr, true, q); // this kernel initializes the buffer what will be read after. - auto acc_write = - accessor_fixture::template get_device_accessor(cgh, bid, range_cast(range), {}); - cgh.parallel_for>( - range_cast(range), [=](celerity::item item) { acc_write[item] = item.get_linear_id(); }); - cgh.get_submission_event().wait(); + auto acc_write = accessor_fixture::template get_device_accessor(bid, range_cast(range), {}); + test_utils::run_parallel_for>(accessor_fixture::get_device_queue().get_sycl_queue(), + range_cast(range), {}, [=](celerity::item item) { acc_write[item] = item.get_linear_id(); }); SECTION("for device buffers") { - auto acc_read = - accessor_fixture::template get_device_accessor(cgh, bid, range_cast(range), {}); - auto acc = accessor_fixture::template get_device_accessor( - cgh, bid, range_cast(range), {}); - cgh.parallel_for>(range_cast(range), [=](celerity::item item) { - size_t i = item[0]; - size_t j = item[1]; - if constexpr(Dims == 2) { - acc[i][j] = acc_read[i][j]; - } else { - size_t k = item[2]; - acc[i][j][k] = acc_read[i][j][k]; - } - }); - cgh.get_submission_event().wait(); + auto acc_read = accessor_fixture::template get_device_accessor(bid, range_cast(range), {}); + auto acc = accessor_fixture::template get_device_accessor(bid, range_cast(range), {}); + test_utils::run_parallel_for>( + accessor_fixture::get_device_queue().get_sycl_queue(), range_cast(range), {}, [=](celerity::item item) { + size_t i = item[0]; + size_t j = item[1]; + if constexpr(Dims == 2) { + acc[i][j] = acc_read[i][j]; + } else { + size_t k = item[2]; + acc[i][j][k] = acc_read[i][j][k]; + } + }); } SECTION("for host buffers") { - auto acc_read = accessor_fixture::template get_host_accessor(bid, range_cast(range), {}); - auto acc = - accessor_fixture::template get_host_accessor(bid, range_cast(range), {}); + auto acc_read = accessor_fixture::template get_host_accessor(bid, range_cast(range), {}); + auto acc = accessor_fixture::template get_host_accessor(bid, range_cast(range), {}); for(size_t i = 0; i < range[0]; i++) { for(size_t j = 0; j < range[1]; j++) { for(size_t k = 0; k < (Dims == 2 ? 
1 : range[2]); k++) { @@ -245,7 +255,7 @@ namespace detail { CAPTURE(Mode); bool verified = false; buffer verify_buf{&verified, 1}; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { // access with offset == buffer range just to mess with things const auto offset = id_cast<1>(test_buf.get_range()); const auto test_acc = test_buf.get_access(cgh, [=](chunk<1>) { return subrange<1>{offset, 0}; }); @@ -255,7 +265,7 @@ namespace detail { verify_acc[0] = true; }); }); - q.submit(allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { const accessor verify_acc{verify_buf, cgh, all{}, read_only_host_task}; cgh.host_task(on_master_node, [=, &verified] { verified = verify_acc[0]; }); }); @@ -291,7 +301,7 @@ namespace detail { std::vector memory1d(10); buffer buf1d(memory1d.data(), range<1>(10)); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor b{buf1d, cgh, all{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(on_master_node, [=](partition<0> part) { auto aw = b.get_allocation_window(part); @@ -303,7 +313,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor b{buf1d, cgh, one_to_one{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(range<1>(6), id<1>(2), [=](partition<1> part) { auto aw = b.get_allocation_window(part); @@ -319,7 +329,7 @@ namespace detail { std::vector memory2d(10 * 10); buffer buf2d(memory2d.data(), range<2>(10, 10)); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor b{buf2d, cgh, one_to_one{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(range<2>(5, 6), id<2>(1, 2), [=](partition<2> part) { auto aw = b.get_allocation_window(part); @@ -338,7 +348,7 @@ namespace detail { std::vector memory3d(10 * 10 * 10); buffer buf3d(memory3d.data(), range<3>(10, 10, 10)); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor b{buf3d, cgh, one_to_one{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(range<3>(5, 6, 7), id<3>(1, 2, 3), [=](partition<3> part) { auto aw = b.get_allocation_window(part); @@ -369,7 +379,7 @@ namespace detail { buffer buf_1(100); distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0(buf_0, cgh, read_write_host_task); cgh.host_task(on_master_node, [=] { CHECK(acc_0 == value_a); @@ -380,7 +390,7 @@ namespace detail { *acc_0 = value_b; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0(buf_0, cgh, read_only); accessor acc_1(buf_1, cgh, one_to_one(), write_only, no_init); cgh.parallel_for(buf_1.get_range(), [=](item<1> it) { @@ -390,7 +400,7 @@ namespace detail { acc_1[it] = acc_0[id<0>()]; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_1(buf_1, cgh, all(), read_only_host_task); cgh.host_task(on_master_node, [=] { for(size_t i = 0; i < buf_1.get_range().size(); ++i) { @@ -407,7 +417,7 @@ namespace detail { buffer buf_3d({5, 5, 5}); distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0d(buf_0d, cgh, all(), write_only, no_init); accessor acc_1d(buf_1d, cgh, all(), write_only, no_init); accessor acc_2d(buf_2d, cgh, all(), write_only, no_init); @@ -422,7 +432,7 @@ namespace detail { acc_3d[4][4][4] = 4; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0d(buf_0d, cgh, all(), read_write_host_task); accessor acc_1d(buf_1d, cgh, all(), read_write_host_task); accessor acc_2d(buf_2d, cgh, all(), 
read_write_host_task); @@ -438,7 +448,7 @@ namespace detail { acc_3d[4][4][4] += 9; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_0d(buf_0d, cgh, all(), read_only_host_task); accessor acc_1d(buf_1d, cgh, all(), read_only_host_task); accessor acc_2d(buf_2d, cgh, all(), read_only_host_task); @@ -462,11 +472,11 @@ namespace detail { buffer buf_1(32); distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_1(buf_1, cgh, one_to_one(), write_only, no_init); cgh.parallel_for(buf_1.get_range(), [=](item<1> it) { acc_1[it] = value_a; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_1(buf_1, cgh, one_to_one(), write_only); local_accessor local_0(cgh); cgh.parallel_for(nd_range(buf_1.get_range(), buf_1.get_range()), [=](nd_item<1> it) { @@ -483,7 +493,7 @@ namespace detail { acc_1[it.get_global_id()] = local_0[id<0>()]; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_1(buf_1, cgh, all(), read_only_host_task); cgh.host_task(on_master_node, [=] { for(size_t i = 0; i < buf_1.get_range().size(); ++i) { @@ -499,7 +509,7 @@ namespace detail { distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor device_acc_0; accessor device_acc_1; local_accessor local_acc_0; @@ -513,7 +523,7 @@ namespace detail { nd_range(1, 1), [=](nd_item<1> /* it */) { (void)device_acc_0, (void)local_acc_0, (void)device_acc_1, (void)local_acc_1; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor host_acc_0; accessor host_acc_1; host_acc_0 = decltype(host_acc_0)(buf_0, cgh, all()); @@ -530,13 +540,106 @@ namespace detail { CHECK(sizeof(accessor) == sizeof(int*)); } - TEST_CASE("0-dimensional local accessor has no overhead over SYCL", "[accessor][!shouldfail]") { - // TODO after multi-pass removal: drop !shouldfail (see TODO in local_accessor definition) + TEST_CASE("0-dimensional local accessor has no overhead over SYCL", "[accessor]") { if(!CELERITY_DETAIL_HAS_NO_UNIQUE_ADDRESS) SKIP("[[no_unique_address]] not available on this compiler"); // this check is not a static_assert because it depends on an (optional) compiler layout optimization CHECK(sizeof(local_accessor) == sizeof(accessor_testspy::declval_sycl_accessor>())); } + TEST_CASE_METHOD(accessor_fixture<0>, "closure_hydrator provides correct pointers to host and device accessors", "[closure_hydrator][accessor]") { + auto& bm = get_buffer_manager(); + auto bid = bm.register_buffer({100, 1, 1}); + auto& q = get_device_queue(); + + SECTION("host accessor") { + auto access_info = bm.access_host_buffer(bid, access_mode::discard_write, {{}, {100}}); + std::vector infos; + infos.push_back({access_info.ptr, access_info.backing_buffer_range, access_info.backing_buffer_offset, subrange<3>{{}, {100, 1, 1}}}); + auto acc = accessor_testspy::make_host_accessor(subrange<1>({}, {100}), hydration_id(1), + detail::id_cast<1>(access_info.backing_buffer_offset), detail::range_cast<1>(access_info.backing_buffer_range), + detail::range_cast<1>(bm.get_buffer_info(bid).range)); + CHECK(accessor_testspy::get_pointer(acc) != access_info.ptr); + closure_hydrator::get_instance().arm(target::host_task, std::move(infos)); + const auto run_check = closure_hydrator::get_instance().hydrate( + [&, hydrated_acc = acc] { CHECK(accessor_testspy::get_pointer(hydrated_acc) == access_info.ptr); }); + run_check(); + } + + SECTION("device accessor") { + auto access_info = bm.access_device_buffer(bid, 
access_mode::discard_write, {{}, {100}}); + std::vector infos; + infos.push_back({access_info.ptr, access_info.backing_buffer_range, access_info.backing_buffer_offset, subrange<3>{{}, {100, 1, 1}}}); + auto acc = accessor_testspy::make_device_accessor( + hydration_id(1), id_cast<1>(access_info.backing_buffer_offset), detail::range_cast<1>(access_info.backing_buffer_range)); + CHECK(accessor_testspy::get_pointer(acc) != access_info.ptr); + accessor hydrated_acc; + closure_hydrator::get_instance().arm(target::device, std::move(infos)); + q.get_sycl_queue().submit([&](sycl::handler& cgh) { + closure_hydrator::get_instance().hydrate(cgh, [&hydrated_acc, acc]() { hydrated_acc = acc; })(/* call to hydrate */); + cgh.single_task([] {}); + }); + CHECK(accessor_testspy::get_pointer(hydrated_acc) == access_info.ptr); + } + } + + TEST_CASE_METHOD(accessor_fixture<0>, "closure_hydrator correctly handles unused and duplicate accessors", "[closure_hydrator][accessor]") { + auto& bm = get_buffer_manager(); + auto& q = get_device_queue(); + + std::vector infos; + hydration_id next_hid = 1; + auto create_accessor = [&](const buffer_id bid) { + auto access_info = bm.access_host_buffer(bid, access_mode::discard_write, {{}, {10}}); + infos.push_back({access_info.ptr, access_info.backing_buffer_range, access_info.backing_buffer_offset, subrange<3>{{}, {10, 1, 1}}}); + auto acc = accessor_testspy::make_host_accessor(subrange<1>({}, {10}), next_hid++, + id_cast<1>(access_info.backing_buffer_offset), detail::range_cast<1>(access_info.backing_buffer_range), + detail::range_cast<1>(bm.get_buffer_info(bid).range)); + return std::pair{acc, access_info.ptr}; + }; + + const auto bid1 = bm.register_buffer({10, 1, 1}); + [[maybe_unused]] const auto [acc1, ptr1] = create_accessor(bid1); + const auto bid2 = bm.register_buffer({20, 1, 1}); + const auto [acc2, ptr2] = create_accessor(bid2); + const auto bid3 = bm.register_buffer({30, 1, 1}); + [[maybe_unused]] const auto [acc3, ptr3] = create_accessor(bid3); + const auto bid4 = bm.register_buffer({40, 1, 1}); + const auto [acc4, ptr4] = create_accessor(bid4); + auto acc5 = acc4; + + auto closure = [acc2 = acc2, acc4 = acc4, acc5 = acc5] { + return std::tuple{accessor_testspy::get_pointer(acc2), accessor_testspy::get_pointer(acc4), accessor_testspy::get_pointer(acc5)}; + }; + closure_hydrator::get_instance().arm(target::host_task, std::move(infos)); + auto hydrated_closure = closure_hydrator::get_instance().hydrate(closure); + CHECK(ptr2 == std::get<0>(hydrated_closure())); + CHECK(ptr4 == std::get<1>(hydrated_closure())); + CHECK(ptr4 == std::get<2>(hydrated_closure())); + } + + TEST_CASE_METHOD(accessor_fixture<0>, "closure_hydrator correctly hydrates local_accessor", "[closure_hydrator][accessor][smoke-test]") { + auto& q = get_device_queue(); + auto local_acc = accessor_testspy::make_local_accessor(range<1>(2)); + size_t* const result = sycl::malloc_device(2, q.get_sycl_queue()); + auto closure = [=](sycl::nd_item<1> itm) { + // We can't really check pointers or anything, so this is a smoke test + local_acc[itm.get_local_id()] = 100 + itm.get_local_id(0) * 10; + sycl::group_barrier(itm.get_group()); + // Write other item's value + result[itm.get_local_id(0)] = local_acc[1 - itm.get_local_id()]; + }; + closure_hydrator::get_instance().arm(target::device, {}); + q.submit([&](sycl::handler& cgh) { + auto hydrated_closure = closure_hydrator::get_instance().hydrate(cgh, closure); + cgh.parallel_for(sycl::nd_range<1>{{2}, {2}}, [=](sycl::nd_item<1> itm) { hydrated_closure(itm); 
}); + }).wait_and_throw(); + size_t result_host[2]; + q.get_sycl_queue().memcpy(&result_host[0], result, 2 * sizeof(size_t)).wait_and_throw(); + CHECK(result_host[0] == 110); + CHECK(result_host[1] == 100); + sycl::free(result, q.get_sycl_queue()); + } + } // namespace detail } // namespace celerity diff --git a/test/buffer_manager_test_utils.h b/test/buffer_manager_test_utils.h index cc12c2b2e..f6c52bfff 100644 --- a/test/buffer_manager_test_utils.h +++ b/test/buffer_manager_test_utils.h @@ -21,11 +21,34 @@ namespace detail { static typename LocalAccessor::sycl_accessor declval_sycl_accessor() { static_assert(constexpr_false, "declval_sycl_accessor cannot be used in an evaluated context"); } + + template + static local_accessor make_local_accessor(Args&&... args) { + return local_accessor{std::forward(args)...}; + } + + template + static DataT* get_pointer(const accessor& acc) { + if constexpr(Tgt == target::device) { + return acc.m_device_ptr; + } else { + return acc.m_host_ptr; + } + } }; } // namespace detail namespace test_utils { + // Convenience function for submitting parallel_for with global offset without having to create a CGF + template + void run_parallel_for(sycl::queue& q, const range& global_range, const id& global_offset, KernelFn fn) { + q.submit([=](sycl::handler& cgh) { + cgh.parallel_for(sycl::range{global_range}, detail::bind_simple_kernel(fn, global_range, global_offset, global_offset)); + }); + q.wait_and_throw(); + } + class buffer_manager_fixture : public device_queue_fixture { public: enum class access_target { host, device }; @@ -152,8 +175,7 @@ namespace test_utils { } template - accessor get_device_accessor( - detail::live_pass_device_handler& cgh, detail::buffer_id bid, const range& range, const id& offset) { + accessor get_device_accessor(detail::buffer_id bid, const range& range, const id& offset) { auto buf_info = m_bm->access_device_buffer(bid, Mode, {offset, range}); return detail::accessor_testspy::make_device_accessor(static_cast(buf_info.ptr), detail::id_cast(buf_info.backing_buffer_offset), detail::range_cast(buf_info.backing_buffer_range)); diff --git a/test/buffer_manager_tests.cc b/test/buffer_manager_tests.cc index ea1acfd72..cf345df7e 100644 --- a/test/buffer_manager_tests.cc +++ b/test/buffer_manager_tests.cc @@ -24,7 +24,7 @@ namespace detail { { celerity::buffer b(range<1>(128)); b_id = celerity::detail::get_buffer_id(b); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{b, cgh, celerity::access::all(), celerity::write_only}; cgh.parallel_for(b.get_range(), [=](celerity::item<1> it) {}); }); @@ -34,11 +34,11 @@ namespace detail { // we need horizon_step_size * 3 + 1 tasks to generate the third horizon, // and one extra task to trigger the clean_up process for(int i = 0; i < (new_horizon_step * 3 + 2); i++) { - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor a{c, cgh, celerity::access::all(), celerity::write_only}; cgh.parallel_for(c.get_range(), [=](celerity::item<1>) {}); }); - // this sync is inside the loop because otherwise there is a race between the prepass and the executor informing the TDAG + // this sync is inside the loop because otherwise there is a race between this thread and the executor informing the TDAG // of the executed horizons, meaning that task deletion is not guaranteed. 
q.slow_full_sync(); } @@ -817,12 +817,11 @@ namespace detail { const auto range = celerity::range<2>(32, 32); const auto offset = id<2>(32, 0); auto sr = subrange<3>(id_cast<3>(offset), range_cast<3>(range)); - live_pass_device_handler cgh(nullptr, sr, true, dq); - get_device_accessor(cgh, bid, {48, 32}, {16, 0}); - auto acc = get_device_accessor(cgh, bid, {32, 32}, {32, 0}); - cgh.parallel_for(range, offset, [=](id<2> id) { acc[id] = id[0] + id[1]; }); - cgh.get_submission_event().wait(); + get_device_accessor(bid, {48, 32}, {16, 0}); + auto acc = get_device_accessor(bid, {32, 32}, {32, 0}); + + test_utils::run_parallel_for(dq.get_sycl_queue(), range, offset, [=](id<2> id) { acc[id] = id[0] + id[1]; }); auto buf_info = bm.access_host_buffer(bid, access_mode::read, {{32, 0}, {32, 32}}); for(size_t i = 32; i < 64; ++i) { @@ -863,33 +862,32 @@ namespace detail { SECTION("when using device buffers") { auto range = celerity::range<1>(32); auto sr = subrange<3>({}, range_cast<3>(range)); - live_pass_device_handler cgh(nullptr, sr, true, dq); // For device accessors we test this both on host and device // Copy ctor - auto device_acc_a = get_device_accessor(cgh, bid_a, {32}, {0}); + auto device_acc_a = get_device_accessor(bid_a, {32}, {0}); decltype(device_acc_a) device_acc_a1(device_acc_a); // Move ctor - auto device_acc_b = get_device_accessor(cgh, bid_b, {32}, {0}); + auto device_acc_b = get_device_accessor(bid_b, {32}, {0}); decltype(device_acc_b) device_acc_b1(std::move(device_acc_b)); // Copy assignment - auto device_acc_c = get_device_accessor(cgh, bid_c, {32}, {0}); - auto device_acc_c1 = get_device_accessor(cgh, bid_a, {32}, {0}); + auto device_acc_c = get_device_accessor(bid_c, {32}, {0}); + auto device_acc_c1 = get_device_accessor(bid_a, {32}, {0}); device_acc_c1 = device_acc_c; // Move assignment - auto device_acc_d = get_device_accessor(cgh, bid_d, {32}, {0}); - auto device_acc_d1 = get_device_accessor(cgh, bid_a, {32}, {0}); + auto device_acc_d = get_device_accessor(bid_d, {32}, {0}); + auto device_acc_d1 = get_device_accessor(bid_a, {32}, {0}); device_acc_d1 = std::move(device_acc_d); // Hidden friends (equality operators) REQUIRE(device_acc_a == device_acc_a1); REQUIRE(device_acc_a1 != device_acc_b1); - cgh.parallel_for(range, [=](id<1> id) { + test_utils::run_parallel_for(dq.get_sycl_queue(), range, {}, [=](id<1> id) { // Copy ctor decltype(device_acc_a1) device_acc_a2(device_acc_a1); device_acc_a2[id] = 1 * id[0]; @@ -911,8 +909,6 @@ namespace detail { // Hidden friends (equality operators) are only required to be defined on the host }); - cgh.get_submission_event().wait(); - auto host_acc_a = get_host_accessor(bid_a, {32}, {0}); auto host_acc_b = get_host_accessor(bid_b, {32}, {0}); auto host_acc_c = get_host_accessor(bid_c, {32}, {0}); @@ -1017,28 +1013,37 @@ namespace detail { const std::string error_msg = "Buffer cannot be accessed with expected stride"; + // TODO: Use single lambda once https://github.com/KhronosGroup/SYCL-Docs/pull/351 is merged and implemented + auto get_pointer_1d = [this](const buffer_id bid, const range<1>& range, const id<1>& offset) { + get_host_accessor(bid, range, offset).get_pointer(); + }; + + auto get_pointer_2d = [this](const buffer_id bid, const range<2>& range, const id<2>& offset) { + get_host_accessor(bid, range, offset).get_pointer(); + }; + // This is not allowed, as the backing buffer hasn't been allocated from offset 0, which means the pointer would point to offset 32. 
- REQUIRE_THROWS_WITH((get_host_accessor(bid_a, {32}, {32}).get_pointer()), error_msg); + REQUIRE_THROWS_WITH(get_pointer_1d(bid_a, {32}, {32}), error_msg); // This is fine, as the backing buffer has been resized to start from 0 now. - REQUIRE_NOTHROW(get_host_accessor(bid_a, {64}, {0}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_1d(bid_a, {64}, {0})); // This is now also okay, as the backing buffer starts at 0, and the pointer points to offset 0. // (Same semantics as SYCL accessor with offset, i.e., UB outside of requested range). - REQUIRE_NOTHROW(get_host_accessor(bid_a, {32}, {32}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_1d(bid_a, {32}, {32})); // In 2D (and 3D) it's trickier, as the stride of the backing buffer must also match what the user expects. // This is not allowed, even though the offset is 0. - REQUIRE_THROWS_WITH((get_host_accessor(bid_b, {64, 64}, {0, 0}).get_pointer()), error_msg); + REQUIRE_THROWS_WITH(get_pointer_2d(bid_b, {64, 64}, {0, 0}), error_msg); // This is allowed, as we request the full buffer. - REQUIRE_NOTHROW(get_host_accessor(bid_b, {128, 128}, {0, 0}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_2d(bid_b, {128, 128}, {0, 0})); // This is now allowed, as the backing buffer has the expected stride. - REQUIRE_NOTHROW(get_host_accessor(bid_b, {64, 64}, {0, 0}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_2d(bid_b, {64, 64}, {0, 0})); // Passing an offset is now also possible. - REQUIRE_NOTHROW(get_host_accessor(bid_b, {64, 64}, {32, 32}).get_pointer()); + REQUIRE_NOTHROW(get_pointer_2d(bid_b, {64, 64}, {32, 32})); } TEST_CASE_METHOD(test_utils::buffer_manager_fixture, "empty access ranges do not inflate backing buffer allocations", "[buffer_manager]") { diff --git a/test/graph_compaction_tests.cc b/test/graph_compaction_tests.cc index d34dc8c14..698ebe810 100644 --- a/test/graph_compaction_tests.cc +++ b/test/graph_compaction_tests.cc @@ -205,7 +205,7 @@ namespace detail { auto write_b_cmd = ctx.get_command_graph().get(*cmds.cbegin()); auto write_b_dependencies = write_b_cmd->get_dependencies(); CHECK(!write_b_dependencies.empty()); - CHECK(write_b_dependencies.front().kind == dependency_kind::anti_dep); + CHECK(std::any_of(write_b_dependencies.begin(), write_b_dependencies.end(), [](auto& dep) { return dep.kind == dependency_kind::anti_dep; })); test_utils::maybe_print_graphs(ctx); } diff --git a/test/integration/backend.cc b/test/integration/backend.cc index 81141e606..c9d2ad02d 100644 --- a/test/integration/backend.cc +++ b/test/integration/backend.cc @@ -14,7 +14,7 @@ void test_copy(celerity::distr_queue& q) { celerity::buffer buf(celerity::detail::range_cast(celerity::range<3>{5, 7, 9})); // Initialize on device - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{buf, cgh, celerity::access::one_to_one<>{}, celerity::write_only, celerity::no_init}; cgh.parallel_for>(buf.get_range(), [=](celerity::item itm) { acc[itm] = itm.get_linear_id(); }); }); @@ -22,7 +22,7 @@ void test_copy(celerity::distr_queue& q) { // Check and modify partially on host const auto sr = celerity::detail::subrange_cast(celerity::subrange<3>{{1, 2, 3}, {3, 4, 5}}); const auto sr3 = celerity::detail::subrange_cast<3>(sr); - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{buf, cgh, celerity::access::fixed{sr}, celerity::read_write_host_task}; cgh.host_task(celerity::on_master_node, [=]() { for(size_t k = 0; k < sr3.range[0]; ++k) { @@ -39,13 +39,13 @@ void 
test_copy(celerity::distr_queue& q) { }); // Modify everything on device - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{buf, cgh, celerity::access::one_to_one<>{}, celerity::read_write}; cgh.parallel_for>(buf.get_range(), [=](celerity::item itm) { acc[itm] += 1; }); }); // Check everything on host - q.submit([=](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(celerity::on_master_node, [=]() { const auto r3 = celerity::detail::range_cast<3>(buf.get_range()); diff --git a/test/print_graph_tests.cc b/test/print_graph_tests.cc index c287f554b..d40b1fabb 100644 --- a/test/print_graph_tests.cc +++ b/test/print_graph_tests.cc @@ -75,8 +75,8 @@ TEST_CASE("command graph printing is unchanged", "[print_graph][command-graph]") // replace the `expected` value with the new dot graph. const auto expected = "digraph G{label=\"Command Graph\" subgraph cluster_2{label=<T2 (master-node host)>;color=darkgray;9[label=execution [[0,0,0] - [0,0,0]]
read B0 {[[0,0,0] - [1,1,1]]}
read_write B0 {[[0,0,0] - " - "[1,1,1]]}
write " + "N0
execution [[0,0,0] - [0,0,0]]
write B0 {[[0,0,0] - [1,1,1]]}
read_write B0 {[[0,0,0] - " + "[1,1,1]]}
read " "B0 {[[0,0,0] - [1,1,1]]}> fontcolor=black shape=box];}subgraph cluster_1{label=<T1 \"task_reduction_8\" " "(device-compute)>;color=darkgray;5[label=execution [[0,0,0] - [1,1,1]]
(R1) discard_write B0 {[[0,0,0] - " "[1,1,1]]}> fontcolor=black shape=box];6[label=execution [[1,0,0] - [2,1,1]]
(R1) discard_write B0 {[[0,0,0] - " @@ -107,7 +107,7 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "Buffer debug names show up in the celerity::debug::set_buffer_name(buff_a, buff_name); CHECK(celerity::debug::get_buffer_name(buff_a) == buff_name); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { celerity::accessor acc{buff_a, cgh, celerity::access::all{}, celerity::write_only}; cgh.parallel_for(range, [=](item<1> item) {}); }); diff --git a/test/runtime_deprecation_tests.cc b/test/runtime_deprecation_tests.cc index 6fbc478de..e2c1bbd3a 100644 --- a/test/runtime_deprecation_tests.cc +++ b/test/runtime_deprecation_tests.cc @@ -134,5 +134,32 @@ namespace detail { }); } + TEST_CASE_METHOD(test_utils::runtime_fixture, + "distr_queue::submit(allow_by_ref_t, ...) and creation of accessors/side-effects/reductions from const buffers/host-objects continues to work", + "[handler][deprecated]") { + distr_queue q; + buffer buf{32}; + buffer reduction_buf{1}; + experimental::host_object ho; + int my_int = 33; + q.submit(allow_by_ref, [= /* capture buffer/host-object by value */, &my_int](handler& cgh) { + accessor acc{buf, cgh, celerity::access::all{}, celerity::write_only_host_task}; + experimental::side_effect se{ho, cgh}; + cgh.host_task(on_master_node, [=, &my_int] { + (void)acc; + (void)se; + my_int = 42; + }); + }); + q.submit([= /* capture by value */](handler& cgh) { + accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::read_only}; +#if CELERITY_FEATURE_SCALAR_REDUCTIONS + auto red = reduction(reduction_buf, cgh, std::plus{}); +#endif + cgh.parallel_for(range<1>{32}, [=](item<1>) { (void)acc; }); + }); + SUCCEED(); + } + } // namespace detail } // namespace celerity diff --git a/test/runtime_tests.cc b/test/runtime_tests.cc index f8d653c0b..dae7dd28b 100644 --- a/test/runtime_tests.cc +++ b/test/runtime_tests.cc @@ -93,7 +93,7 @@ namespace detail { buffer buf_a{range<2>{32, 64}}; auto& tm = runtime::get_instance().get_task_manager(); const auto tid = test_utils::add_compute_task( - tm, [buf_a /* capture by value */](handler& cgh) { buf_a.get_access(cgh, one_to_one{}); }, buf_a.get_range()); + tm, [&](handler& cgh) { buf_a.get_access(cgh, one_to_one{}); }, buf_a.get_range()); const auto tsk = tm.get_task(tid); const auto bufs = tsk->get_buffer_access_map().get_accessed_buffers(); REQUIRE(bufs.size() == 1); @@ -267,10 +267,10 @@ namespace detail { REQUIRE(std::find(bufs.cbegin(), bufs.cend(), buf_b.get_id()) != bufs.cend()); REQUIRE(bam.get_access_modes(buf_a.get_id()).count(cl::sycl::access::mode::read) == 1); REQUIRE(bam.get_access_modes(buf_b.get_id()).count(cl::sycl::access::mode::discard_read_write) == 1); - const auto reqs_a = bam.get_requirements_for_access( + const auto reqs_a = bam.get_mode_requirements( buf_a.get_id(), cl::sycl::access::mode::read, tsk->get_dimensions(), {tsk->get_global_offset(), tsk->get_global_size()}, tsk->get_global_size()); REQUIRE(reqs_a == subrange_to_grid_box(subrange<3>({32, 24, 0}, {32, 128, 1}))); - const auto reqs_b = bam.get_requirements_for_access(buf_b.get_id(), cl::sycl::access::mode::discard_read_write, tsk->get_dimensions(), + const auto reqs_b = bam.get_mode_requirements(buf_b.get_id(), cl::sycl::access::mode::discard_read_write, tsk->get_dimensions(), {tsk->get_global_offset(), tsk->get_global_size()}, tsk->get_global_size()); REQUIRE(reqs_b == subrange_to_grid_box(subrange<3>({}, {5, 18, 74}))); } @@ -279,24 +279,16 @@ namespace detail { buffer_access_map bam; bam.add_access(0, std::make_unique>>(subrange<2>{{3, 0}, 
{10, 20}}, cl::sycl::access::mode::read, range<2>{30, 30})); bam.add_access(0, std::make_unique>>(subrange<2>{{10, 0}, {7, 20}}, cl::sycl::access::mode::read, range<2>{30, 30})); - const auto req = bam.get_requirements_for_access(0, cl::sycl::access::mode::read, 2, subrange<3>({0, 0, 0}, {100, 100, 1}), {100, 100, 1}); + const auto req = bam.get_mode_requirements(0, cl::sycl::access::mode::read, 2, subrange<3>({0, 0, 0}, {100, 100, 1}), {100, 100, 1}); REQUIRE(req == subrange_to_grid_box(subrange<3>({3, 0, 0}, {14, 20, 1}))); } TEST_CASE("tasks gracefully handle get_requirements() calls for buffers they don't access", "[task]") { buffer_access_map bam; - const auto req = bam.get_requirements_for_access(0, cl::sycl::access::mode::read, 3, subrange<3>({0, 0, 0}, {100, 1, 1}), {100, 1, 1}); + const auto req = bam.get_mode_requirements(0, cl::sycl::access::mode::read, 3, subrange<3>({0, 0, 0}, {100, 1, 1}), {100, 1, 1}); REQUIRE(req == subrange_to_grid_box(subrange<3>({0, 0, 0}, {0, 0, 0}))); } - TEST_CASE("safe command group functions must not capture by reference", "[lifetime][dx]") { - int value = 123; - const auto unsafe = [&]() { return value + 1; }; - REQUIRE_FALSE(is_safe_cgf); - const auto safe = [=]() { return value + 1; }; - REQUIRE(is_safe_cgf); - } - namespace foo { class MySecondKernel; } @@ -321,12 +313,12 @@ namespace detail { buffer buff(N); std::vector host_buff(N); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { auto b = buff.get_access(cgh, one_to_one{}); cgh.parallel_for(range<1>(N), [=](celerity::item<1> item) { b[item] = item.get_linear_id(); }); }); - q.submit(allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { auto b = buff.get_access(cgh, celerity::access::fixed<1>{{{}, buff.get_range()}}); cgh.host_task(on_master_node, [=, &host_buff] { std::this_thread::sleep_for(std::chrono::milliseconds(10)); // give the synchronization more time to fail @@ -461,37 +453,37 @@ namespace detail { experimental::collective_group primary_group; experimental::collective_group secondary_group; - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective, [&](experimental::collective_partition part) { default1_thread = std::this_thread::get_id(); default1_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective(primary_group), [&](experimental::collective_partition part) { primary1_thread = std::this_thread::get_id(); primary1_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective(secondary_group), [&](experimental::collective_partition part) { secondary1_thread = std::this_thread::get_id(); secondary1_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective, [&](experimental::collective_partition part) { default2_thread = std::this_thread::get_id(); default2_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + q.submit([&](handler& cgh) { cgh.host_task(experimental::collective(primary_group), [&](experimental::collective_partition part) { primary2_thread = std::this_thread::get_id(); primary2_comm = part.get_collective_mpi_comm(); }); }); - q.submit(celerity::allow_by_ref, [&](handler& cgh) { + 
q.submit([&](handler& cgh) { cgh.host_task(experimental::collective(secondary_group), [&](experimental::collective_partition part) { secondary2_thread = std::this_thread::get_id(); secondary2_comm = part.get_collective_mpi_comm(); @@ -582,31 +574,31 @@ namespace detail { distr_queue q; buffer buf{{10, 10}}; - CHECK_THROWS_WITH(q.submit([=](handler& cgh) { + CHECK_THROWS_WITH(q.submit([&](handler& cgh) { buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<1>{10}, [=](celerity::item<1>) {}); }), "Invalid range mapper dimensionality: 1-dimensional kernel submitted with a requirement whose range mapper is neither invocable for chunk<1> nor " "(chunk<1>, range<2>) to produce subrange<2>"); - CHECK_NOTHROW(q.submit([=](handler& cgh) { + CHECK_NOTHROW(q.submit([&](handler& cgh) { buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<2>{10, 10}, [=](celerity::item<2>) {}); })); - CHECK_THROWS_WITH(q.submit([=](handler& cgh) { + CHECK_THROWS_WITH(q.submit([&](handler& cgh) { buf.get_access(cgh, one_to_one{}); cgh.parallel_for(range<3>{10, 10, 10}, [=](celerity::item<3>) {}); }), "Invalid range mapper dimensionality: 3-dimensional kernel submitted with a requirement whose range mapper is neither invocable for chunk<3> nor " "(chunk<3>, range<2>) to produce subrange<2>"); - CHECK_NOTHROW(q.submit([=](handler& cgh) { + CHECK_NOTHROW(q.submit([&](handler& cgh) { buf.get_access(cgh, all{}); cgh.parallel_for(range<3>{10, 10, 10}, [=](celerity::item<3>) {}); })); - CHECK_NOTHROW(q.submit([=](handler& cgh) { + CHECK_NOTHROW(q.submit([&](handler& cgh) { buf.get_access(cgh, all{}); cgh.parallel_for(range<3>{10, 10, 10}, [=](celerity::item<3>) {}); })); @@ -626,7 +618,7 @@ namespace detail { const auto global_offset = detail::id_cast(id<3>{4, 5, 6}); buffer linear_id{{n, Dims + 1}}; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor a{linear_id, cgh, celerity::access::all{}, write_only, no_init}; // all RM is sane because runtime_tests runs single-node cgh.parallel_for>(detail::range_cast(range<3>{n, 1, 1}), global_offset, [=](celerity::item item) { auto i = (item.get_id() - item.get_offset())[0]; @@ -636,7 +628,7 @@ namespace detail { a[i][Dims] = item.get_linear_id(); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor a{linear_id, cgh, celerity::access::all{}, read_only_host_task}; cgh.host_task(on_master_node, [=] { for(int i = 0; i < n; ++i) { @@ -737,7 +729,7 @@ namespace detail { // Note: We assume a local range size of 32 here, this should be supported by most devices. - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { local_accessor la{32, cgh}; accessor ga{out, cgh, celerity::access::one_to_one{}, write_only}; cgh.parallel_for(celerity::nd_range<1>{64, 32}, [=](nd_item<1> item) { @@ -747,7 +739,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor ga{out, cgh, celerity::access::all{}, read_only_host_task}; cgh.host_task(on_master_node, [=] { for(size_t i = 0; i < 64; ++i) { @@ -762,7 +754,7 @@ namespace detail { // Note: We assume a local range size of 16 here, this should be supported by most devices. 
buffer b{range<1>{1}}; - distr_queue{}.submit([=](handler& cgh) { + distr_queue{}.submit([&](handler& cgh) { cgh.parallel_for(celerity::nd_range{range<2>{8, 8}, range<2>{4, 4}}, reduction(b, cgh, cl::sycl::plus<>{}), [](nd_item<2> item, auto& sum) { sum += item.get_global_linear_id(); }); }); @@ -783,11 +775,11 @@ namespace detail { q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [](nd_item<1> item) {}); }); #if CELERITY_FEATURE_SCALAR_REDUCTIONS buffer b{{1}}; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for( range<1>{64}, reduction(b, cgh, cl::sycl::plus{}), [=](item<1> item, auto& r) { r += static_cast(item.get_linear_id()); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, reduction(b, cgh, cl::sycl::plus{}), [=](nd_item<1> item, auto& r) { r += static_cast(item.get_global_linear_id()); }); }); @@ -797,11 +789,11 @@ namespace detail { q.submit([=](handler& cgh) { cgh.parallel_for(range<1>{64}, [=](item<1> item) {}); }); q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [=](nd_item<1> item) {}); }); #if CELERITY_FEATURE_SCALAR_REDUCTIONS - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for( range<1>{64}, reduction(b, cgh, cl::sycl::plus{}), [=](item<1> item, auto& r) { r += static_cast(item.get_linear_id()); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, reduction(b, cgh, cl::sycl::plus{}), [=](nd_item<1> item, auto& r) { r += static_cast(item.get_global_linear_id()); }); }); @@ -821,7 +813,7 @@ namespace detail { constexpr int extents = 16; buffer buf_a(extents); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{buf_a, cgh, celerity::access::all{}, celerity::write_only_host_task, celerity::no_init}; cgh.host_task(on_master_node, [] {}); }); @@ -831,7 +823,7 @@ namespace detail { constexpr int task_limit = 15; for(int i = 0; i < chain_length; ++i) { - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{buf_a, cgh, celerity::access::all{}, celerity::read_write_host_task}; cgh.host_task(on_master_node, [] {}); }); @@ -847,6 +839,74 @@ namespace detail { } } + TEST_CASE_METHOD(test_utils::runtime_fixture, "tasks extend the lifetime of buffers and host objects", "[task_manager]") { + std::weak_ptr buffer_state_1; + std::weak_ptr buffer_state_2; + std::weak_ptr buffer_state_3; + std::weak_ptr host_object_state; + + distr_queue q; + auto& tm = runtime::get_instance().get_task_manager(); + tm.set_horizon_step(2); + + std::promise wait_for_this; + + const size_t have_reduction = CELERITY_FEATURE_SCALAR_REDUCTIONS; + + { + // Buffers and host object go out of scope before tasks has completed + buffer buf_1{32}; + buffer buf_2{32}; + buffer buf_3{1}; + experimental::host_object ho; + buffer_state_1 = get_lifetime_extending_state(buf_1); + buffer_state_2 = get_lifetime_extending_state(buf_2); + buffer_state_3 = get_lifetime_extending_state(buf_3); + host_object_state = get_lifetime_extending_state(ho); + q.submit([&](handler& cgh) { + accessor acc{buf_1, cgh, celerity::access::all{}, celerity::write_only_host_task}; + experimental::side_effect se{ho, cgh}; + cgh.host_task(on_master_node, [=, &wait_for_this] { + (void)acc; + (void)se; + wait_for_this.get_future().wait(); + }); + }); + q.submit([&](handler& cgh) { + accessor acc_1{buf_1, cgh, celerity::access::one_to_one{}, celerity::read_only}; + accessor 
acc_2{buf_2, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init}; +#if CELERITY_FEATURE_SCALAR_REDUCTIONS + auto red = reduction(buf_3, cgh, std::plus<>()); + cgh.parallel_for(range<1>{32}, red, [=](item<1>, auto&) { +#else + cgh.parallel_for(range<1>{32}, [=](item<1>) { +#endif + (void)acc_1; + (void)acc_2; + }); + }); + CHECK(buffer_state_1.use_count() == 3); + CHECK(buffer_state_2.use_count() == 2); + CHECK(buffer_state_3.use_count() == 1 + have_reduction); + CHECK(host_object_state.use_count() == 2); + } + + CHECK(buffer_state_1.use_count() == 2); + CHECK(buffer_state_2.use_count() == 1); + CHECK(buffer_state_3.use_count() == have_reduction); + CHECK(host_object_state.use_count() == 1); + wait_for_this.set_value(); + // Now trigger deletion of the task (which should then also delete the buffer and host object). + q.slow_full_sync(); + // We currently only delete tasks when submitting new tasks (and not when creating epochs), so we have to submit a no-op here. + q.submit([](handler& cgh) { cgh.host_task(on_master_node, [] {}); }); + q.slow_full_sync(); + CHECK(buffer_state_1.use_count() == 0); + CHECK(buffer_state_2.use_count() == 0); + CHECK(buffer_state_3.use_count() == 0); + CHECK(host_object_state.use_count() == 0); + } + #ifndef __APPLE__ class restore_process_affinity_fixture { restore_process_affinity_fixture(const restore_process_affinity_fixture&) = delete; @@ -914,7 +974,7 @@ namespace detail { experimental::host_object ref_ho{std::ref(exterior)}; experimental::host_object void_ho; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect append_owned{owned_ho, cgh}; experimental::side_effect append_ref{ref_ho, cgh}; experimental::side_effect track_void{void_ho, cgh}; @@ -924,7 +984,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect append_owned{owned_ho, cgh}; experimental::side_effect append_ref{ref_ho, cgh}; experimental::side_effect track_void{void_ho, cgh}; @@ -934,7 +994,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect check_owned{owned_ho, cgh}; cgh.host_task(on_master_node, [=] { CHECK(*check_owned == std::vector{1, 2}); }); }); @@ -1069,13 +1129,13 @@ namespace detail { experimental::host_object ho{1}; distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect e(ho, cgh); cgh.host_task(on_master_node, [=] { *e = 2; }); }); auto v2 = experimental::fence(q, ho); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect e(ho, cgh); cgh.host_task(on_master_node, [=] { *e = 3; }); }); @@ -1089,7 +1149,7 @@ namespace detail { buffer buf(range<2>(4, 4)); distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc(buf, cgh, all{}, write_only, no_init); cgh.parallel_for(buf.get_range(), [=](celerity::item<2> item) { acc[item] = static_cast(item.get_linear_id()); }); }); @@ -1114,7 +1174,7 @@ namespace detail { buffer buf; distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc(buf, cgh, write_only, no_init); cgh.parallel_for(buf.get_range(), [=](celerity::item<0> item) { *acc = 42; }); }); @@ -1133,20 +1193,20 @@ namespace detail { distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_a(buf_a, cgh, write_only, no_init); cgh.parallel_for(range<0>(), [=](item<0> /* it */) { *acc_a = value_b; }); }); - 
q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_b(buf_b, cgh, write_only, no_init); cgh.parallel_for(nd_range<0>(), [=](nd_item<0> /* it */) { *acc_b = value_b; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_c(buf_c, cgh, write_only_host_task, no_init); cgh.host_task(range<0>(), [=](partition<0> /* part */) { *acc_c = value_b; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc_a(buf_a, cgh, read_only_host_task); accessor acc_b(buf_b, cgh, read_only_host_task); accessor acc_c(buf_c, cgh, read_only_host_task); diff --git a/test/system/distr_tests.cc b/test/system/distr_tests.cc index f38bd8b68..27a937ada 100644 --- a/test/system/distr_tests.cc +++ b/test/system/distr_tests.cc @@ -29,7 +29,7 @@ namespace detail { distr_queue q; const auto initialize_to_identity = cl::sycl::property::reduction::initialize_to_identity{}; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { auto sum_r = reduction(sum_buf, cgh, cl::sycl::plus{}, initialize_to_identity); auto max_r = reduction(max_buf, cgh, size_t{0}, unknown_identity_maximum{}, initialize_to_identity); cgh.parallel_for(range{N}, id{1}, sum_r, max_r, [=](celerity::item<1> item, auto& sum, auto& max) { @@ -38,7 +38,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor sum_acc{sum_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; accessor max_acc{max_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { @@ -62,12 +62,12 @@ namespace detail { const int N = 1000; const int init = 42; buffer sum(&init, range{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for( range{N}, reduction(sum, cgh, cl::sycl::plus{} /* don't initialize to identity */), [=](celerity::item<1> item, auto& sum) { sum += 1; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { CHECK(acc[0] == N + init); }); }); @@ -83,17 +83,17 @@ namespace detail { const int N = 1000; buffer sum(range{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(range{N}, reduction(sum, cgh, cl::sycl::plus{}, cl::sycl::property::reduction::initialize_to_identity{}), [=](celerity::item<1> item, auto& sum) { sum += 1; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(range{N}, reduction(sum, cgh, cl::sycl::plus{} /* include previous reduction result */), [=](celerity::item<1> item, auto& sum) { sum += 2; }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { CHECK(acc[0] == 3 * N); }); }); @@ -110,19 +110,19 @@ namespace detail { const int N = 1000; buffer sum(range{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(range{N}, reduction(sum, cgh, cl::sycl::plus{}, cl::sycl::property::reduction::initialize_to_identity{}), [=](celerity::item<1> item, auto& sum) { sum += static_cast(item.get_linear_id()); }); }); const int expected = (N - 1) * N / 2; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { CHECK(acc[0] == expected); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& 
cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(experimental::collective, [=](experimental::collective_partition p) { INFO("Node " << p.get_node_index()); @@ -146,14 +146,14 @@ namespace detail { { distr_queue q; buffer sum(range{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { cgh.parallel_for(range{100}, reduction(sum, cgh, cl::sycl::plus{}, cl::sycl::property::reduction::initialize_to_identity{}), [](celerity::item<1> item, auto& sum) {}); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; - cgh.host_task(on_master_node, [] {}); + cgh.host_task(on_master_node, [=] { (void)acc; }); }); } // shutdown runtime and print graph @@ -211,7 +211,7 @@ namespace detail { buffer geo(global_range); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor g{geo, cgh, celerity::access::one_to_one{}, write_only, no_init}; cgh.parallel_for>(celerity::nd_range{global_range, local_range}, /* global_offset,*/ [=](nd_item item) { auto group = item.get_group(); @@ -224,7 +224,7 @@ namespace detail { }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor g{geo, cgh, celerity::access::all{}, read_only_host_task}; cgh.host_task(on_master_node, [=] { for(size_t global_linear_id = 0; global_linear_id < global_range.size(); ++global_linear_id) { @@ -268,29 +268,29 @@ namespace detail { constexpr int N = 1000; buffer buff_a(range<1>{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor write_a{buff_a, cgh, celerity::access::all{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); buffer buff_b(range<1>{1}); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor write_b{buff_b, cgh, celerity::access::all{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor read_write_a{buff_a, cgh, celerity::access::all{}, celerity::read_write}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor read_write_a{buff_a, cgh, celerity::access::all{}, celerity::read_write}; accessor read_write_b{buff_b, cgh, celerity::access::all{}, celerity::read_write}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor write_a{buff_a, cgh, celerity::access::all{}, celerity::write_only, celerity::no_init}; cgh.parallel_for(range<1>{N}, [=](celerity::item<1> item) {}); }); @@ -336,7 +336,7 @@ namespace detail { return celerity::access::one_to_one{}(chnk); }; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{buf, cgh, chunk_check_rm, write_only, no_init}; // The kernel has a size of 1 in dimension 0, so it will not be split into // more than one chunk (assuming current naive split behavior). 
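The hunks in this file, like the rest of the patch, switch command group functions from [=] to [&] captures and drop allow_by_ref from submissions. A minimal sketch of the resulting submission pattern follows; the buffer contents, kernel name and header path are illustrative assumptions and not taken from this patch:

#include <celerity.h>

int main() {
	celerity::distr_queue q;
	celerity::buffer<int, 1> buf(64); // hypothetical buffer, for illustration only

	// The buffer is captured into the CGF by reference; only the accessor
	// (a plain value type) is captured into the kernel lambda itself.
	q.submit([&](celerity::handler& cgh) {
		celerity::accessor acc{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
		cgh.parallel_for<class write_linear_ids>(buf.get_range(), [=](celerity::item<1> it) { acc[it] = static_cast<int>(it.get_linear_id()); });
	});

	// Host tasks can capture stack variables by reference without allow_by_ref,
	// mirroring the [=, &host_buff] and [=, &my_int] captures in the updated tests.
	int sum = 0;
	q.submit([&](celerity::handler& cgh) {
		celerity::accessor acc{buf, cgh, celerity::access::all{}, celerity::read_only_host_task};
		cgh.host_task(celerity::on_master_node, [=, &sum] {
			for(size_t i = 0; i < 64; ++i) { sum += acc[i]; }
		});
	});

	q.slow_full_sync(); // wait for the host task to complete before reading sum
	return 0;
}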
@@ -349,11 +349,11 @@ namespace detail { experimental::host_object obj; distr_queue q; - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { experimental::side_effect eff{obj, cgh}; cgh.host_task(experimental::collective, [=](experimental::collective_partition p) { *eff = static_cast(p.get_node_index()); }); }); - q.submit([=](handler& cgh) { + q.submit([&](handler& cgh) { accessor acc{buf, cgh, celerity::access::all{}, write_only_host_task, no_init}; cgh.host_task(on_master_node, [=] { acc[{1, 2, 3}] = 42; }); }); diff --git a/test/task_graph_tests.cc b/test/task_graph_tests.cc index 9c45d7025..44acdd3bf 100644 --- a/test/task_graph_tests.cc +++ b/test/task_graph_tests.cc @@ -1,4 +1,3 @@ -#include "allscale/api/user/data/grid.h" #include "task.h" #include "task_manager.h" #include "types.h" diff --git a/test/task_ring_buffer_tests.cc b/test/task_ring_buffer_tests.cc index 6ed569090..b0a2dd6a2 100644 --- a/test/task_ring_buffer_tests.cc +++ b/test/task_ring_buffer_tests.cc @@ -24,7 +24,7 @@ TEST_CASE_METHOD(test_utils::runtime_fixture, "freeing task ring buffer capacity celerity::buffer dependency{1}; for(size_t i = 0; i < task_ringbuffer_size + 10; ++i) { - q.submit(celerity::allow_by_ref, [=, &reached_ringbuffer_capacity](celerity::handler& cgh) { + q.submit([&](celerity::handler& cgh) { celerity::accessor acc{dependency, cgh, celerity::access::all{}, celerity::read_write_host_task}; cgh.host_task(celerity::on_master_node, [=, &reached_ringbuffer_capacity] { while(!reached_ringbuffer_capacity.load()) diff --git a/test/test_main.cc b/test/test_main.cc index 853e3c924..e9ea739a2 100644 --- a/test/test_main.cc +++ b/test/test_main.cc @@ -26,6 +26,9 @@ int main(int argc, char* argv[]) { struct global_setup_and_teardown : Catch::EventListenerBase { using EventListenerBase::EventListenerBase; + + void testRunStarting(const Catch::TestRunInfo& /* info */) override { celerity::detail::closure_hydrator::make_available(); } + void testCasePartialEnded(const Catch::TestCaseStats&, uint64_t) override { // Reset REQUIRE_LOOP after each test case, section or generator value. 
celerity::test_utils::require_loop_assertion_registry::get_instance().reset(); diff --git a/test/test_utils.h b/test/test_utils.h index 0ebdff599..8a3737d67 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -158,10 +158,7 @@ namespace test_utils { public: template void get_access(handler& cgh, Functor rmfn) { - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = dynamic_cast(cgh); // No live pass in tests - prepass_cgh.add_requirement(m_id, std::make_unique>(rmfn, Mode, m_size)); - } + (void)detail::add_requirement(cgh, m_id, std::make_unique>(rmfn, Mode, m_size)); } detail::buffer_id get_id() const { return m_id; } @@ -179,12 +176,7 @@ namespace test_utils { class mock_host_object { public: - void add_side_effect(handler& cgh, const experimental::side_effect_order order) { - if(detail::is_prepass_handler(cgh)) { - auto& prepass_cgh = static_cast(cgh); - prepass_cgh.add_requirement(m_id, order); - } - } + void add_side_effect(handler& cgh, const experimental::side_effect_order order) { (void)detail::add_requirement(cgh, m_id, order); } detail::host_object_id get_id() const { return m_id; } @@ -412,7 +404,7 @@ namespace test_utils { template void add_reduction(handler& cgh, mock_reduction_factory& mrf, const mock_buffer& vars, bool include_current_buffer_value) { - static_cast(cgh).add_reduction(mrf.create_reduction(vars.get_id(), include_current_buffer_value)); + detail::add_reduction(cgh, mrf.create_reduction(vars.get_id(), include_current_buffer_value)); } // This fixture (or a subclass) must be used by all tests that transitively use MPI.