From 11b9be48220689c99454959b7c9733d2a6fb832f Mon Sep 17 00:00:00 2001 From: Fabian Knorr Date: Thu, 2 Nov 2023 16:48:59 +0100 Subject: [PATCH] Stabilize experimental::fence --- CHANGELOG.md | 1 + docs/issues-and-limitations.md | 2 +- docs/pitfalls.md | 6 +-- examples/distr_io/distr_io.cc | 2 +- examples/hello_world/hello_world.cc | 2 +- examples/matmul/matmul.cc | 2 +- include/distr_queue.h | 40 +++++++++++++++++++ include/fence.h | 61 ++++++++++++----------------- test/runtime_deprecation_tests.cc | 21 ++++++++++ test/runtime_tests.cc | 10 ++--- test/system/distr_tests.cc | 4 +- test/system_benchmarks.cc | 4 +- 12 files changed, 102 insertions(+), 53 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ef8eff20..5e3719c54 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,7 @@ Versioning](http://semver.org/spec/v2.0.0.html). - Introduce new experimental `for_each_item` utility to iterate over a celerity range (#199) - Add new environment variables `CELERITY_HORIZON_STEP` and `CELERITY_HORIZON_MAX_PARALLELISM` to control Horizon generation (#199) - Add new `experimental::constrain_split` API to limit how a kernel can be split (#?) +- `distr_queue::fence` and `buffer_snapshot` are now stable, subsuming the `experimental::` APIs of the same name ## Changed diff --git a/docs/issues-and-limitations.md b/docs/issues-and-limitations.md index 1e1dc03d9..19052a446 100644 --- a/docs/issues-and-limitations.md +++ b/docs/issues-and-limitations.md @@ -36,7 +36,7 @@ for (;;) { [=](celerity::item<1> item, auto& err) { err += ...; }); }); // `fence` will capture buffer contents once all writes have completed - auto future = celerity::experimental::fence(q, error); + auto future = q.fence(error); // optionally submit more work here to avoid stalling the async execution const float err = *future.get(); if (err < epsilon) break; diff --git a/docs/pitfalls.md b/docs/pitfalls.md index 505354ec2..94da5df46 100644 --- a/docs/pitfalls.md +++ b/docs/pitfalls.md @@ -116,9 +116,9 @@ void some_function(celerity::distr_queue& q) { } ``` -> Celerity supports experimental APIs that can replace most if not all uses for reference captures. -> See `celerity::experimental::host_object`, `celerity::experimental::side_effect` and -> `celerity::experimental::fence`. +> Celerity supports APIs that can replace most if not all uses for reference captures. +> See `celerity::distr_queue::fence`, `celerity::experimental::host_object` and +> `celerity::experimental::side_effect`. ## Diverging Host-Execution on Different Nodes diff --git a/examples/distr_io/distr_io.cc b/examples/distr_io/distr_io.cc index 1560ce2f7..37fbc8c73 100644 --- a/examples/distr_io/distr_io.cc +++ b/examples/distr_io/distr_io.cc @@ -153,7 +153,7 @@ int main(int argc, char* argv[]) { }); }); - const bool files_equal = *celerity::experimental::fence(q, equal).get(); + const bool files_equal = *q.fence(equal).get(); fmt::print(stderr, "=> Files are {}equal\n", files_equal ? "" : "NOT "); return files_equal ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/examples/hello_world/hello_world.cc b/examples/hello_world/hello_world.cc index dcd0a183a..83530b00d 100644 --- a/examples/hello_world/hello_world.cc +++ b/examples/hello_world/hello_world.cc @@ -14,6 +14,6 @@ int main() { cgh.parallel_for(str_buffer.get_range(), [=](celerity::item<1> item) { str_acc[item] -= 1; }); }); - auto output = celerity::experimental::fence(queue, str_buffer); + auto output = queue.fence(str_buffer); std::cout << output.get().get_data() << std::endl; } diff --git a/examples/matmul/matmul.cc b/examples/matmul/matmul.cc index 9e88a5b49..fd23752b7 100644 --- a/examples/matmul/matmul.cc +++ b/examples/matmul/matmul.cc @@ -107,6 +107,6 @@ int main() { verify(queue, mat_a_buf, passed_obj); // The value of `passed` can differ between hosts if only part of the verification failed. - const bool passed = celerity::experimental::fence(queue, passed_obj).get(); + const bool passed = queue.fence(passed_obj).get(); return passed ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/include/distr_queue.h b/include/distr_queue.h index 7a00e8f65..5f79651b1 100644 --- a/include/distr_queue.h +++ b/include/distr_queue.h @@ -7,7 +7,18 @@ #include "runtime.h" #include "task_manager.h" +namespace celerity::experimental { + +template +class host_object; + +} + namespace celerity { + +template +class buffer_snapshot; + namespace detail { class distr_queue_tracker { @@ -75,6 +86,35 @@ class distr_queue { */ void slow_full_sync() { detail::runtime::get_instance().sync(); } // NOLINT(readability-convert-member-functions-to-static) + /** + * Asynchronously captures the value of a host object by copy, introducing the same dependencies as a side-effect would. + * + * Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between + * fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed. + */ + template + [[nodiscard]] std::future fence(const experimental::host_object& obj); + + /** + * Asynchronously captures the contents of a buffer subrange, introducing the same dependencies as a read-accessor would. + * + * Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between + * fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed. + */ + template + [[nodiscard]] std::future> fence(const buffer& buf, const subrange& sr); + + /** + * Asynchronously captures the contents of an entire buffer, introducing the same dependencies as a read-accessor would. + * + * Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between + * fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed. + */ + template + [[nodiscard]] std::future> fence(const buffer& buf) { + return fence(buf, {{}, buf.get_range()}); + } + private: std::shared_ptr m_tracker; diff --git a/include/fence.h b/include/fence.h index 626dbcbe6..e9b608335 100644 --- a/include/fence.h +++ b/include/fence.h @@ -3,25 +3,23 @@ #include #include -#include "accessor.h" -#include "buffer_manager.h" +#include "buffer_storage.h" +#include "distr_queue.h" #include "host_object.h" #include "runtime.h" #include "task_manager.h" -namespace celerity { -class distr_queue; -} - namespace celerity::detail { + template class buffer_fence_promise; -} -namespace celerity::experimental { +} // namespace celerity::detail + +namespace celerity { /** - * Owned representation of buffer contents as captured by celerity::experimental::fence. + * Owned representation of buffer contents as captured by celerity::fence. */ template class buffer_snapshot { @@ -65,7 +63,7 @@ class buffer_snapshot { explicit buffer_snapshot(subrange sr, std::unique_ptr data) : m_subrange(sr), m_data(std::move(data)) {} }; -} // namespace celerity::experimental +} // namespace celerity namespace celerity::detail { @@ -88,7 +86,7 @@ class buffer_fence_promise final : public detail::fence_promise { public: explicit buffer_fence_promise(const buffer& buf, const subrange& sr) : m_buffer(buf), m_subrange(sr) {} - std::future> get_future() { return m_promise.get_future(); } + std::future> get_future() { return m_promise.get_future(); } void fulfill() override { const auto access_info = @@ -97,27 +95,21 @@ class buffer_fence_promise final : public detail::fence_promise { auto data = std::make_unique(m_subrange.range.size()); memcpy_strided_host(access_info.ptr, data.get(), sizeof(DataT), range_cast(access_info.backing_buffer_range), m_subrange.offset - id_cast(access_info.backing_buffer_offset), m_subrange.range, {}, m_subrange.range); - m_promise.set_value(experimental::buffer_snapshot(m_subrange, std::move(data))); + m_promise.set_value(buffer_snapshot(m_subrange, std::move(data))); } private: buffer m_buffer; subrange m_subrange; - std::promise> m_promise; + std::promise> m_promise; }; } // namespace celerity::detail -namespace celerity::experimental { +namespace celerity { -/** - * Asynchronously captures the value of a host object by copy, introducing the same dependencies as a side-effect would. - * - * Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between - * fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed. - */ template -[[nodiscard]] std::future fence(celerity::distr_queue& /* unused */, const experimental::host_object& obj) { +std::future distr_queue::fence(const experimental::host_object& obj) { static_assert(std::is_object_v, "host_object and host_object are not allowed as parameters to fence()"); detail::side_effect_map side_effects; @@ -128,14 +120,8 @@ template return future; } -/** - * Asynchronously captures the contents of a buffer subrange, introducing the same dependencies as a read-accessor would. - * - * Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between - * fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed. - */ template -[[nodiscard]] std::future> fence(celerity::distr_queue& /* unused */, const buffer& buf, const subrange& sr) { +std::future> distr_queue::fence(const buffer& buf, const subrange& sr) { detail::buffer_access_map access_map; access_map.add_access(detail::get_buffer_id(buf), std::make_unique>>(celerity::access::fixed(sr), access_mode::read, buf.get_range())); @@ -145,15 +131,16 @@ template return future; } -/** - * Asynchronously captures the contents of an entire buffer, introducing the same dependencies as a read-accessor would. - * - * Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between - * fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed. - */ -template -[[nodiscard]] std::future> fence(celerity::distr_queue& q, const buffer& buf) { - return fence(q, buf, {{}, buf.get_range()}); +} // namespace celerity + +namespace celerity::experimental { + +template +using buffer_snapshot [[deprecated("buffer_snapshot is no longer experimental, use celerity::buffer_snapshot")]] = celerity::buffer_snapshot; + +template +[[deprecated("fence is no longer experimental, use celerity::distr_queue::fence")]] [[nodiscard]] auto fence(celerity::distr_queue& q, const Params&... args) { + return q.fence(args...); } } // namespace celerity::experimental diff --git a/test/runtime_deprecation_tests.cc b/test/runtime_deprecation_tests.cc index 0cecfc6cd..c0a3ecb62 100644 --- a/test/runtime_deprecation_tests.cc +++ b/test/runtime_deprecation_tests.cc @@ -45,5 +45,26 @@ namespace detail { SUCCEED(); } + TEST_CASE_METHOD(test_utils::runtime_fixture, "experimental::fence continues to work", "[deprecated]") { + distr_queue q; + + std::vector init(16); + std::iota(init.begin(), init.end(), 0); + buffer buf(init.data(), init.size()); + + experimental::host_object ho(42); + + experimental::buffer_snapshot full_snapshot = experimental::fence(q, buf).get(); + experimental::buffer_snapshot partial_snapshot = experimental::fence(q, buf, subrange<1>(8, 8)).get(); + int ho_value = experimental::fence(q, ho).get(); + + CHECK(full_snapshot.get_range() == range<1>(16)); + CHECK(std::equal(init.begin(), init.end(), full_snapshot.get_data())); + CHECK(partial_snapshot.get_range() == range<1>(8)); + CHECK(partial_snapshot.get_offset() == id<1>(8)); + CHECK(std::equal(init.begin() + 8, init.end(), partial_snapshot.get_data())); + CHECK(ho_value == 42); + } + } // namespace detail } // namespace celerity diff --git a/test/runtime_tests.cc b/test/runtime_tests.cc index 07cf515a4..35e808ff4 100644 --- a/test/runtime_tests.cc +++ b/test/runtime_tests.cc @@ -1257,7 +1257,7 @@ namespace detail { cgh.host_task(on_master_node, [=] { acc = true; }); }); - auto ret = experimental::fence(q, buf); + auto ret = q.fence(buf); REQUIRE(ret.wait_for(std::chrono::seconds(1)) == std::future_status::ready); CHECK_FALSE(*ret.get()); // extra check that the task was not actually executed @@ -1366,13 +1366,13 @@ namespace detail { experimental::side_effect e(ho, cgh); cgh.host_task(on_master_node, [=] { *e = 2; }); }); - auto v2 = experimental::fence(q, ho); + auto v2 = q.fence(ho); q.submit([&](handler& cgh) { experimental::side_effect e(ho, cgh); cgh.host_task(on_master_node, [=] { *e = 3; }); }); - auto v3 = experimental::fence(q, ho); + auto v3 = q.fence(ho); CHECK(v2.get() == 2); CHECK(v3.get() == 3); @@ -1388,7 +1388,7 @@ namespace detail { }); const auto check_snapshot = [&](const subrange<2>& sr, const std::vector& expected_data) { - const auto snapshot = experimental::fence(q, buf, sr).get(); + const auto snapshot = q.fence(buf, sr).get(); CHECK(snapshot.get_subrange() == sr); CHECK(memcmp(snapshot.get_data(), expected_data.data(), expected_data.size() * sizeof(int)) == 0); }; @@ -1412,7 +1412,7 @@ namespace detail { cgh.parallel_for(buf.get_range(), [=](celerity::item<0> item) { *acc = 42; }); }); - const auto snapshot = experimental::fence(q, buf).get(); + const auto snapshot = q.fence(buf).get(); CHECK(*snapshot == 42); } diff --git a/test/system/distr_tests.cc b/test/system/distr_tests.cc index 4f9aac429..2ff73cd04 100644 --- a/test/system/distr_tests.cc +++ b/test/system/distr_tests.cc @@ -362,8 +362,8 @@ namespace detail { cgh.host_task(on_master_node, [=] { acc[{1, 2, 3}] = 42; }); }); - const auto gathered_from_master = experimental::fence(q, buf, subrange<3>({1, 2, 3}, {1, 1, 1})).get(); - const auto host_rank = experimental::fence(q, obj).get(); + const auto gathered_from_master = q.fence(buf, subrange<3>({1, 2, 3}, {1, 1, 1})).get(); + const auto host_rank = q.fence(obj).get(); REQUIRE(gathered_from_master.get_range() == range<3>{1, 1, 1}); CHECK(gathered_from_master[0][0][0] == 42); diff --git a/test/system_benchmarks.cc b/test/system_benchmarks.cc index a98f1ed0e..23415e83c 100644 --- a/test/system_benchmarks.cc +++ b/test/system_benchmarks.cc @@ -65,7 +65,7 @@ TEMPLATE_TEST_CASE_METHOD_SIG( }); }); }); - CHECK(*experimental::fence(queue, success_buffer).get() == true); + CHECK(*queue.fence(success_buffer).get() == true); } TEMPLATE_TEST_CASE_METHOD_SIG( @@ -135,5 +135,5 @@ TEMPLATE_TEST_CASE_METHOD_SIG( }); }); }); - CHECK(*experimental::fence(queue, success_buffer).get() == true); + CHECK(*queue.fence(success_buffer).get() == true); }