Skip to content

Commit

Permalink
Stabilize experimental::fence
Browse files Browse the repository at this point in the history
  • Loading branch information
fknorr committed Nov 2, 2023
1 parent 7dcffd3 commit 3115080
Show file tree
Hide file tree
Showing 12 changed files with 102 additions and 53 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ Versioning](http://semver.org/spec/v2.0.0.html).
- Introduce new experimental `for_each_item` utility to iterate over a celerity range (#199)
- Add new environment variables `CELERITY_HORIZON_STEP` and `CELERITY_HORIZON_MAX_PARALLELISM` to control Horizon generation (#199)
- Add new `experimental::constrain_split` API to limit how a kernel can be split (#?)
- `distr_queue::fence` and `buffer_snapshot` are now stable, subsuming the `experimental::` APIs of the same name (#225)

## Changed

Expand Down
2 changes: 1 addition & 1 deletion docs/issues-and-limitations.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ for (;;) {
[=](celerity::item<1> item, auto& err) { err += ...; });
});
// `fence` will capture buffer contents once all writes have completed
auto future = celerity::experimental::fence(q, error);
auto future = q.fence(error);
// optionally submit more work here to avoid stalling the async execution
const float err = *future.get();
if (err < epsilon) break;
Expand Down
6 changes: 3 additions & 3 deletions docs/pitfalls.md
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,9 @@ void some_function(celerity::distr_queue& q) {
}
```

> Celerity supports experimental APIs that can replace most if not all uses for reference captures.
> See `celerity::experimental::host_object`, `celerity::experimental::side_effect` and
> `celerity::experimental::fence`.
> Celerity supports APIs that can replace most if not all uses for reference captures.
> See `celerity::distr_queue::fence`, `celerity::experimental::host_object` and
> `celerity::experimental::side_effect`.
## Diverging Host-Execution on Different Nodes

Expand Down
2 changes: 1 addition & 1 deletion examples/distr_io/distr_io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ int main(int argc, char* argv[]) {
});
});

const bool files_equal = *celerity::experimental::fence(q, equal).get();
const bool files_equal = *q.fence(equal).get();
fmt::print(stderr, "=> Files are {}equal\n", files_equal ? "" : "NOT ");
return files_equal ? EXIT_SUCCESS : EXIT_FAILURE;
}
Expand Down
2 changes: 1 addition & 1 deletion examples/hello_world/hello_world.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,6 @@ int main() {
cgh.parallel_for(str_buffer.get_range(), [=](celerity::item<1> item) { str_acc[item] -= 1; });
});

auto output = celerity::experimental::fence(queue, str_buffer);
auto output = queue.fence(str_buffer);
std::cout << output.get().get_data() << std::endl;
}
2 changes: 1 addition & 1 deletion examples/matmul/matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,6 @@ int main() {
verify(queue, mat_a_buf, passed_obj);

// The value of `passed` can differ between hosts if only part of the verification failed.
const bool passed = celerity::experimental::fence(queue, passed_obj).get();
const bool passed = queue.fence(passed_obj).get();
return passed ? EXIT_SUCCESS : EXIT_FAILURE;
}
40 changes: 40 additions & 0 deletions include/distr_queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,18 @@
#include "runtime.h"
#include "task_manager.h"

namespace celerity::experimental {

template <typename T>
class host_object;

}

namespace celerity {

template <typename T, int Dims>
class buffer_snapshot;

namespace detail {

class distr_queue_tracker {
Expand Down Expand Up @@ -75,6 +86,35 @@ class distr_queue {
*/
void slow_full_sync() { detail::runtime::get_instance().sync(); } // NOLINT(readability-convert-member-functions-to-static)

/**
* Asynchronously captures the value of a host object by copy, introducing the same dependencies as a side-effect would.
*
* Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between
* fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed.
*/
template <typename T>
[[nodiscard]] std::future<T> fence(const experimental::host_object<T>& obj);

/**
* Asynchronously captures the contents of a buffer subrange, introducing the same dependencies as a read-accessor would.
*
* Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between
* fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed.
*/
template <typename DataT, int Dims>
[[nodiscard]] std::future<buffer_snapshot<DataT, Dims>> fence(const buffer<DataT, Dims>& buf, const subrange<Dims>& sr);

/**
* Asynchronously captures the contents of an entire buffer, introducing the same dependencies as a read-accessor would.
*
* Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between
* fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed.
*/
template <typename DataT, int Dims>
[[nodiscard]] std::future<buffer_snapshot<DataT, Dims>> fence(const buffer<DataT, Dims>& buf) {
return fence(buf, {{}, buf.get_range()});
}

private:
std::shared_ptr<detail::distr_queue_tracker> m_tracker;

Expand Down
61 changes: 24 additions & 37 deletions include/fence.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,25 +3,23 @@
#include <memory>
#include <type_traits>

#include "accessor.h"
#include "buffer_manager.h"
#include "buffer_storage.h"
#include "distr_queue.h"
#include "host_object.h"
#include "runtime.h"
#include "task_manager.h"

namespace celerity {
class distr_queue;
}

namespace celerity::detail {

template <typename DataT, int Dims>
class buffer_fence_promise;
}

namespace celerity::experimental {
} // namespace celerity::detail

namespace celerity {

/**
* Owned representation of buffer contents as captured by celerity::experimental::fence.
* Owned representation of buffer contents as captured by celerity::distr_queue::fence.
*/
template <typename T, int Dims>
class buffer_snapshot {
Expand Down Expand Up @@ -65,7 +63,7 @@ class buffer_snapshot {
explicit buffer_snapshot(subrange<Dims> sr, std::unique_ptr<T[]> data) : m_subrange(sr), m_data(std::move(data)) {}
};

} // namespace celerity::experimental
} // namespace celerity

namespace celerity::detail {

Expand All @@ -88,7 +86,7 @@ class buffer_fence_promise final : public detail::fence_promise {
public:
explicit buffer_fence_promise(const buffer<DataT, Dims>& buf, const subrange<Dims>& sr) : m_buffer(buf), m_subrange(sr) {}

std::future<experimental::buffer_snapshot<DataT, Dims>> get_future() { return m_promise.get_future(); }
std::future<buffer_snapshot<DataT, Dims>> get_future() { return m_promise.get_future(); }

void fulfill() override {
const auto access_info =
Expand All @@ -97,27 +95,21 @@ class buffer_fence_promise final : public detail::fence_promise {
auto data = std::make_unique<DataT[]>(m_subrange.range.size());
memcpy_strided_host(access_info.ptr, data.get(), sizeof(DataT), range_cast<Dims>(access_info.backing_buffer_range),
m_subrange.offset - id_cast<Dims>(access_info.backing_buffer_offset), m_subrange.range, {}, m_subrange.range);
m_promise.set_value(experimental::buffer_snapshot<DataT, Dims>(m_subrange, std::move(data)));
m_promise.set_value(buffer_snapshot<DataT, Dims>(m_subrange, std::move(data)));
}

private:
buffer<DataT, Dims> m_buffer;
subrange<Dims> m_subrange;
std::promise<experimental::buffer_snapshot<DataT, Dims>> m_promise;
std::promise<buffer_snapshot<DataT, Dims>> m_promise;
};

} // namespace celerity::detail

namespace celerity::experimental {
namespace celerity {

/**
* Asynchronously captures the value of a host object by copy, introducing the same dependencies as a side-effect would.
*
* Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between
* fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed.
*/
template <typename T>
[[nodiscard]] std::future<T> fence(celerity::distr_queue& /* unused */, const experimental::host_object<T>& obj) {
std::future<T> distr_queue::fence(const experimental::host_object<T>& obj) {
static_assert(std::is_object_v<T>, "host_object<T&> and host_object<void> are not allowed as parameters to fence()");

detail::side_effect_map side_effects;
Expand All @@ -128,14 +120,8 @@ template <typename T>
return future;
}

/**
* Asynchronously captures the contents of a buffer subrange, introducing the same dependencies as a read-accessor would.
*
* Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between
* fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed.
*/
template <typename DataT, int Dims>
[[nodiscard]] std::future<buffer_snapshot<DataT, Dims>> fence(celerity::distr_queue& /* unused */, const buffer<DataT, Dims>& buf, const subrange<Dims>& sr) {
std::future<buffer_snapshot<DataT, Dims>> distr_queue::fence(const buffer<DataT, Dims>& buf, const subrange<Dims>& sr) {
detail::buffer_access_map access_map;
access_map.add_access(detail::get_buffer_id(buf),
std::make_unique<detail::range_mapper<Dims, celerity::access::fixed<Dims>>>(celerity::access::fixed<Dims>(sr), access_mode::read, buf.get_range()));
Expand All @@ -145,15 +131,16 @@ template <typename DataT, int Dims>
return future;
}

/**
* Asynchronously captures the contents of an entire buffer, introducing the same dependencies as a read-accessor would.
*
* Waiting on the returned future in the application thread can stall scheduling of more work. To hide latency, either submit more command groups between
* fence and wait operations or ensure that other independent command groups are eligible to run while the fence is executed.
*/
template <typename DataT, int Dims>
[[nodiscard]] std::future<buffer_snapshot<DataT, Dims>> fence(celerity::distr_queue& q, const buffer<DataT, Dims>& buf) {
return fence(q, buf, {{}, buf.get_range()});
} // namespace celerity

namespace celerity::experimental {

template <typename T, int Dims>
using buffer_snapshot [[deprecated("buffer_snapshot is no longer experimental, use celerity::buffer_snapshot")]] = celerity::buffer_snapshot<T, Dims>;

template <typename... Params>
[[deprecated("fence is no longer experimental, use celerity::distr_queue::fence")]] [[nodiscard]] auto fence(celerity::distr_queue& q, const Params&... args) {
return q.fence(args...);
}

} // namespace celerity::experimental
21 changes: 21 additions & 0 deletions test/runtime_deprecation_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,26 @@ namespace detail {
SUCCEED();
}

TEST_CASE_METHOD(test_utils::runtime_fixture, "experimental::fence continues to work", "[deprecated][fence]") {
distr_queue q;

std::vector<int> init(16);
std::iota(init.begin(), init.end(), 0);
buffer<int, 1> buf(init.data(), init.size());

experimental::host_object<int> ho(42);

experimental::buffer_snapshot<int, 1> full_snapshot = experimental::fence(q, buf).get();
experimental::buffer_snapshot<int, 1> partial_snapshot = experimental::fence(q, buf, subrange<1>(8, 8)).get();
int ho_value = experimental::fence(q, ho).get();

CHECK(full_snapshot.get_range() == range<1>(16));
CHECK(std::equal(init.begin(), init.end(), full_snapshot.get_data()));
CHECK(partial_snapshot.get_range() == range<1>(8));
CHECK(partial_snapshot.get_offset() == id<1>(8));
CHECK(std::equal(init.begin() + 8, init.end(), partial_snapshot.get_data()));
CHECK(ho_value == 42);
}

} // namespace detail
} // namespace celerity
10 changes: 5 additions & 5 deletions test/runtime_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1257,7 +1257,7 @@ namespace detail {
cgh.host_task(on_master_node, [=] { acc = true; });
});

auto ret = experimental::fence(q, buf);
auto ret = q.fence(buf);
REQUIRE(ret.wait_for(std::chrono::seconds(1)) == std::future_status::ready);
CHECK_FALSE(*ret.get()); // extra check that the task was not actually executed

Expand Down Expand Up @@ -1366,13 +1366,13 @@ namespace detail {
experimental::side_effect e(ho, cgh);
cgh.host_task(on_master_node, [=] { *e = 2; });
});
auto v2 = experimental::fence(q, ho);
auto v2 = q.fence(ho);

q.submit([&](handler& cgh) {
experimental::side_effect e(ho, cgh);
cgh.host_task(on_master_node, [=] { *e = 3; });
});
auto v3 = experimental::fence(q, ho);
auto v3 = q.fence(ho);

CHECK(v2.get() == 2);
CHECK(v3.get() == 3);
Expand All @@ -1388,7 +1388,7 @@ namespace detail {
});

const auto check_snapshot = [&](const subrange<2>& sr, const std::vector<int>& expected_data) {
const auto snapshot = experimental::fence(q, buf, sr).get();
const auto snapshot = q.fence(buf, sr).get();
CHECK(snapshot.get_subrange() == sr);
CHECK(memcmp(snapshot.get_data(), expected_data.data(), expected_data.size() * sizeof(int)) == 0);
};
Expand All @@ -1412,7 +1412,7 @@ namespace detail {
cgh.parallel_for<class UKN(init)>(buf.get_range(), [=](celerity::item<0> item) { *acc = 42; });
});

const auto snapshot = experimental::fence(q, buf).get();
const auto snapshot = q.fence(buf).get();
CHECK(*snapshot == 42);
}

Expand Down
4 changes: 2 additions & 2 deletions test/system/distr_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -362,8 +362,8 @@ namespace detail {
cgh.host_task(on_master_node, [=] { acc[{1, 2, 3}] = 42; });
});

const auto gathered_from_master = experimental::fence(q, buf, subrange<3>({1, 2, 3}, {1, 1, 1})).get();
const auto host_rank = experimental::fence(q, obj).get();
const auto gathered_from_master = q.fence(buf, subrange<3>({1, 2, 3}, {1, 1, 1})).get();
const auto host_rank = q.fence(obj).get();

REQUIRE(gathered_from_master.get_range() == range<3>{1, 1, 1});
CHECK(gathered_from_master[0][0][0] == 42);
Expand Down
4 changes: 2 additions & 2 deletions test/system_benchmarks.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ TEMPLATE_TEST_CASE_METHOD_SIG(
});
});
});
CHECK(*experimental::fence(queue, success_buffer).get() == true);
CHECK(*queue.fence(success_buffer).get() == true);
}

TEMPLATE_TEST_CASE_METHOD_SIG(
Expand Down Expand Up @@ -135,5 +135,5 @@ TEMPLATE_TEST_CASE_METHOD_SIG(
});
});
});
CHECK(*experimental::fence(queue, success_buffer).get() == true);
CHECK(*queue.fence(success_buffer).get() == true);
}

0 comments on commit 3115080

Please sign in to comment.