Skip to content

Commit

Permalink
Remove multi-pass mechanism (invoke CGFs only once)
Browse files Browse the repository at this point in the history
- Instead of storing the entire CGF, we now only store the inner "command function" lambda
- Accessors captured into command function closures are "hydrated" before launching kernel
- Buffers and host objects can (and should) now be captured by reference into CGFs; added deprecation warnings
   - Accessors, side-effects and reductions may now be created from non-const buffers
- Deprecate allow_by_ref
- Introduce new mechnaism to tie buffer and host object lifetimes to
  tasks (required as we no longer store captured copies inside the CGF)

Other changes:
- Fix a bug in test "horizons correctly deal with antidependencies",
  which relied on fixed order of dependencies
- Change output of "command graph printing is unchanged" smoke test
  (again due to change in ordering)
  • Loading branch information
psalz committed May 24, 2023
1 parent 3e6d9eb commit 0a743c7
Show file tree
Hide file tree
Showing 38 changed files with 1,309 additions and 759 deletions.
6 changes: 3 additions & 3 deletions examples/convolution/convolution.cc
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ int main(int argc, char* argv[]) {
celerity::buffer<float, 2> gaussian_mat_buf(gaussian_matrix.data(), celerity::range<2>(filter_size, filter_size));

// Do a gaussian blur
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor in{image_input_buf, cgh, celerity::access::neighborhood{filter_size / 2, filter_size / 2}, celerity::read_only};
celerity::accessor gauss{gaussian_mat_buf, cgh, celerity::access::all{}, celerity::read_only};
celerity::accessor out{image_tmp_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
Expand All @@ -85,7 +85,7 @@ int main(int argc, char* argv[]) {
celerity::buffer<sycl::float3, 2> image_output_buf(celerity::range<2>(image_height, image_width));

// Now apply a sharpening kernel
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor in{image_tmp_buf, cgh, celerity::access::neighborhood{1, 1}, celerity::read_only};
celerity::accessor out{image_output_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};

Expand All @@ -107,7 +107,7 @@ int main(int argc, char* argv[]) {
});
});

queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor out{image_output_buf, cgh, celerity::access::all{}, celerity::read_only_host_task};

cgh.host_task(celerity::on_master_node, [=] {
Expand Down
12 changes: 6 additions & 6 deletions examples/distr_io/distr_io.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ static std::pair<hid_t, hid_t> allocation_window_to_dataspace(const celerity::bu
}


static void read_hdf5_file(celerity::distr_queue& q, const celerity::buffer<float, 2>& buffer, const char* file_name) {
q.submit([=](celerity::handler& cgh) {
static void read_hdf5_file(celerity::distr_queue& q, celerity::buffer<float, 2>& buffer, const char* file_name) {
q.submit([&](celerity::handler& cgh) {
celerity::accessor a{buffer, cgh, celerity::experimental::access::even_split<2>{}, celerity::write_only_host_task, celerity::no_init};
cgh.host_task(celerity::experimental::collective, [=](celerity::experimental::collective_partition part) {
auto plist = H5Pcreate(H5P_FILE_ACCESS);
Expand All @@ -54,8 +54,8 @@ static void read_hdf5_file(celerity::distr_queue& q, const celerity::buffer<floa
}


static void write_hdf5_file(celerity::distr_queue& q, const celerity::buffer<float, 2>& buffer, const char* file_name) {
q.submit([=](celerity::handler& cgh) {
static void write_hdf5_file(celerity::distr_queue& q, celerity::buffer<float, 2>& buffer, const char* file_name) {
q.submit([&](celerity::handler& cgh) {
celerity::accessor a{buffer, cgh, celerity::experimental::access::even_split<2>{}, celerity::read_only_host_task};
cgh.host_task(celerity::experimental::collective, [=](celerity::experimental::collective_partition part) {
auto plist = H5Pcreate(H5P_FILE_ACCESS);
Expand Down Expand Up @@ -116,7 +116,7 @@ int main(int argc, char* argv[]) {

read_hdf5_file(q, in, argv[2]);

q.submit([=](celerity::handler& cgh) {
q.submit([&](celerity::handler& cgh) {
celerity::accessor a{in, cgh, celerity::access::one_to_one{}, celerity::read_only};
celerity::accessor b{out, cgh, transposed, celerity::write_only, celerity::no_init};
cgh.parallel_for<class transpose>(celerity::range<2>{N, N}, [=](celerity::item<2> item) {
Expand All @@ -139,7 +139,7 @@ int main(int argc, char* argv[]) {
read_hdf5_file(q, left, argv[2]);
read_hdf5_file(q, right, argv[3]);

q.submit([=](celerity::handler& cgh) {
q.submit([&](celerity::handler& cgh) {
celerity::accessor a{left, cgh, celerity::access::all{}, celerity::read_only_host_task};
celerity::accessor b{right, cgh, celerity::access::all{}, celerity::read_only_host_task};
celerity::accessor e{equal, cgh, celerity::write_only_host_task, celerity::no_init};
Expand Down
6 changes: 3 additions & 3 deletions examples/matmul/matmul.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ const size_t MAT_SIZE = 1024;

template <typename T>
void set_identity(celerity::distr_queue queue, celerity::buffer<T, 2> mat) {
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor dw{mat, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
cgh.parallel_for<class set_identity_kernel>(mat.get_range(), [=](celerity::item<2> item) { dw[item] = item[0] == item[1]; });
});
}

template <typename T>
void multiply(celerity::distr_queue queue, celerity::buffer<T, 2> mat_a, celerity::buffer<T, 2> mat_b, celerity::buffer<T, 2> mat_c) {
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor a{mat_a, cgh, celerity::access::slice<2>(1), celerity::read_only};
celerity::accessor b{mat_b, cgh, celerity::access::slice<2>(0), celerity::read_only};
celerity::accessor c{mat_c, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
Expand Down Expand Up @@ -48,7 +48,7 @@ void multiply(celerity::distr_queue queue, celerity::buffer<T, 2> mat_a, celerit
// TODO this should really reduce into a buffer<bool> on the device, but not all backends currently support reductions
template <typename T>
void verify(celerity::distr_queue& queue, celerity::buffer<T, 2> mat_c, celerity::experimental::host_object<bool> passed_obj) {
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor c{mat_c, cgh, celerity::access::one_to_one{}, celerity::read_only_host_task};
celerity::experimental::side_effect passed{passed_obj, cgh};

Expand Down
8 changes: 4 additions & 4 deletions examples/reduction/reduction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ int main(int argc, char* argv[]) {
celerity::buffer<sycl::float4, 2> lab_buf{image_size};
celerity::buffer<sycl::float2, 1> minmax_buf{celerity::range{1}};

q.submit([=](celerity::handler& cgh) {
q.submit([&](celerity::handler& cgh) {
celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::one_to_one{}, celerity::read_only};
celerity::accessor rgb_acc{lab_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
auto minmax_r = celerity::reduction(minmax_buf, cgh, minmax_identity, minmax, celerity::property::reduction::initialize_to_identity{});
Expand All @@ -86,12 +86,12 @@ int main(int argc, char* argv[]) {
});
});

q.submit([=](celerity::handler& cgh) {
q.submit([&](celerity::handler& cgh) {
celerity::accessor minmax_acc{minmax_buf, cgh, celerity::access::all{}, celerity::read_only_host_task};
cgh.host_task(celerity::on_master_node, [=] { printf("Before contrast stretch: min = %f, max = %f\n", minmax_acc[0][0], minmax_acc[0][1]); });
});

q.submit([=](celerity::handler& cgh) {
q.submit([&](celerity::handler& cgh) {
celerity::accessor rgb_acc{lab_buf, cgh, celerity::access::one_to_one{}, celerity::read_only};
celerity::accessor minmax_acc{minmax_buf, cgh, celerity::access::all{}, celerity::read_only};
celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
Expand All @@ -105,7 +105,7 @@ int main(int argc, char* argv[]) {
});
});

q.submit([=](celerity::handler& cgh) {
q.submit([&](celerity::handler& cgh) {
celerity::accessor srgb_255_acc{srgb_255_buf, cgh, celerity::access::all{}, celerity::read_only_host_task};
cgh.host_task(celerity::on_master_node, [=] { stbi_write_jpg("output.jpg", image_width, image_height, 4, srgb_255_acc.get_pointer(), 90); });
});
Expand Down
4 changes: 2 additions & 2 deletions examples/syncing/syncing.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ int main(int argc, char* argv[]) {
celerity::buffer<size_t, 1> buf(buf_size);

// Initialize buffer in a distributed device kernel
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor b{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
cgh.parallel_for<class write_linear_id>(buf.get_range(), [=](celerity::item<1> item) { b[item] = item.get_linear_id(); });
});

// Process values on the host
std::vector<size_t> host_buf(buf_size);
queue.submit(celerity::allow_by_ref, [=, &host_buf](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor b{buf, cgh, celerity::access::all{}, celerity::read_only_host_task};
cgh.host_task(celerity::experimental::collective, [=, &host_buf](celerity::experimental::collective_partition) {
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // give the synchronization more time to fail
Expand Down
12 changes: 6 additions & 6 deletions examples/wave_sim/wave_sim.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include <celerity.h>

void setup_wave(celerity::distr_queue& queue, celerity::buffer<float, 2> u, sycl::float2 center, float amplitude, sycl::float2 sigma) {
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor dw_u{u, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
cgh.parallel_for<class setup_wave>(u.get_range(), [=, c = center, a = amplitude, s = sigma](celerity::item<2> item) {
const float dx = item[1] - c.x();
Expand All @@ -17,7 +17,7 @@ void setup_wave(celerity::distr_queue& queue, celerity::buffer<float, 2> u, sycl
}

void zero(celerity::distr_queue& queue, celerity::buffer<float, 2> buf) {
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor dw_buf{buf, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};
cgh.parallel_for<class zero>(buf.get_range(), [=](celerity::item<2> item) { dw_buf[item] = 0.f; });
});
Expand All @@ -37,7 +37,7 @@ struct update_config {

template <typename T, typename Config, typename KernelName>
void step(celerity::distr_queue& queue, celerity::buffer<T, 2> up, celerity::buffer<T, 2> u, float dt, sycl::float2 delta) {
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor rw_up{up, cgh, celerity::access::one_to_one{}, celerity::read_write};
celerity::accessor r_u{u, cgh, celerity::access::neighborhood{1, 1}, celerity::read_only};

Expand All @@ -64,7 +64,7 @@ void update(celerity::distr_queue& queue, celerity::buffer<float, 2> up, celerit
}

void stream_open(celerity::distr_queue& queue, size_t N, size_t num_samples, celerity::experimental::host_object<std::ofstream> os) {
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::experimental::side_effect os_eff{os, cgh};
cgh.host_task(celerity::on_master_node, [=] {
os_eff->open("wave_sim_result.bin", std::ios_base::out | std::ios_base::binary);
Expand All @@ -77,15 +77,15 @@ void stream_open(celerity::distr_queue& queue, size_t N, size_t num_samples, cel
template <typename T>
void stream_append(celerity::distr_queue& queue, celerity::buffer<T, 2> up, celerity::experimental::host_object<std::ofstream> os) {
const auto range = up.get_range();
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::accessor up_r{up, cgh, celerity::access::all{}, celerity::read_only_host_task};
celerity::experimental::side_effect os_eff{os, cgh};
cgh.host_task(celerity::on_master_node, [=] { os_eff->write(reinterpret_cast<const char*>(up_r.get_pointer()), range.size() * sizeof(T)); });
});
}

void stream_close(celerity::distr_queue& queue, celerity::experimental::host_object<std::ofstream> os) {
queue.submit([=](celerity::handler& cgh) {
queue.submit([&](celerity::handler& cgh) {
celerity::experimental::side_effect os_eff{os, cgh};
cgh.host_task(celerity::on_master_node, [=] { os_eff->close(); });
});
Expand Down
Loading

0 comments on commit 0a743c7

Please sign in to comment.