diff --git a/include/buffer_manager.h b/include/buffer_manager.h index 8cc25febb..4429ac7b4 100644 --- a/include/buffer_manager.h +++ b/include/buffer_manager.h @@ -14,6 +14,7 @@ #include "access_modes.h" #include "buffer_storage.h" #include "device_queue.h" +#include "mpi_support.h" #include "ranges.h" #include "region_map.h" #include "types.h" @@ -21,9 +22,6 @@ namespace celerity { namespace detail { - class raw_buffer_data; - - /** * The buffer_manager keeps track of all Celerity buffers currently existing within the runtime. * @@ -92,6 +90,7 @@ namespace detail { struct buffer_info { cl::sycl::range<3> range = {1, 1, 1}; + size_t element_size = 0; bool is_host_initialized; }; @@ -124,7 +123,7 @@ namespace detail { { std::unique_lock lock(mutex); bid = buffer_count++; - buffer_infos[bid] = buffer_info{range, is_host_initialized}; + buffer_infos[bid] = buffer_info{range, sizeof(DataT), is_host_initialized}; newest_data_location.emplace(bid, region_map(range, data_location::NOWHERE)); #if defined(CELERITY_DETAIL_ENABLE_DEBUG) @@ -179,12 +178,8 @@ namespace detail { * TODO: * - Ideally we would transfer data directly out of the original buffer (at least on the host, need RDMA otherwise). * - We'd have to consider the data striding in the MPI data type we build. - * - * @param bid - * @param offset - * @param range */ - raw_buffer_data get_buffer_data(buffer_id bid, const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range); + void get_buffer_data(buffer_id bid, const subrange<3>& sr, void* out_linearized); /** * Updates a buffer's content with the provided @p data. @@ -195,7 +190,7 @@ namespace detail { * - Host buffer might not be large enough. * - H->D transfers currently work better for contiguous copies. */ - void set_buffer_data(buffer_id bid, cl::sycl::id<3> offset, raw_buffer_data&& data); + void set_buffer_data(buffer_id bid, const subrange<3>& sr, unique_payload_ptr in_linearized); template access_info get_device_buffer( @@ -338,8 +333,8 @@ namespace detail { }; struct transfer { - raw_buffer_data data; - cl::sycl::id<3> target_offset; + unique_payload_ptr linearized; + subrange<3> sr; }; struct resize_info { diff --git a/include/buffer_storage.h b/include/buffer_storage.h index d551f1a73..999055b15 100644 --- a/include/buffer_storage.h +++ b/include/buffer_storage.h @@ -6,6 +6,7 @@ #include +#include "mpi_support.h" #include "ranges.h" #include "workaround.h" @@ -24,61 +25,7 @@ namespace detail { const cl::sycl::id<3>& source_offset, const cl::sycl::range<3>& target_range, const cl::sycl::id<3>& target_offset, const cl::sycl::range<3>& copy_range); - /** - * Dense, linearized host-side storage for buffer data. - */ - class raw_buffer_data { - public: - raw_buffer_data() {} - - raw_buffer_data(size_t elem_size, cl::sycl::range<3> range) : elem_size(elem_size), range(range) { - const size_t size = get_size(); - data = std::make_unique(size); - } - - raw_buffer_data(const raw_buffer_data&) = delete; - - raw_buffer_data(raw_buffer_data&& other) noexcept { *this = std::move(other); } - - raw_buffer_data& operator=(raw_buffer_data&& other) noexcept { - elem_size = other.elem_size; - range = other.range; - data = std::move(other.data); - return *this; - } - - /** - * Changes the element size and range of this buffer. - * Note that the resulting data size must remain the same as before. 
- */ - void reinterpret(size_t elem_size, cl::sycl::range<3> range) { - assert(elem_size * range.size() == this->elem_size * this->range.size()); - this->elem_size = elem_size; - this->range = range; - } - - /** - * Returns the pointer to the dense, linearized data location. - */ - void* get_pointer() const { return data.get(); } - - cl::sycl::range<3> get_range() const { return range; } - - /** - * Returns the data size, in bytes. - */ - size_t get_size() const { return elem_size * range.size(); } - - /** - * Copies the specified data subrange into a new (unstrided) raw_buffer_data instance. - */ - raw_buffer_data copy(cl::sycl::id<3> offset, cl::sycl::range<3> copy_range); - - private: - size_t elem_size = 0; - cl::sycl::range<3> range = {1, 1, 1}; - std::unique_ptr data; - }; + void linearize_subrange(const void* source_base_ptr, void* target_ptr, size_t elem_size, const range<3>& source_range, const subrange<3>& copy_sr); template using device_buffer = cl::sycl::buffer; @@ -86,7 +33,7 @@ namespace detail { template class host_buffer { public: - host_buffer(cl::sycl::range range) : range(range) { + explicit host_buffer(cl::sycl::range range) : range(range) { auto r3 = range_cast<3>(range); data = std::make_unique(r3[0] * r3[1] * r3[2]); } @@ -111,9 +58,9 @@ namespace detail { /** * @param range The size of the buffer */ - buffer_storage(cl::sycl::range<3> range, buffer_type type) : range(range), type(type) {} + buffer_storage(celerity::range<3> range, buffer_type type) : range(range), type(type) {} - cl::sycl::range<3> get_range() const { return range; } + celerity::range<3> get_range() const { return range; } buffer_type get_type() const { return type; } @@ -122,9 +69,9 @@ namespace detail { */ virtual size_t get_size() const = 0; - virtual raw_buffer_data get_data(const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range) const = 0; + virtual void get_data(const subrange<3>& sr, void* out_linearized) const = 0; - virtual void set_data(cl::sycl::id<3> offset, raw_buffer_data data) = 0; + virtual void set_data(const subrange<3>& sr, const void* in_linearized) = 0; /** * Convenience function to create new buffer_storages of the same (templated) type, useful in contexts where template type information is not available. @@ -172,59 +119,54 @@ namespace detail { const device_buffer& get_device_buffer() const { return device_buf; } - raw_buffer_data get_data(const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range) const override { - assert(Dims > 1 || (offset[1] == 0 && range[1] == 1)); - assert(Dims > 2 || (offset[2] == 0 && range[2] == 1)); + void get_data(const subrange<3>& sr, void* out_linearized) const override { + assert(Dims > 1 || (sr.offset[1] == 0 && sr.range[1] == 1)); + assert(Dims > 2 || (sr.offset[2] == 0 && sr.range[2] == 1)); - auto result = raw_buffer_data{sizeof(DataT), range}; auto buf = get_device_buffer(); // ComputeCpp (as of version 2.5.0) expects the target pointer of an explicit copy operation to have the same size as the buffer. // As a workaround, we copy the data manually using a kernel. 
#if WORKAROUND_COMPUTECPP - cl::sycl::buffer tmp_dst_buf(reinterpret_cast(result.get_pointer()), range_cast(range)); + cl::sycl::buffer tmp_dst_buf(static_cast(out_linearized), range_cast(sr.range)); auto event = transfer_queue.submit([&](cl::sycl::handler& cgh) { - const auto src_acc = buf.template get_access(cgh, range_cast(range), id_cast(offset)); + const auto src_acc = buf.template get_access(cgh, range_cast(sr.range), id_cast(sr.offset)); const auto dst_acc = tmp_dst_buf.template get_access(cgh); const auto src_buf_range = buf.get_range(); cgh.parallel_for>( - range_cast(range), [=](const sycl::id id) { dst_acc[id] = ranged_sycl_access(src_acc, src_buf_range, id); }); + range_cast(sr.range), [=](const sycl::id id) { dst_acc[id] = ranged_sycl_access(src_acc, src_buf_range, id); }); }); #else auto event = transfer_queue.submit([&](cl::sycl::handler& cgh) { - auto acc = buf.template get_access(cgh, range_cast(range), id_cast(offset)); - cgh.copy(acc, reinterpret_cast(result.get_pointer())); + auto acc = buf.template get_access(cgh, range_cast(sr.range), id_cast(sr.offset)); + cgh.copy(acc, static_cast(out_linearized)); }); #endif // TODO: Ideally we'd not wait here and instead return some sort of async handle that can be waited upon event.wait(); - - return result; } - void set_data(cl::sycl::id<3> offset, raw_buffer_data data) override { - assert(Dims > 1 || (offset[1] == 0 && data.get_range()[1] == 1)); - assert(Dims > 2 || (offset[2] == 0 && data.get_range()[2] == 1)); - assert(data.get_size() == data.get_range().size() * sizeof(DataT)); - assert(data.get_size() <= device_buf.get_range().size() * sizeof(DataT)); + void set_data(const subrange<3>& sr, const void* in_linearized) override { + assert(Dims > 1 || (sr.offset[1] == 0 && sr.range[1] == 1)); + assert(Dims > 2 || (sr.offset[2] == 0 && sr.range[2] == 1)); auto buf = get_device_buffer(); // See above for why this workaround is needed. 
#if WORKAROUND_COMPUTECPP - cl::sycl::buffer tmp_src_buf(reinterpret_cast(data.get_pointer()), range_cast(data.get_range())); + cl::sycl::buffer tmp_src_buf(static_cast(in_linearized), range_cast(sr.range)); auto event = transfer_queue.submit([&](cl::sycl::handler& cgh) { auto src_acc = tmp_src_buf.template get_access(cgh); - auto dst_acc = buf.template get_access(cgh, range_cast(data.get_range()), id_cast(offset)); + auto dst_acc = buf.template get_access(cgh, range_cast(sr.range), id_cast(sr.offset)); const auto dst_buf_range = buf.get_range(); cgh.parallel_for>( - range_cast(data.get_range()), [=](const sycl::id id) { ranged_sycl_access(dst_acc, dst_buf_range, id) = src_acc[id]; }); + range_cast(sr.range), [=](const sycl::id id) { ranged_sycl_access(dst_acc, dst_buf_range, id) = src_acc[id]; }); }); #else auto event = transfer_queue.submit([&](cl::sycl::handler& cgh) { - auto acc = buf.template get_access(cgh, range_cast(data.get_range()), id_cast(offset)); - cgh.copy(reinterpret_cast(data.get_pointer()), acc); + auto acc = buf.template get_access(cgh, range_cast(sr.range), id_cast(sr.offset)); + cgh.copy(static_cast(in_linearized), acc); }); #endif @@ -258,28 +200,24 @@ namespace detail { template class host_buffer_storage : public buffer_storage { public: - host_buffer_storage(cl::sycl::range range) : buffer_storage(range_cast<3>(range), buffer_type::HOST_BUFFER), host_buf(range) {} + explicit host_buffer_storage(cl::sycl::range range) : buffer_storage(range_cast<3>(range), buffer_type::HOST_BUFFER), host_buf(range) {} size_t get_size() const override { return get_range().size() * sizeof(DataT); }; - raw_buffer_data get_data(const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range) const override { - assert(Dims > 1 || (offset[1] == 0 && range[1] == 1)); - assert(Dims > 2 || (offset[2] == 0 && range[2] == 1)); + void get_data(const subrange<3>& sr, void* out_linearized) const override { + assert(Dims > 1 || (sr.offset[1] == 0 && sr.range[1] == 1)); + assert(Dims > 2 || (sr.offset[2] == 0 && sr.range[2] == 1)); - auto result = raw_buffer_data{sizeof(DataT), range}; - memcpy_strided(host_buf.get_pointer(), result.get_pointer(), sizeof(DataT), range_cast(host_buf.get_range()), id_cast(offset), - range_cast(range), id_cast(cl::sycl::id<3>{0, 0, 0}), range_cast(range)); - return result; + memcpy_strided(host_buf.get_pointer(), out_linearized, sizeof(DataT), range_cast(host_buf.get_range()), id_cast(sr.offset), + range_cast(sr.range), id_cast(cl::sycl::id<3>{0, 0, 0}), range_cast(sr.range)); } - void set_data(cl::sycl::id<3> offset, raw_buffer_data data) override { - assert(Dims > 1 || (offset[1] == 0 && data.get_range()[1] == 1)); - assert(Dims > 2 || (offset[2] == 0 && data.get_range()[2] == 1)); - assert(data.get_size() == data.get_range().size() * sizeof(DataT)); - assert(data.get_size() <= host_buf.get_range().size() * sizeof(DataT)); + void set_data(const subrange<3>& sr, const void* in_linearized) override { + assert(Dims > 1 || (sr.offset[1] == 0 && sr.range[1] == 1)); + assert(Dims > 2 || (sr.offset[2] == 0 && sr.range[2] == 1)); - memcpy_strided(reinterpret_cast(data.get_pointer()), host_buf.get_pointer(), sizeof(DataT), range_cast(data.get_range()), - id_cast(cl::sycl::id<3>(0, 0, 0)), range_cast(host_buf.get_range()), id_cast(offset), range_cast(data.get_range())); + memcpy_strided(in_linearized, host_buf.get_pointer(), sizeof(DataT), range_cast(sr.range), id_cast(cl::sycl::id<3>(0, 0, 0)), + range_cast(host_buf.get_range()), id_cast(sr.offset), range_cast(sr.range)); } 
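// Allocates an empty host_buffer_storage of the given range with the same DataT/Dims; see buffer_storage::make_new_of_same_type.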
buffer_storage* make_new_of_same_type(cl::sycl::range<3> range) const override { return new host_buffer_storage(range_cast(range)); } @@ -322,7 +260,9 @@ namespace detail { // TODO: Optimize for contiguous copies - we could do a single SYCL H->D copy directly. else if(source.get_type() == buffer_type::HOST_BUFFER) { auto& host_source = dynamic_cast&>(source); - set_data(target_offset, host_source.get_data(source_offset, copy_range)); + unique_payload_ptr tmp{unique_payload_ptr::allocate_uninitialized, copy_range.size()}; + host_source.get_data(subrange{source_offset, copy_range}, static_cast(tmp.get_pointer())); + set_data(subrange{target_offset, copy_range}, static_cast(tmp.get_pointer())); } else { @@ -337,8 +277,10 @@ namespace detail { // TODO: Optimize for contiguous copies - we could do a single SYCL D->H copy directly. if(source.get_type() == buffer_type::DEVICE_BUFFER) { - auto data = source.get_data(source_offset, copy_range); - set_data(target_offset, std::move(data)); + // This looks more convoluted than using a vector, but that would break if DataT == bool + unique_payload_ptr tmp{unique_payload_ptr::allocate_uninitialized, copy_range.size()}; + source.get_data(subrange{source_offset, copy_range}, static_cast(tmp.get_pointer())); + set_data(subrange{target_offset, copy_range}, static_cast(tmp.get_pointer())); } else if(source.get_type() == buffer_type::HOST_BUFFER) { diff --git a/include/mpi_support.h b/include/mpi_support.h index 81a6b946c..af07daa35 100644 --- a/include/mpi_support.h +++ b/include/mpi_support.h @@ -28,6 +28,8 @@ class unique_frame_ptr : private std::unique_ptr { private: using impl = std::unique_ptr; + friend class unique_payload_ptr; + public: using payload_type = typename Frame::payload_type; @@ -74,4 +76,29 @@ class unique_frame_ptr : private std::unique_ptr { } }; +class unique_payload_ptr : private std::unique_ptr> { + private: + using impl = std::unique_ptr>; + + public: + template + struct allocate_uninitialized_tag {}; + + template + inline static constexpr allocate_uninitialized_tag allocate_uninitialized; + + unique_payload_ptr() noexcept = default; + + template + explicit unique_payload_ptr(allocate_uninitialized_tag, size_t count) : impl(operator new(count * sizeof(T)), [](void* p) { operator delete(p); }) {} + + template + explicit unique_payload_ptr(unique_frame_ptr frame) : impl(frame.release() + 1, [](void* p) { delete(static_cast(p) - 1); }) {} + + void* get_pointer() { return impl::get(); } + const void* get_pointer() const { return impl::get(); } + + using impl::operator bool; +}; + } // namespace celerity::detail diff --git a/include/reduction_manager.h b/include/reduction_manager.h index 63d755129..f0425badc 100644 --- a/include/reduction_manager.h +++ b/include/reduction_manager.h @@ -19,7 +19,7 @@ namespace detail { explicit abstract_buffer_reduction(buffer_id bid, bool include_current_buffer_value) : info{bid, include_current_buffer_value} {} virtual ~abstract_buffer_reduction() = default; - void push_overlapping_data(node_id source_nid, raw_buffer_data data) { overlapping_data.emplace_back(source_nid, std::move(data)); } + void push_overlapping_data(node_id source_nid, unique_payload_ptr data) { overlapping_data.emplace_back(source_nid, std::move(data)); } virtual void reduce_to_buffer() = 0; @@ -27,7 +27,7 @@ namespace detail { protected: reduction_info info; - std::vector> overlapping_data; + std::vector> overlapping_data; }; template @@ -41,9 +41,7 @@ namespace detail { DataT acc = init; for(auto& [nid, data] : overlapping_data) { 
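// Each entry holds a single linearized element received from one remote node; fold it into the accumulator.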
- assert(data.get_range() == cl::sycl::range<3>(1, 1, 1)); - DataT other = *static_cast(data.get_pointer()); - acc = op(acc, other); + acc = op(acc, *static_cast(data.get_pointer())); } auto host_buf = runtime::get_instance().get_buffer_manager().get_host_buffer( @@ -76,7 +74,7 @@ namespace detail { return reductions.at(rid)->get_info(); } - void push_overlapping_reduction_data(reduction_id rid, node_id source_nid, raw_buffer_data data) { + void push_overlapping_reduction_data(reduction_id rid, node_id source_nid, unique_payload_ptr data) { std::lock_guard lock{mutex}; reductions.at(rid)->push_overlapping_data(source_nid, std::move(data)); } diff --git a/src/buffer_manager.cc b/src/buffer_manager.cc index 39588df05..a56800a9a 100644 --- a/src/buffer_manager.cc +++ b/src/buffer_manager.cc @@ -30,10 +30,10 @@ namespace detail { lifecycle_cb(buffer_lifecycle_event::UNREGISTERED, bid); } - raw_buffer_data buffer_manager::get_buffer_data(buffer_id bid, const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range) { + void buffer_manager::get_buffer_data(buffer_id bid, const subrange<3>& sr, void* out_linearized) { std::unique_lock lock(mutex); assert(buffers.count(bid) == 1 && (buffers.at(bid).device_buf.is_allocated() || buffers.at(bid).host_buf.is_allocated())); - auto data_locations = newest_data_location.at(bid).get_region_values(subrange_to_grid_box(subrange<3>(offset, range))); + auto data_locations = newest_data_location.at(bid).get_region_values(subrange_to_grid_box(sr)); // Slow path: We need to obtain current data from both host and device. if(data_locations.size() > 1) { @@ -42,34 +42,33 @@ namespace detail { // Make sure newest data resides on the host. // But first, we need to check whether the current host buffer is able to hold the full data range. - const auto info = is_resize_required(existing_buf, range, offset); + const auto info = is_resize_required(existing_buf, sr.range, sr.offset); backing_buffer replacement_buf; if(info.resize_required) { - // TODO: Do we really want to allocate host memory for this..? We could also make raw_buffer_data "coherent" directly. + // TODO: Do we really want to allocate host memory for this..? We could also make the buffer storage "coherent" directly. replacement_buf = backing_buffer{std::unique_ptr(existing_buf.storage->make_new_of_same_type(info.new_range)), info.new_offset}; } - existing_buf = make_buffer_subrange_coherent(bid, access_mode::read, std::move(existing_buf), {offset, range}, std::move(replacement_buf)); + existing_buf = make_buffer_subrange_coherent(bid, access_mode::read, std::move(existing_buf), sr, std::move(replacement_buf)); - data_locations = {{subrange_to_grid_box(subrange<3>(offset, range)), data_location::HOST}}; + data_locations = {{subrange_to_grid_box(sr), data_location::HOST}}; } // get_buffer_data will race with pending transfers for the same subrange. In case there are pending transfers and a host buffer does not exist yet, // these transfers cannot easily be flushed here as creating a host buffer requires a templated context that knows about DataT. 
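// For now we simply assert that no scheduled transfer overlaps the requested subrange.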
- assert(std::none_of(scheduled_transfers[bid].begin(), scheduled_transfers[bid].end(), [&](const transfer& t) { - return subrange_to_grid_box({offset, range}).intersectsWith(subrange_to_grid_box({t.target_offset, t.data.get_range()})); - })); + assert(std::none_of(scheduled_transfers[bid].begin(), scheduled_transfers[bid].end(), + [&](const transfer& t) { return subrange_to_grid_box(sr).intersectsWith(subrange_to_grid_box(t.sr)); })); if(data_locations[0].second == data_location::HOST || data_locations[0].second == data_location::HOST_AND_DEVICE) { - return buffers.at(bid).host_buf.storage->get_data(buffers.at(bid).host_buf.get_local_offset(offset), range); + return buffers.at(bid).host_buf.storage->get_data({buffers.at(bid).host_buf.get_local_offset(sr.offset), sr.range}, out_linearized); } - return buffers.at(bid).device_buf.storage->get_data(buffers.at(bid).device_buf.get_local_offset(offset), range); + return buffers.at(bid).device_buf.storage->get_data({buffers.at(bid).device_buf.get_local_offset(sr.offset), sr.range}, out_linearized); } - void buffer_manager::set_buffer_data(buffer_id bid, cl::sycl::id<3> offset, raw_buffer_data&& data) { + void buffer_manager::set_buffer_data(buffer_id bid, const subrange<3>& sr, unique_payload_ptr in_linearized) { std::unique_lock lock(mutex); assert(buffer_infos.count(bid) == 1); - scheduled_transfers[bid].push_back({std::move(data), offset}); + scheduled_transfers[bid].push_back({std::move(in_linearized), sr}); } bool buffer_manager::try_lock(buffer_lock_id id, const std::unordered_set& buffers) { @@ -150,7 +149,7 @@ namespace detail { auto& scheduled_buffer_transfers = scheduled_transfers[bid]; remaining_transfers.reserve(scheduled_buffer_transfers.size() / 2); for(auto& t : scheduled_buffer_transfers) { - auto t_region = subrange_to_grid_box({t.target_offset, t.data.get_range()}); + auto t_region = subrange_to_grid_box(t.sr); // Check whether this transfer applies to the current request. auto t_minus_coherent_region = GridRegion<3>::difference(t_region, coherent_box); @@ -167,10 +166,13 @@ namespace detail { assert(detail::access::mode_traits::is_consumer(mode)); auto intersection = GridRegion<3>::intersect(t_region, coherent_box); remaining_region_after_transfers = GridRegion<3>::difference(remaining_region_after_transfers, intersection); + const auto element_size = buffer_infos.at(bid).element_size; intersection.scanByBoxes([&](const GridBox<3>& box) { auto sr = grid_box_to_subrange(box); - auto partial_t = t.data.copy(sr.offset - t.target_offset, sr.range); - target_buffer.storage->set_data(target_buffer.get_local_offset(sr.offset), std::move(partial_t)); + // TODO can this temp buffer be avoided? + unique_payload_ptr tmp{unique_payload_ptr::allocate_uninitialized, sr.range.size() * element_size}; + linearize_subrange(t.linearized.get_pointer(), tmp.get_pointer(), element_size, t.sr.range, {sr.offset - t.sr.offset, sr.range}); + target_buffer.storage->set_data({target_buffer.get_local_offset(sr.offset), sr.range}, tmp.get_pointer()); updated_region = GridRegion<3>::merge(updated_region, box); }); } @@ -182,7 +184,7 @@ namespace detail { // Transfer applies fully. 
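// The linearized payload covers exactly the transfer's subrange, so it can be handed to set_data without an intermediate copy.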
assert(detail::access::mode_traits::is_consumer(mode)); remaining_region_after_transfers = GridRegion<3>::difference(remaining_region_after_transfers, t_region); - target_buffer.storage->set_data(target_buffer.get_local_offset(t.target_offset), std::move(t.data)); + target_buffer.storage->set_data({target_buffer.get_local_offset(t.sr.offset), t.sr.range}, t.linearized.get_pointer()); updated_region = GridRegion<3>::merge(updated_region, t_region); } // The target buffer now has the newest data in this region. diff --git a/src/buffer_storage.cc b/src/buffer_storage.cc index 720b97c64..fab1e7b97 100644 --- a/src/buffer_storage.cc +++ b/src/buffer_storage.cc @@ -7,8 +7,8 @@ namespace detail { const cl::sycl::id<1>& source_offset, const cl::sycl::range<1>& target_range, const cl::sycl::id<1>& target_offset, const cl::sycl::range<1>& copy_range) { const size_t line_size = elem_size * copy_range[0]; - std::memcpy(reinterpret_cast(target_base_ptr) + elem_size * get_linear_index(target_range, target_offset), - reinterpret_cast(source_base_ptr) + elem_size * get_linear_index(source_range, source_offset), line_size); + std::memcpy(static_cast(target_base_ptr) + elem_size * get_linear_index(target_range, target_offset), + static_cast(source_base_ptr) + elem_size * get_linear_index(source_range, source_offset), line_size); } // TODO Optimize for contiguous copies? @@ -19,8 +19,8 @@ namespace detail { const auto source_base_offset = get_linear_index(source_range, source_offset); const auto target_base_offset = get_linear_index(target_range, target_offset); for(size_t i = 0; i < copy_range[0]; ++i) { - std::memcpy(reinterpret_cast(target_base_ptr) + elem_size * (target_base_offset + i * target_range[1]), - reinterpret_cast(source_base_ptr) + elem_size * (source_base_offset + i * source_range[1]), line_size); + std::memcpy(static_cast(target_base_ptr) + elem_size * (target_base_offset + i * target_range[1]), + static_cast(source_base_ptr) + elem_size * (source_base_offset + i * source_range[1]), line_size); } } @@ -34,31 +34,29 @@ namespace detail { const auto target_base_offset = get_linear_index(target_range, target_offset) - get_linear_index(cl::sycl::range<2>{target_range[1], target_range[2]}, {target_offset[1], target_offset[2]}); for(size_t i = 0; i < copy_range[0]; ++i) { - const auto source_ptr = reinterpret_cast(source_base_ptr) + elem_size * (source_base_offset + i * (source_range[1] * source_range[2])); - const auto target_ptr = reinterpret_cast(target_base_ptr) + elem_size * (target_base_offset + i * (target_range[1] * target_range[2])); + const auto source_ptr = static_cast(source_base_ptr) + elem_size * (source_base_offset + i * (source_range[1] * source_range[2])); + const auto target_ptr = static_cast(target_base_ptr) + elem_size * (target_base_offset + i * (target_range[1] * target_range[2])); memcpy_strided(source_ptr, target_ptr, elem_size, cl::sycl::range<2>{source_range[1], source_range[2]}, {source_offset[1], source_offset[2]}, {target_range[1], target_range[2]}, {target_offset[1], target_offset[2]}, {copy_range[1], copy_range[2]}); } } - raw_buffer_data raw_buffer_data::copy(cl::sycl::id<3> offset, cl::sycl::range<3> copy_range) { - assert((id_cast<3>(offset) < id_cast<3>(range)) == cl::sycl::id<3>(1, 1, 1)); - assert((id_cast<3>(offset + copy_range) <= id_cast<3>(range)) == cl::sycl::id<3>(1, 1, 1)); - raw_buffer_data result(elem_size, range_cast<3>(copy_range)); + void linearize_subrange(const void* source_base_ptr, void* target_ptr, size_t elem_size, const range<3>& 
source_range, const subrange<3>& copy_sr) { + assert((id_cast<3>(copy_sr.offset) < id_cast<3>(source_range)) == cl::sycl::id<3>(1, 1, 1)); + assert((id_cast<3>(copy_sr.offset + copy_sr.range) <= id_cast<3>(source_range)) == cl::sycl::id<3>(1, 1, 1)); - if(range[2] == 1) { - if(range[1] == 1) { - memcpy_strided(data.get(), result.get_pointer(), elem_size, range_cast<1>(range), range_cast<1>(offset), range_cast<1>(copy_range), - cl::sycl::id<1>(0), range_cast<1>(copy_range)); + if(source_range[2] == 1) { + if(source_range[1] == 1) { + memcpy_strided(source_base_ptr, target_ptr, elem_size, range_cast<1>(source_range), range_cast<1>(copy_sr.offset), range_cast<1>(copy_sr.range), + cl::sycl::id<1>(0), range_cast<1>(copy_sr.range)); } else { - memcpy_strided(data.get(), result.get_pointer(), elem_size, range_cast<2>(range), range_cast<2>(offset), range_cast<2>(copy_range), - cl::sycl::id<2>(0, 0), range_cast<2>(copy_range)); + memcpy_strided(source_base_ptr, target_ptr, elem_size, range_cast<2>(source_range), range_cast<2>(copy_sr.offset), range_cast<2>(copy_sr.range), + cl::sycl::id<2>(0, 0), range_cast<2>(copy_sr.range)); } } else { - memcpy_strided(data.get(), result.get_pointer(), elem_size, range_cast<3>(range), offset, copy_range, cl::sycl::id<3>(0, 0, 0), copy_range); + memcpy_strided( + source_base_ptr, target_ptr, elem_size, range_cast<3>(source_range), copy_sr.offset, copy_sr.range, cl::sycl::id<3>(0, 0, 0), copy_sr.range); } - - return result; } } // namespace detail diff --git a/src/buffer_transfer_manager.cc b/src/buffer_transfer_manager.cc index 60ffcce96..05917bd63 100644 --- a/src/buffer_transfer_manager.cc +++ b/src/buffer_transfer_manager.cc @@ -20,16 +20,15 @@ namespace detail { // --> This probably needs some kind of heuristic, as for small (e.g. ghost cell) transfers the overhead of threading is way too big const push_data& data = std::get(pkg.data); - const auto raw_data = - runtime::get_instance().get_buffer_manager().get_buffer_data(data.bid, cl::sycl::range<3>(data.sr.offset[0], data.sr.offset[1], data.sr.offset[2]), - cl::sycl::range<3>(data.sr.range[0], data.sr.range[1], data.sr.range[2])); + auto& bm = runtime::get_instance().get_buffer_manager(); + const auto element_size = bm.get_buffer_info(data.bid).element_size; - unique_frame_ptr frame(from_payload_size, raw_data.get_size()); + unique_frame_ptr frame(from_payload_size, data.sr.range.size() * element_size); frame->sr = data.sr; frame->bid = data.bid; frame->rid = data.rid; frame->push_cid = pkg.cid; - memcpy(frame->data, raw_data.get_pointer(), raw_data.get_size()); + bm.get_buffer_data(data.bid, data.sr, frame->data); CELERITY_TRACE("Ready to send {} of buffer {} ({} B) to {}", data.sr, data.bid, frame.get_payload_size(), data.target); @@ -145,21 +144,19 @@ namespace detail { void buffer_transfer_manager::commit_transfer(transfer_in& transfer) { const auto& frame = *transfer.frame; - const size_t elem_size = transfer.frame.get_payload_size() / (frame.sr.range[0] * frame.sr.range[1] * frame.sr.range[2]); - raw_buffer_data raw_data{elem_size, frame.sr.range}; - memcpy(raw_data.get_pointer(), frame.data, raw_data.get_size()); + unique_payload_ptr payload{std::move(transfer.frame)}; + if(frame.rid) { auto& rm = runtime::get_instance().get_reduction_manager(); // In some rare situations the local runtime might not yet know about this reduction. Busy wait until it does. 
while(!rm.has_reduction(frame.rid)) {} - rm.push_overlapping_reduction_data(frame.rid, transfer.source_nid, std::move(raw_data)); + rm.push_overlapping_reduction_data(frame.rid, transfer.source_nid, std::move(payload)); } else { auto& bm = runtime::get_instance().get_buffer_manager(); // In some rare situations the local runtime might not yet know about this buffer. Busy wait until it does. while(!bm.has_buffer(frame.bid)) {} - bm.set_buffer_data(frame.bid, frame.sr.offset, std::move(raw_data)); + bm.set_buffer_data(frame.bid, frame.sr, std::move(payload)); } - transfer.frame = {}; } } // namespace detail diff --git a/src/worker_job.cc b/src/worker_job.cc index 1ff4c3931..067749d6e 100644 --- a/src/worker_job.cc +++ b/src/worker_job.cc @@ -211,7 +211,10 @@ namespace detail { auto tsk = task_mngr.get_task(data.tid); for(auto rid : tsk->get_reductions()) { auto reduction = reduction_mngr.get_reduction(rid); - reduction_mngr.push_overlapping_reduction_data(rid, local_nid, buffer_mngr.get_buffer_data(reduction.output_buffer_id, {}, {1, 1, 1})); + const auto element_size = buffer_mngr.get_buffer_info(reduction.output_buffer_id).element_size; + unique_payload_ptr operand{unique_payload_ptr::allocate_uninitialized, element_size}; + buffer_mngr.get_buffer_data(reduction.output_buffer_id, {{}, {1, 1, 1}}, operand.get_pointer()); + reduction_mngr.push_overlapping_reduction_data(rid, local_nid, std::move(operand)); } if(queue.is_profiling_enabled()) { diff --git a/test/buffer_manager_tests.cc b/test/buffer_manager_tests.cc index 111268356..617f91a72 100644 --- a/test/buffer_manager_tests.cc +++ b/test/buffer_manager_tests.cc @@ -583,10 +583,9 @@ namespace detail { bid, get_other_target(tgt), {64}, {0}, [](cl::sycl::id<1>, size_t& value) { value = 33; }); // Add transfer for second half on this side - std::vector data(32, 77); - auto transfer = raw_buffer_data{sizeof(size_t), cl::sycl::range<3>(32, 1, 1)}; - std::memcpy(transfer.get_pointer(), data.data(), sizeof(size_t) * data.size()); - bm.set_buffer_data(bid, cl::sycl::id<3>(32, 0, 0), std::move(transfer)); + unique_payload_ptr data{unique_payload_ptr::allocate_uninitialized, 32}; + std::uninitialized_fill_n(static_cast(data.get_pointer()), 32, size_t{77}); + bm.set_buffer_data(bid, {{32, 0, 0}, {32, 1, 1}}, std::move(data)); // Check that transfer has been correctly ingested { @@ -619,10 +618,9 @@ namespace detail { // Set full range to new value. { - std::vector other(128, 77); - auto data = raw_buffer_data{sizeof(size_t), cl::sycl::range<3>(128, 1, 1)}; - std::memcpy(data.get_pointer(), other.data(), sizeof(size_t) * other.size()); - bm.set_buffer_data(bid, cl::sycl::id<3>(0, 0, 0), std::move(data)); + unique_payload_ptr data{unique_payload_ptr::allocate_uninitialized, 128}; + std::uninitialized_fill_n(static_cast(data.get_pointer()), 128, size_t{77}); + bm.set_buffer_data(bid, {{0, 0, 0}, {128, 1, 1}}, std::move(data)); } // Now read full range. @@ -653,10 +651,9 @@ namespace detail { // Set data that only partially overlaps with currently allocated range. 
{ - std::vector init(64, 99); - auto data = raw_buffer_data{sizeof(size_t), cl::sycl::range<3>(64, 1, 1)}; - std::memcpy(data.get_pointer(), init.data(), sizeof(size_t) * init.size()); - bm.set_buffer_data(bid, cl::sycl::id<3>(32, 0, 0), std::move(data)); + unique_payload_ptr data{unique_payload_ptr::allocate_uninitialized, 64}; + std::uninitialized_fill_n(static_cast(data.get_pointer()), 64, size_t{99}); + bm.set_buffer_data(bid, {{32, 0, 0}, {64, 1, 1}}, std::move(data)); } // Check that second half of buffer has been updated... @@ -690,9 +687,10 @@ namespace detail { bid, get_other_target(tgt), {32}, {0}, [](cl::sycl::id<1> idx, size_t& value) { value = idx[0]; }); buffer_for_each( bid, tgt, {32}, {0}, [](cl::sycl::id<1> idx, size_t& value) { value += 1; }); - auto data = bm.get_buffer_data(bid, {0, 0, 0}, {32, 1, 1}); + std::vector data(32); + bm.get_buffer_data(bid, {{0, 0, 0}, {32, 1, 1}}, data.data()); for(size_t i = 0; i < 32; ++i) { - REQUIRE_LOOP(reinterpret_cast(data.get_pointer())[i] == i + 1); + REQUIRE_LOOP(data[i] == i + 1); } }; @@ -704,9 +702,10 @@ namespace detail { bid, access_target::DEVICE, {16}, {0}, [](cl::sycl::id<1> idx, size_t& value) { value = idx[0]; }); buffer_for_each( bid, access_target::HOST, {16}, {16}, [](cl::sycl::id<1> idx, size_t& value) { value = idx[0] * 2; }); - auto data = bm.get_buffer_data(bid, {0, 0, 0}, {32, 1, 1}); + std::vector data(32); + bm.get_buffer_data(bid, {{0, 0, 0}, {32, 1, 1}}, data.data()); for(size_t i = 0; i < 32; ++i) { - REQUIRE_LOOP(reinterpret_cast(data.get_pointer())[i] == (i < 16 ? i : 2 * i)); + REQUIRE_LOOP(data[i] == (i < 16 ? i : 2 * i)); } } } diff --git a/test/runtime_tests.cc b/test/runtime_tests.cc index 9d28b342b..7dfeba440 100644 --- a/test/runtime_tests.cc +++ b/test/runtime_tests.cc @@ -428,49 +428,31 @@ namespace detail { } } - TEST_CASE("raw_buffer_data works as expected") { + TEST_CASE("linearize_subrange works as expected") { const cl::sycl::range<3> data1_range{3, 5, 7}; - raw_buffer_data data1{sizeof(size_t), data1_range}; - REQUIRE(data1.get_range() == data1_range); - REQUIRE(data1.get_pointer() != nullptr); - REQUIRE(data1.get_size() == sizeof(size_t) * data1_range.size()); + std::vector data1(data1_range.size()); for(size_t i = 0; i < data1_range[0]; ++i) { for(size_t j = 0; j < data1_range[1]; ++j) { for(size_t k = 0; k < data1_range[2]; ++k) { - reinterpret_cast(data1.get_pointer())[i * data1_range[1] * data1_range[2] + j * data1_range[2] + k] = i * 100 + j * 10 + k; + data1[i * data1_range[1] * data1_range[2] + j * data1_range[2] + k] = i * 100 + j * 10 + k; } } } const cl::sycl::range<3> data2_range{2, 2, 4}; const cl::sycl::id<3> data2_offset{1, 2, 2}; - auto data2 = data1.copy(data2_offset, data2_range); - REQUIRE(data2.get_range() == data2_range); - REQUIRE(data2.get_pointer() != nullptr); - REQUIRE(data2.get_pointer() != data1.get_pointer()); - REQUIRE(data2.get_size() == sizeof(size_t) * data2_range.size()); + std::vector data2(data2_range.size()); + linearize_subrange(data1.data(), data2.data(), sizeof(size_t), data1_range, {data2_offset, data2_range}); for(size_t i = 0; i < 2; ++i) { for(size_t j = 0; j < 2; ++j) { for(size_t k = 0; k < 4; ++k) { - REQUIRE_LOOP(reinterpret_cast(data2.get_pointer())[i * data2_range[1] * data2_range[2] + j * data2_range[2] + k] + REQUIRE_LOOP(data2[i * data2_range[1] * data2_range[2] + j * data2_range[2] + k] == (i + data2_offset[0]) * 100 + (j + data2_offset[1]) * 10 + (k + data2_offset[2])); } } } - - const auto data2_ptr = data2.get_pointer(); - auto data3 = 
std::move(data2); - REQUIRE(data2.get_pointer() == nullptr); - REQUIRE(data3.get_range() == data2_range); - REQUIRE(data3.get_pointer() == data2_ptr); - REQUIRE(data3.get_size() == sizeof(size_t) * data2_range.size()); - - raw_buffer_data data4{sizeof(uint64_t), {16, 8, 4}}; - data4.reinterpret(sizeof(uint32_t), {32, 8, 4}); - REQUIRE(data4.get_range() == cl::sycl::range<3>{32, 8, 4}); - REQUIRE(data4.get_size() == sizeof(uint64_t) * 16 * 8 * 4); } TEST_CASE_METHOD(test_utils::runtime_fixture, "collective host_task produces one item per rank", "[task]") {
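
Usage sketch (illustrative only, not part of the patch): the snippet below shows how the typeless transfer path introduced above fits together, written against the signatures in this diff. Assumptions: `bm` is an existing buffer_manager, `bid` refers to an already-registered buffer of at least 64 size_t elements whose backing allocation already exists (e.g. after a previous access), the function name `payload_roundtrip_sketch` is a placeholder, and the template parameter of allocate_uninitialized (stripped in the hunks above) is taken to be the payload's element type.

#include <memory>
#include <vector>

#include "buffer_manager.h" // set_buffer_data / get_buffer_data
#include "buffer_storage.h" // linearize_subrange
#include "mpi_support.h"    // unique_payload_ptr

using namespace celerity;
using namespace celerity::detail;

void payload_roundtrip_sketch(buffer_manager& bm, buffer_id bid) {
	// Producer side: schedule an incoming transfer for the second half of the buffer.
	// The dense, type-erased payload is ingested the next time that subrange is accessed.
	const subrange<3> upper{{32, 0, 0}, {32, 1, 1}};
	unique_payload_ptr payload{unique_payload_ptr::allocate_uninitialized<size_t>, upper.range.size()};
	std::uninitialized_fill_n(static_cast<size_t*>(payload.get_pointer()), upper.range.size(), size_t{42});
	bm.set_buffer_data(bid, upper, std::move(payload)); // ownership moves into the scheduled transfer

	// Consumer side: the caller owns the output allocation; get_buffer_data fills it densely.
	// A disjoint subrange is used here because get_buffer_data asserts that no pending
	// transfer overlaps the requested subrange.
	const subrange<3> lower{{0, 0, 0}, {32, 1, 1}};
	std::vector<size_t> out(lower.range.size());
	bm.get_buffer_data(bid, lower, out.data());

	// linearize_subrange extracts a dense copy of a subrange from a larger linearized block,
	// here the middle 16 of the 32 elements read above.
	std::vector<size_t> middle(16);
	linearize_subrange(out.data(), middle.data(), sizeof(size_t), range<3>{32, 1, 1}, subrange<3>{{8, 0, 0}, {16, 1, 1}});
}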