Replace raw_buffer_storage with unique_payload_ptr / void* to avoid copies
fknorr committed Apr 28, 2022
1 parent 97849bb commit 986e449
Showing 10 changed files with 148 additions and 205 deletions.
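
In broad strokes, this commit inverts the allocation responsibility for linearized buffer data: rather than having get_buffer_data allocate and return an owning raw_buffer_data (which then had to be copied into MPI frames), the caller now allocates the destination once and the storage layer linearizes directly into it. A rough sketch of the resulting call shape - bm is a placeholder for a buffer_manager reference, and the element type is assumed purely for illustration:

    // Caller allocates a type-erased payload sized for the subrange...
    unique_payload_ptr payload{unique_payload_ptr::allocate_uninitialized<float>, sr.range.size()};
    // ...the buffer manager linearizes straight into it...
    bm.get_buffer_data(bid, sr, payload.get_pointer());
    // ...and ownership moves wholesale when applying data to a buffer.
    bm.set_buffer_data(bid, sr, std::move(payload));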
19 changes: 7 additions & 12 deletions include/buffer_manager.h
@@ -14,16 +14,14 @@
 #include "access_modes.h"
 #include "buffer_storage.h"
 #include "device_queue.h"
-#include "mpi_support.h"
 #include "ranges.h"
 #include "region_map.h"
 #include "types.h"
 
 namespace celerity {
 namespace detail {
 
-class raw_buffer_data;
-
 
    /**
     * The buffer_manager keeps track of all Celerity buffers currently existing within the runtime.
     *
@@ -92,6 +90,7 @@ namespace detail {
 
        struct buffer_info {
            cl::sycl::range<3> range = {1, 1, 1};
+           size_t element_size = 0;
            bool is_host_initialized;
        };
 
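buffer_info gains an element_size field because the element size previously travelled inside each raw_buffer_data instance; with type-erased void* payloads, the manager itself has to remember how many bytes an element occupies. A hypothetical use, sizing an allocation for an incoming transfer (illustrative, not code from this commit):

    const auto& info = buffer_infos.at(bid);
    unique_payload_ptr payload{unique_payload_ptr::allocate_uninitialized<std::byte>, info.element_size * sr.range.size()};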
@@ -124,7 +123,7 @@ namespace detail {
            {
                std::unique_lock lock(mutex);
                bid = buffer_count++;
-               buffer_infos[bid] = buffer_info{range, is_host_initialized};
+               buffer_infos[bid] = buffer_info{range, sizeof(DataT), is_host_initialized};
                newest_data_location.emplace(bid, region_map<data_location>(range, data_location::NOWHERE));
 
 #if defined(CELERITY_DETAIL_ENABLE_DEBUG)
@@ -179,12 +178,8 @@ namespace detail {
         * TODO:
         * - Ideally we would transfer data directly out of the original buffer (at least on the host, need RDMA otherwise).
         * - We'd have to consider the data striding in the MPI data type we build.
-        *
-        * @param bid
-        * @param offset
-        * @param range
         */
-       raw_buffer_data get_buffer_data(buffer_id bid, const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range);
+       void get_buffer_data(buffer_id bid, const subrange<3>& sr, void* out_linearized);
 
        /**
         * Updates a buffer's content with the provided @p data.
@@ -195,7 +190,7 @@ namespace detail {
         * - Host buffer might not be large enough.
         * - H->D transfers currently work better for contiguous copies.
         */
-       void set_buffer_data(buffer_id bid, cl::sycl::id<3> offset, raw_buffer_data&& data);
+       void set_buffer_data(buffer_id bid, const subrange<3>& sr, unique_payload_ptr in_linearized);
 
        template <typename DataT, int Dims>
        access_info<DataT, Dims, device_buffer> get_device_buffer(
@@ -338,8 +333,8 @@ namespace detail {
        };
 
        struct transfer {
-           raw_buffer_data data;
-           cl::sycl::id<3> target_offset;
+           unique_payload_ptr linearized;
+           subrange<3> sr;
        };
 
        struct resize_info {
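Note that the transfer struct now carries a full subrange<3> rather than only a target offset: the extents previously lived in raw_buffer_data::get_range(), so with that class gone the range must travel explicitly next to the type-erased payload. A hypothetical construction site (names invented for illustration):

    // Pair a payload received over MPI with the region it belongs to:
    transfer t{std::move(incoming_payload), subrange<3>{target_offset, transfer_range}};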
140 changes: 41 additions & 99 deletions include/buffer_storage.h
@@ -6,6 +6,7 @@
 
 #include <CL/sycl.hpp>
 
+#include "mpi_support.h"
 #include "ranges.h"
 #include "workaround.h"
 
@@ -24,69 +25,15 @@ namespace detail {
        const cl::sycl::id<3>& source_offset, const cl::sycl::range<3>& target_range, const cl::sycl::id<3>& target_offset,
        const cl::sycl::range<3>& copy_range);
 
-   /**
-    * Dense, linearized host-side storage for buffer data.
-    */
-   class raw_buffer_data {
-     public:
-       raw_buffer_data() {}
-
-       raw_buffer_data(size_t elem_size, cl::sycl::range<3> range) : elem_size(elem_size), range(range) {
-           const size_t size = get_size();
-           data = std::make_unique<unsigned char[]>(size);
-       }
-
-       raw_buffer_data(const raw_buffer_data&) = delete;
-
-       raw_buffer_data(raw_buffer_data&& other) noexcept { *this = std::move(other); }
-
-       raw_buffer_data& operator=(raw_buffer_data&& other) noexcept {
-           elem_size = other.elem_size;
-           range = other.range;
-           data = std::move(other.data);
-           return *this;
-       }
-
-       /**
-        * Changes the element size and range of this buffer.
-        * Note that the resulting data size must remain the same as before.
-        */
-       void reinterpret(size_t elem_size, cl::sycl::range<3> range) {
-           assert(elem_size * range.size() == this->elem_size * this->range.size());
-           this->elem_size = elem_size;
-           this->range = range;
-       }
-
-       /**
-        * Returns the pointer to the dense, linearized data location.
-        */
-       void* get_pointer() const { return data.get(); }
-
-       cl::sycl::range<3> get_range() const { return range; }
-
-       /**
-        * Returns the data size, in bytes.
-        */
-       size_t get_size() const { return elem_size * range.size(); }
-
-       /**
-        * Copies the specified data subrange into a new (unstrided) raw_buffer_data instance.
-        */
-       raw_buffer_data copy(cl::sycl::id<3> offset, cl::sycl::range<3> copy_range);
-
-     private:
-       size_t elem_size = 0;
-       cl::sycl::range<3> range = {1, 1, 1};
-       std::unique_ptr<unsigned char[]> data;
-   };
+   void linearize_subrange(const void* source_base_ptr, void* target_ptr, size_t elem_size, const range<3>& source_range, const subrange<3>& copy_sr);
 
    template <typename DataT, int Dims>
    using device_buffer = cl::sycl::buffer<DataT, Dims>;
 
    template <typename DataT, int Dims>
    class host_buffer {
      public:
-       host_buffer(cl::sycl::range<Dims> range) : range(range) {
+       explicit host_buffer(cl::sycl::range<Dims> range) : range(range) {
            auto r3 = range_cast<3>(range);
            data = std::make_unique<DataT[]>(r3[0] * r3[1] * r3[2]);
        }
@@ -111,9 +58,9 @@ namespace detail {
        /**
         * @param range The size of the buffer
         */
-       buffer_storage(cl::sycl::range<3> range, buffer_type type) : range(range), type(type) {}
+       buffer_storage(celerity::range<3> range, buffer_type type) : range(range), type(type) {}
 
-       cl::sycl::range<3> get_range() const { return range; }
+       celerity::range<3> get_range() const { return range; }
 
        buffer_type get_type() const { return type; }
 
@@ -122,9 +69,9 @@ namespace detail {
         */
        virtual size_t get_size() const = 0;
 
-       virtual raw_buffer_data get_data(const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range) const = 0;
+       virtual void get_data(const subrange<3>& sr, void* out_linearized) const = 0;
 
-       virtual void set_data(cl::sycl::id<3> offset, raw_buffer_data data) = 0;
+       virtual void set_data(const subrange<3>& sr, const void* in_linearized) = 0;
 
        /**
         * Convenience function to create new buffer_storages of the same (templated) type, useful in contexts where template type information is not available.
@@ -172,59 +119,54 @@ namespace detail {
 
        const device_buffer<DataT, Dims>& get_device_buffer() const { return device_buf; }
 
-       raw_buffer_data get_data(const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range) const override {
-           assert(Dims > 1 || (offset[1] == 0 && range[1] == 1));
-           assert(Dims > 2 || (offset[2] == 0 && range[2] == 1));
+       void get_data(const subrange<3>& sr, void* out_linearized) const override {
+           assert(Dims > 1 || (sr.offset[1] == 0 && sr.range[1] == 1));
+           assert(Dims > 2 || (sr.offset[2] == 0 && sr.range[2] == 1));
 
-           auto result = raw_buffer_data{sizeof(DataT), range};
            auto buf = get_device_buffer();
 
            // ComputeCpp (as of version 2.5.0) expects the target pointer of an explicit copy operation to have the same size as the buffer.
            // As a workaround, we copy the data manually using a kernel.
 #if WORKAROUND_COMPUTECPP
-           cl::sycl::buffer<DataT, Dims> tmp_dst_buf(reinterpret_cast<DataT*>(result.get_pointer()), range_cast<Dims>(range));
+           cl::sycl::buffer<DataT, Dims> tmp_dst_buf(static_cast<DataT*>(out_linearized), range_cast<Dims>(sr.range));
            auto event = transfer_queue.submit([&](cl::sycl::handler& cgh) {
-               const auto src_acc = buf.template get_access<cl::sycl::access::mode::read>(cgh, range_cast<Dims>(range), id_cast<Dims>(offset));
+               const auto src_acc = buf.template get_access<cl::sycl::access::mode::read>(cgh, range_cast<Dims>(sr.range), id_cast<Dims>(sr.offset));
                const auto dst_acc = tmp_dst_buf.template get_access<cl::sycl::access::mode::discard_write>(cgh);
                const auto src_buf_range = buf.get_range();
                cgh.parallel_for<computecpp_get_data_workaround<DataT, Dims>>(
-                   range_cast<Dims>(range), [=](const sycl::id<Dims> id) { dst_acc[id] = ranged_sycl_access(src_acc, src_buf_range, id); });
+                   range_cast<Dims>(sr.range), [=](const sycl::id<Dims> id) { dst_acc[id] = ranged_sycl_access(src_acc, src_buf_range, id); });
            });
 #else
            auto event = transfer_queue.submit([&](cl::sycl::handler& cgh) {
-               auto acc = buf.template get_access<cl::sycl::access::mode::read>(cgh, range_cast<Dims>(range), id_cast<Dims>(offset));
-               cgh.copy(acc, reinterpret_cast<DataT*>(result.get_pointer()));
+               auto acc = buf.template get_access<cl::sycl::access::mode::read>(cgh, range_cast<Dims>(sr.range), id_cast<Dims>(sr.offset));
+               cgh.copy(acc, static_cast<DataT*>(out_linearized));
            });
 #endif
 
            // TODO: Ideally we'd not wait here and instead return some sort of async handle that can be waited upon
            event.wait();
-
-           return result;
        }
 
-       void set_data(cl::sycl::id<3> offset, raw_buffer_data data) override {
-           assert(Dims > 1 || (offset[1] == 0 && data.get_range()[1] == 1));
-           assert(Dims > 2 || (offset[2] == 0 && data.get_range()[2] == 1));
-           assert(data.get_size() == data.get_range().size() * sizeof(DataT));
-           assert(data.get_size() <= device_buf.get_range().size() * sizeof(DataT));
+       void set_data(const subrange<3>& sr, const void* in_linearized) override {
+           assert(Dims > 1 || (sr.offset[1] == 0 && sr.range[1] == 1));
+           assert(Dims > 2 || (sr.offset[2] == 0 && sr.range[2] == 1));
 
            auto buf = get_device_buffer();
 
            // See above for why this workaround is needed.
 #if WORKAROUND_COMPUTECPP
-           cl::sycl::buffer<DataT, Dims> tmp_src_buf(reinterpret_cast<DataT*>(data.get_pointer()), range_cast<Dims>(data.get_range()));
+           cl::sycl::buffer<DataT, Dims> tmp_src_buf(static_cast<const DataT*>(in_linearized), range_cast<Dims>(sr.range));
            auto event = transfer_queue.submit([&](cl::sycl::handler& cgh) {
                auto src_acc = tmp_src_buf.template get_access<cl::sycl::access::mode::read>(cgh);
-               auto dst_acc = buf.template get_access<cl::sycl::access::mode::discard_write>(cgh, range_cast<Dims>(data.get_range()), id_cast<Dims>(offset));
+               auto dst_acc = buf.template get_access<cl::sycl::access::mode::discard_write>(cgh, range_cast<Dims>(sr.range), id_cast<Dims>(sr.offset));
                const auto dst_buf_range = buf.get_range();
                cgh.parallel_for<computecpp_set_data_workaround<DataT, Dims>>(
-                   range_cast<Dims>(data.get_range()), [=](const sycl::id<Dims> id) { ranged_sycl_access(dst_acc, dst_buf_range, id) = src_acc[id]; });
+                   range_cast<Dims>(sr.range), [=](const sycl::id<Dims> id) { ranged_sycl_access(dst_acc, dst_buf_range, id) = src_acc[id]; });
            });
 #else
            auto event = transfer_queue.submit([&](cl::sycl::handler& cgh) {
-               auto acc = buf.template get_access<cl::sycl::access::mode::discard_write>(cgh, range_cast<Dims>(data.get_range()), id_cast<Dims>(offset));
-               cgh.copy(reinterpret_cast<DataT*>(data.get_pointer()), acc);
+               auto acc = buf.template get_access<cl::sycl::access::mode::discard_write>(cgh, range_cast<Dims>(sr.range), id_cast<Dims>(sr.offset));
+               cgh.copy(static_cast<const DataT*>(in_linearized), acc);
            });
 #endif
 
@@ -258,28 +200,24 @@ namespace detail {
    template <typename DataT, int Dims>
    class host_buffer_storage : public buffer_storage {
      public:
-       host_buffer_storage(cl::sycl::range<Dims> range) : buffer_storage(range_cast<3>(range), buffer_type::HOST_BUFFER), host_buf(range) {}
+       explicit host_buffer_storage(cl::sycl::range<Dims> range) : buffer_storage(range_cast<3>(range), buffer_type::HOST_BUFFER), host_buf(range) {}
 
        size_t get_size() const override { return get_range().size() * sizeof(DataT); };
 
-       raw_buffer_data get_data(const cl::sycl::id<3>& offset, const cl::sycl::range<3>& range) const override {
-           assert(Dims > 1 || (offset[1] == 0 && range[1] == 1));
-           assert(Dims > 2 || (offset[2] == 0 && range[2] == 1));
+       void get_data(const subrange<3>& sr, void* out_linearized) const override {
+           assert(Dims > 1 || (sr.offset[1] == 0 && sr.range[1] == 1));
+           assert(Dims > 2 || (sr.offset[2] == 0 && sr.range[2] == 1));
 
-           auto result = raw_buffer_data{sizeof(DataT), range};
-           memcpy_strided(host_buf.get_pointer(), result.get_pointer(), sizeof(DataT), range_cast<Dims>(host_buf.get_range()), id_cast<Dims>(offset),
-               range_cast<Dims>(range), id_cast<Dims>(cl::sycl::id<3>{0, 0, 0}), range_cast<Dims>(range));
-           return result;
+           memcpy_strided(host_buf.get_pointer(), out_linearized, sizeof(DataT), range_cast<Dims>(host_buf.get_range()), id_cast<Dims>(sr.offset),
+               range_cast<Dims>(sr.range), id_cast<Dims>(cl::sycl::id<3>{0, 0, 0}), range_cast<Dims>(sr.range));
        }
 
-       void set_data(cl::sycl::id<3> offset, raw_buffer_data data) override {
-           assert(Dims > 1 || (offset[1] == 0 && data.get_range()[1] == 1));
-           assert(Dims > 2 || (offset[2] == 0 && data.get_range()[2] == 1));
-           assert(data.get_size() == data.get_range().size() * sizeof(DataT));
-           assert(data.get_size() <= host_buf.get_range().size() * sizeof(DataT));
+       void set_data(const subrange<3>& sr, const void* in_linearized) override {
+           assert(Dims > 1 || (sr.offset[1] == 0 && sr.range[1] == 1));
+           assert(Dims > 2 || (sr.offset[2] == 0 && sr.range[2] == 1));
 
-           memcpy_strided(reinterpret_cast<DataT*>(data.get_pointer()), host_buf.get_pointer(), sizeof(DataT), range_cast<Dims>(data.get_range()),
-               id_cast<Dims>(cl::sycl::id<3>(0, 0, 0)), range_cast<Dims>(host_buf.get_range()), id_cast<Dims>(offset), range_cast<Dims>(data.get_range()));
+           memcpy_strided(in_linearized, host_buf.get_pointer(), sizeof(DataT), range_cast<Dims>(sr.range), id_cast<Dims>(cl::sycl::id<3>(0, 0, 0)),
+               range_cast<Dims>(host_buf.get_range()), id_cast<Dims>(sr.offset), range_cast<Dims>(sr.range));
        }
 
        buffer_storage* make_new_of_same_type(cl::sycl::range<3> range) const override { return new host_buffer_storage<DataT, Dims>(range_cast<Dims>(range)); }
@@ -322,7 +260,9 @@ namespace detail {
            // TODO: Optimize for contiguous copies - we could do a single SYCL H->D copy directly.
            else if(source.get_type() == buffer_type::HOST_BUFFER) {
                auto& host_source = dynamic_cast<const host_buffer_storage<DataT, Dims>&>(source);
-               set_data(target_offset, host_source.get_data(source_offset, copy_range));
+               unique_payload_ptr tmp{unique_payload_ptr::allocate_uninitialized<DataT>, copy_range.size()};
+               host_source.get_data(subrange{source_offset, copy_range}, static_cast<DataT*>(tmp.get_pointer()));
+               set_data(subrange{target_offset, copy_range}, static_cast<const DataT*>(tmp.get_pointer()));
            }
 
            else {
@@ -337,8 +277,10 @@ namespace detail {
 
            // TODO: Optimize for contiguous copies - we could do a single SYCL D->H copy directly.
            if(source.get_type() == buffer_type::DEVICE_BUFFER) {
-               auto data = source.get_data(source_offset, copy_range);
-               set_data(target_offset, std::move(data));
+               // This looks more convoluted than using a vector<DataT>, but that would break if DataT == bool
+               unique_payload_ptr tmp{unique_payload_ptr::allocate_uninitialized<DataT>, copy_range.size()};
+               source.get_data(subrange{source_offset, copy_range}, static_cast<DataT*>(tmp.get_pointer()));
+               set_data(subrange{target_offset, copy_range}, static_cast<const DataT*>(tmp.get_pointer()));
            }
 
            else if(source.get_type() == buffer_type::HOST_BUFFER) {
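The implementation of linearize_subrange is not part of this hunk, but its contract follows from the signature: gather a (possibly strided) subrange of a row-major source allocation into a dense target. A self-contained 2D sketch of that idea - an illustration, not the celerity implementation:

    #include <cassert>
    #include <cstddef>
    #include <cstring>
    #include <vector>

    // Copy an extent_y x extent_x window at (offset_y, offset_x) out of a
    // row-major source of width source_width into a dense target buffer.
    // The element type is erased via elem_size, as in linearize_subrange.
    void linearize_2d(const void* source_base, void* target, size_t elem_size, size_t source_width,
        size_t offset_y, size_t offset_x, size_t extent_y, size_t extent_x) {
        auto* src = static_cast<const unsigned char*>(source_base);
        auto* dst = static_cast<unsigned char*>(target);
        for(size_t y = 0; y < extent_y; ++y) {
            const size_t src_idx = (offset_y + y) * source_width + offset_x;
            std::memcpy(dst + y * extent_x * elem_size, src + src_idx * elem_size, extent_x * elem_size);
        }
    }

    int main() {
        std::vector<int> source(8 * 8);
        for(size_t i = 0; i < source.size(); ++i) source[i] = static_cast<int>(i);
        std::vector<int> dense(3 * 4);
        linearize_2d(source.data(), dense.data(), sizeof(int), 8, 2, 1, 3, 4);
        assert(dense[0] == 2 * 8 + 1); // first element of the window
        assert(dense[4] == 3 * 8 + 1); // rows are packed back to back
    }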
27 changes: 27 additions & 0 deletions include/mpi_support.h
@@ -28,6 +28,8 @@ class unique_frame_ptr : private std::unique_ptr<Frame> {
   private:
    using impl = std::unique_ptr<Frame>;
 
+   friend class unique_payload_ptr;
+
   public:
    using payload_type = typename Frame::payload_type;
 
@@ -74,4 +76,29 @@
    }
 };
 
+class unique_payload_ptr : private std::unique_ptr<void, std::function<void(void*)>> {
+  private:
+   using impl = std::unique_ptr<void, std::function<void(void*)>>;
+
+  public:
+   template <typename T>
+   struct allocate_uninitialized_tag {};
+
+   template <typename T>
+   inline static constexpr allocate_uninitialized_tag<T> allocate_uninitialized;
+
+   unique_payload_ptr() noexcept = default;
+
+   template <typename T>
+   explicit unique_payload_ptr(allocate_uninitialized_tag<T>, size_t count) : impl(operator new(count * sizeof(T)), [](void* p) { operator delete(p); }) {}
+
+   template <typename Frame>
+   explicit unique_payload_ptr(unique_frame_ptr<Frame> frame) : impl(frame.release() + 1, [](void* p) { delete(static_cast<Frame*>(p) - 1); }) {}
+
+   void* get_pointer() { return impl::get(); }
+   const void* get_pointer() const { return impl::get(); }
+
+   using impl::operator bool;
+};
+
 } // namespace celerity::detail
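
unique_payload_ptr is deliberately not a template: a std::unique_ptr<void, std::function<void(void*)>> can own either a plain allocation or the payload section of a received frame, because the deleter captures how the memory must be freed, at the cost of one indirection when destroying. A self-contained sketch of the same type-erasure trick, independent of the celerity types above:

    #include <cstddef>
    #include <cstdio>
    #include <functional>
    #include <memory>

    using erased_ptr = std::unique_ptr<void, std::function<void(void*)>>;

    struct header { // stand-in for a frame with a trailing payload
        int meta;
        float payload[4];
    };

    int main() {
        // Case 1: owns a raw uninitialized allocation, freed with operator delete.
        erased_ptr raw{operator new(1024 * sizeof(float)), [](void* p) { operator delete(p); }};

        // Case 2: owns only the payload of a larger object; the deleter recovers
        // the enclosing header before deleting, much like the unique_frame_ptr
        // constructor above steps back from the payload to its Frame.
        auto* h = new header{42, {}};
        erased_ptr payload{h->payload, [](void* p) {
            delete reinterpret_cast<header*>(static_cast<unsigned char*>(p) - offsetof(header, payload));
        }};

        std::printf("both payloads alive: %d\n", raw && payload);
    } // both deleters run here, each freeing its memory the right way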
10 changes: 4 additions & 6 deletions include/reduction_manager.h
@@ -19,15 +19,15 @@ namespace detail {
        explicit abstract_buffer_reduction(buffer_id bid, bool include_current_buffer_value) : info{bid, include_current_buffer_value} {}
        virtual ~abstract_buffer_reduction() = default;
 
-       void push_overlapping_data(node_id source_nid, raw_buffer_data data) { overlapping_data.emplace_back(source_nid, std::move(data)); }
+       void push_overlapping_data(node_id source_nid, unique_payload_ptr data) { overlapping_data.emplace_back(source_nid, std::move(data)); }
 
        virtual void reduce_to_buffer() = 0;
 
        reduction_info get_info() const { return info; }
 
      protected:
        reduction_info info;
-       std::vector<std::pair<node_id, raw_buffer_data>> overlapping_data;
+       std::vector<std::pair<node_id, unique_payload_ptr>> overlapping_data;
    };
 
    template <typename DataT, int Dims, typename BinaryOperation>
@@ -41,9 +41,7 @@ namespace detail {
 
            DataT acc = init;
            for(auto& [nid, data] : overlapping_data) {
-               assert(data.get_range() == cl::sycl::range<3>(1, 1, 1));
-               DataT other = *static_cast<const DataT*>(data.get_pointer());
-               acc = op(acc, other);
+               acc = op(acc, *static_cast<const DataT*>(data.get_pointer()));
            }
 
            auto host_buf = runtime::get_instance().get_buffer_manager().get_host_buffer<DataT, Dims>(
@@ -76,7 +74,7 @@ namespace detail {
            return reductions.at(rid)->get_info();
        }
 
-       void push_overlapping_reduction_data(reduction_id rid, node_id source_nid, raw_buffer_data data) {
+       void push_overlapping_reduction_data(reduction_id rid, node_id source_nid, unique_payload_ptr data) {
            std::lock_guard lock{mutex};
            reductions.at(rid)->push_overlapping_data(source_nid, std::move(data));
        }
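With reductions, every peer contributes exactly one element, so the payload no longer needs to carry a range at all: the old assert on get_range() is unnecessary by construction, and the combine loop collapses to a single dereference per peer. A minimal stand-in for that loop in plain C++ (not the celerity types):

    #include <functional>
    #include <vector>

    int main() {
        int a = 3, b = 4;
        std::vector<const void*> payloads{&a, &b}; // one scalar per peer node
        auto op = std::plus<int>{};
        int acc = 0;
        for(const void* p : payloads) acc = op(acc, *static_cast<const int*>(p));
        return acc == 7 ? 0 : 1; // exit code 0 on the expected sum
    }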
(diffs for the remaining 6 changed files not shown)
