Use native pointers for backing buffer allocations
Use native pointers (allocated using `sycl::malloc_device`) instead of
relying on SYCL buffers for backing Celerity virtual buffers.

This greatly simplifies various aspects of accessors and buffer
management while enabling future optimizations. Furthermore, by using
native pointers we completely circumvent any dataflow analysis performed
by the SYCL runtime.
psalz committed Mar 28, 2023
1 parent 5aa33d6 commit 44497b3
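As an illustration of the allocation pattern this commit moves to, here is a minimal sketch (not part of the commit; the helper names and element type are made up for the example) of backing memory with SYCL 2020 USM device pointers instead of a sycl::buffer:

#include <sycl/sycl.hpp>

// Hypothetical helper: back a Celerity virtual buffer with a raw device allocation.
// sycl::malloc_device returns a device-only pointer, so the SYCL runtime performs
// no implicit dataflow analysis or host shadow allocation for accesses through it.
float* allocate_device_backing(sycl::queue& q, const size_t count) {
	return sycl::malloc_device<float>(count, q);
}

void free_device_backing(sycl::queue& q, float* const ptr) {
	// USM allocations are released explicitly rather than by buffer destructors.
	sycl::free(ptr, q);
}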
Showing 15 changed files with 409 additions and 701 deletions.
205 changes: 51 additions & 154 deletions include/accessor.h

Large diffs are not rendered by default.

130 changes: 35 additions & 95 deletions include/buffer_manager.h
@@ -18,6 +18,7 @@
#include "payload.h"
#include "ranges.h"
#include "region_map.h"
#include "sycl_wrappers.h"
#include "types.h"

namespace celerity {
@@ -26,11 +27,9 @@ namespace detail {
/**
* The buffer_manager keeps track of all Celerity buffers currently existing within the runtime.
*
* This includes both host and device buffers. Note that instead of relying on SYCL's host-side buffers,
* we keep separate copies that allow for more explicit control. All data accesses within device buffers
* are on the device or through explicit memory operations, meaning that a sufficiently optimized SYCL
* implementation would never have to allocate any host memory whatsoever. Users need to ensure that
* device buffers returned from the buffer_manager are also only being used on the device.
* This includes both host and device buffers. Note that we do not rely on SYCL buffers at all; instead,
* we manage host and device memory manually, the latter through sycl::malloc and sycl::free, which are
* part of SYCL 2020's USM APIs.
*
* Most operations of the buffer_manager are performed lazily. For example, upon registering a buffer,
* no memory is being allocated on either the host or device. Only when requesting an explicit range of
@@ -89,28 +88,38 @@

using buffer_lifecycle_callback = std::function<void(buffer_lifecycle_event, buffer_id)>;

using device_buffer_factory = std::function<std::unique_ptr<buffer_storage>(const range<3>&, sycl::queue&)>;
using host_buffer_factory = std::function<std::unique_ptr<buffer_storage>(const range<3>&)>;

struct buffer_info {
cl::sycl::range<3> range = {1, 1, 1};
size_t element_size = 0;
bool is_host_initialized;
std::string debug_name = {};

device_buffer_factory construct_device;
host_buffer_factory construct_host;
};

/**
* When requesting a host or device buffer through the buffer_manager, this is what is returned.
* When requesting access to a host or device buffer through the buffer_manager, this is what is returned.
*/
template <typename DataT, int Dims, template <typename, int> class BufferT>
struct access_info {
/**
* This is the *currently used* backing buffer for the requested virtual buffer.
* This is a pointer to the *currently used* backing buffer for the requested virtual buffer.
* This pointer can become stale if the backing buffer needs to be resized by a subsequent access.
*/
BufferT<DataT, Dims>& buffer;
void* ptr;

/**
* The range of the backing buffer for the requested virtual buffer.
*/
range<3> backing_buffer_range;

/**
* This is the offset of the backing buffer relative to the requested virtual buffer.
*/
cl::sycl::id<Dims> offset;
id<3> backing_buffer_offset;
};

using buffer_lock_id = size_t;
@@ -125,7 +134,12 @@ namespace detail {
{
std::unique_lock lock(m_mutex);
bid = m_buffer_count++;
m_buffer_infos[bid] = buffer_info{range, sizeof(DataT), is_host_initialized};
m_buffers.emplace(std::piecewise_construct, std::tuple{bid}, std::tuple{});
auto device_factory = [](const ::celerity::range<3>& r, sycl::queue& q) {
return std::make_unique<device_buffer_storage<DataT, Dims>>(range_cast<Dims>(r), q);
};
auto host_factory = [](const ::celerity::range<3>& r) { return std::make_unique<host_buffer_storage<DataT, Dims>>(range_cast<Dims>(r)); };
m_buffer_infos.emplace(bid, buffer_info{range, sizeof(DataT), is_host_initialized, {}, std::move(device_factory), std::move(host_factory)});
m_newest_data_location.emplace(bid, region_map<data_location>(range, data_location::nowhere));

#if defined(CELERITY_DETAIL_ENABLE_DEBUG)
@@ -134,8 +148,8 @@
}
if(is_host_initialized) {
// We need to access the full range for host-initialized buffers.
auto info = get_host_buffer<DataT, Dims>(bid, cl::sycl::access::mode::discard_write, range, cl::sycl::id<3>(0, 0, 0));
std::memcpy(info.buffer.get_pointer(), host_init_ptr, range.size() * sizeof(DataT));
auto info = access_host_buffer(bid, cl::sycl::access::mode::discard_write, range, cl::sycl::id<3>(0, 0, 0));
std::memcpy(info.ptr, host_init_ptr, range.size() * sizeof(DataT));
}
m_lifecycle_cb(buffer_lifecycle_event::registered, bid);
return bid;
@@ -196,98 +210,25 @@ namespace detail {
void set_buffer_data(buffer_id bid, const subrange<3>& sr, unique_payload_ptr in_linearized);

template <typename DataT, int Dims>
access_info<DataT, Dims, device_buffer> get_device_buffer(
buffer_id bid, cl::sycl::access::mode mode, const cl::sycl::range<3>& range, const cl::sycl::id<3>& offset) {
std::unique_lock lock(m_mutex);
access_info access_device_buffer(buffer_id bid, cl::sycl::access::mode mode, const cl::sycl::range<3>& range, const cl::sycl::id<3>& offset) {
#if defined(CELERITY_DETAIL_ENABLE_DEBUG)
assert((m_buffer_types.at(bid)->has_type<DataT, Dims>()));
#endif
assert((range_cast<3>(offset + range) <= m_buffer_infos.at(bid).range) == cl::sycl::range<3>(true, true, true));

auto& existing_buf = m_buffers[bid].device_buf;
backing_buffer replacement_buf;

if(!existing_buf.is_allocated()) {
replacement_buf =
backing_buffer{std::make_unique<device_buffer_storage<DataT, Dims>>(range_cast<Dims>(range), m_queue.get_sycl_queue()), offset};
} else {
// FIXME: For large buffers we might not be able to store two copies in device memory at once.
// Instead, we'd first have to transfer everything to the host and free the old buffer before allocating the new one.
// TODO: What we CAN do however already is to free the old buffer early iff we're requesting a discard_* access!
// (AND that access request covers the entirety of the old buffer!)
const auto info = is_resize_required(existing_buf, range, offset);
if(info.resize_required) {
replacement_buf = backing_buffer{
std::make_unique<device_buffer_storage<DataT, Dims>>(range_cast<Dims>(info.new_range), m_queue.get_sycl_queue()), info.new_offset};
}
}

audit_buffer_access(bid, replacement_buf.is_allocated(), mode);

if(m_test_mode && replacement_buf.is_allocated()) {
auto device_buf = static_cast<device_buffer_storage<DataT, Dims>*>(replacement_buf.storage.get())->get_device_buffer();

// We need two separate approaches here for hipSYCL and ComputeCpp, as hipSYCL currently (0.9.1) does
// not support reinterpreting buffers to other dimensionalities, while ComputeCpp (2.5.0) does not
// support filling buffers with arbitrary data types.
#if CELERITY_WORKAROUND(HIPSYCL)
DataT pattern;
memset(&pattern, test_mode_pattern, sizeof(DataT));
#else
auto byte_buf = device_buf.template reinterpret<unsigned char, 1>();
#endif

m_queue.get_sycl_queue()
.submit([&](cl::sycl::handler& cgh) {
#if CELERITY_WORKAROUND(HIPSYCL)
auto acc = device_buf.template get_access<cl::sycl::access::mode::discard_write>(cgh);
cgh.fill(acc, pattern);
#else
auto acc = byte_buf.template get_access<cl::sycl::access::mode::discard_write>(cgh);
cgh.fill(acc, test_mode_pattern);
#endif
})
.wait();
}

existing_buf = make_buffer_subrange_coherent(bid, mode, std::move(existing_buf), {offset, range}, std::move(replacement_buf));

return {dynamic_cast<device_buffer_storage<DataT, Dims>*>(existing_buf.storage.get())->get_device_buffer(), id_cast<Dims>(existing_buf.offset)};
return access_device_buffer(bid, mode, range, offset);
}

access_info access_device_buffer(buffer_id bid, cl::sycl::access::mode mode, const cl::sycl::range<3>& range, const cl::sycl::id<3>& offset);

template <typename DataT, int Dims>
access_info<DataT, Dims, host_buffer> get_host_buffer(
buffer_id bid, cl::sycl::access::mode mode, const cl::sycl::range<3>& range, const cl::sycl::id<3>& offset) {
std::unique_lock lock(m_mutex);
access_info access_host_buffer(buffer_id bid, cl::sycl::access::mode mode, const cl::sycl::range<3>& range, const cl::sycl::id<3>& offset) {
#if defined(CELERITY_DETAIL_ENABLE_DEBUG)
assert((m_buffer_types.at(bid)->has_type<DataT, Dims>()));
#endif
assert((range_cast<3>(offset + range) <= m_buffer_infos.at(bid).range) == cl::sycl::range<3>(true, true, true));

auto& existing_buf = m_buffers[bid].host_buf;
backing_buffer replacement_buf;

if(!existing_buf.is_allocated()) {
replacement_buf = backing_buffer{std::make_unique<host_buffer_storage<DataT, Dims>>(range_cast<Dims>(range)), offset};
} else {
const auto info = is_resize_required(existing_buf, range, offset);
if(info.resize_required) {
replacement_buf = backing_buffer{std::make_unique<host_buffer_storage<DataT, Dims>>(range_cast<Dims>(info.new_range)), info.new_offset};
}
}

audit_buffer_access(bid, replacement_buf.is_allocated(), mode);

if(m_test_mode && replacement_buf.is_allocated()) {
auto& host_buf = static_cast<host_buffer_storage<DataT, Dims>*>(replacement_buf.storage.get())->get_host_buffer();
std::memset(host_buf.get_pointer(), test_mode_pattern, host_buf.get_range().size() * sizeof(DataT));
}

existing_buf = make_buffer_subrange_coherent(bid, mode, std::move(existing_buf), {offset, range}, std::move(replacement_buf));

return {static_cast<host_buffer_storage<DataT, Dims>*>(existing_buf.storage.get())->get_host_buffer(), id_cast<Dims>(existing_buf.offset)};
return access_host_buffer(bid, mode, range, offset);
}

access_info access_host_buffer(buffer_id bid, cl::sycl::access::mode mode, const cl::sycl::range<3>& range, const cl::sycl::id<3>& offset);

/**
* @brief Tries to lock the given list of @p buffers using the given lock @p id.
*
@@ -367,7 +308,6 @@ namespace detail {
return dynamic_cast<const buffer_type_guard<DataT, Dims>*>(this) != nullptr;
}
};

template <typename DataT, int Dims>
struct buffer_type_guard : buffer_type_guard_base {};
#endif
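For context, here is a sketch of how a caller might consume the new pointer-based access_info returned by access_device_buffer. This usage is inferred from the header shown above and is not part of the commit; the buffer_manager instance bm, the buffer id bid, and the float element type are assumptions made for the example.

// Assumed: buffer_manager& bm and a registered buffer_id bid referring to a 1D float buffer of length 64.
const cl::sycl::range<3> req_range{64, 1, 1};
const cl::sycl::id<3> req_offset{0, 0, 0};
auto info = bm.access_device_buffer<float, 1>(bid, cl::sycl::access::mode::read_write, req_range, req_offset);

// info.ptr points at the backing allocation, which may be larger than (and offset
// against) the requested virtual-buffer subrange, so indices must be translated.
auto* base = static_cast<float*>(info.ptr);
const size_t backing_index = size_t{5} - info.backing_buffer_offset[0];
// base[backing_index] now addresses virtual-buffer element 5; it refers to device
// memory and must not be dereferenced on the host.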