Skip to content

Commit

Permalink
Introduce backend system for vendor-specific code paths
Browse files Browse the repository at this point in the history
Since SYCL 2020 does not support multi-dimensional (rectangular) copies
for USM pointers, we have to either do it in a loop (slow) or fall back
to vendor-specific APIs.

This introduces a new "backend" system that does the latter.
Currently only "generic" (= SYCL, slow) and CUDA (when using OpenSYCL or
DPC++) are supported.

Since backends are configuration during compile time, this additionally
introduces a new integration testing mechanism for testing backends.
This requires Celerity to be built with different CMake options, so the
test is implemented as a Python script.
  • Loading branch information
psalz committed Mar 28, 2023
1 parent 44497b3 commit 750f32a
Show file tree
Hide file tree
Showing 23 changed files with 778 additions and 67 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/celerity_ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -171,14 +171,17 @@ jobs:
timeout-minutes: 5
working-directory: ${{ env.build-dir }}
run: ${{ env.container-workspace }}/ci/run-unit-tests.sh
- name: Run integration tests
- name: Run examples
timeout-minutes: 5
# We build examples twice, but only run the installed version (which probably has more failure modes)
working-directory: ${{ env.examples-build-dir }}
run: ${{ env.container-workspace }}/ci/run-integration-tests.sh /data/Lenna.png 1 2 4
run: ${{ env.container-workspace }}/ci/run-examples.sh /data/Lenna.png 1 2 4
- name: Run system tests
working-directory: ${{ env.build-dir }}
run: ${{ env.container-workspace }}/ci/run-system-tests.sh 2 4
- name: Run integration tests
working-directory: ${{ env.build-dir }}
run: ${{ env.container-workspace }}/test/integration/run-integration-tests.py . ${{ matrix.platform }}
- name: Upload stack traces (if any)
if: always()
uses: actions/upload-artifact@v2
Expand Down
5 changes: 4 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,9 @@ target_link_libraries(celerity_runtime PUBLIC
${SYCL_LIB}
)

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/backend)
target_link_libraries(celerity_runtime PUBLIC celerity_backends)

# For debug builds, we set the CELERITY_DETAIL_ENABLE_DEBUG preprocessor flag,
# which allows Celerity to control debug functionality within headers regardless
# of a user target's build type. (This flag is not intended to be modified by
Expand Down Expand Up @@ -368,7 +371,7 @@ install(
DESTINATION include/celerity/vendor
)
install(
TARGETS celerity_runtime
TARGETS celerity_runtime celerity_backends
EXPORT install_exports
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
Expand Down
File renamed without changes.
3 changes: 3 additions & 0 deletions cmake/celerity-config.cmake.in
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,9 @@ find_dependency(fmt REQUIRED)
find_dependency(spdlog REQUIRED)
find_dependency(small_vector REQUIRED)
find_dependency(libenvpp REQUIRED)
if(@CELERITY_ENABLE_CUDA_BACKEND@)
find_dependency(CUDAToolkit REQUIRED)
endif()

if(CELERITY_SYCL_IMPL STREQUAL "hipSYCL")
if(NOT DEFINED HIPSYCL_TARGETS AND NOT "@HIPSYCL_TARGETS@" STREQUAL "")
Expand Down
57 changes: 57 additions & 0 deletions include/backend/backend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#pragma once

#include <sycl/sycl.hpp>

#include "backend/generic_backend.h"
#include "backend/traits.h"
#include "backend/type.h"

// NOTE: These should not leak any symbols from the backend library (i.e. don't include it in the header)
#if CELERITY_DETAIL_BACKEND_CUDA_ENABLED
#include "backend/cuda_backend.h"
#endif

// Helper function to instantiate `Template` (during compile time) based on the backend type (a runtime value).
namespace celerity::detail::backend_detail {
template <template <backend::type> typename Template, typename Callback>
auto specialize_for_backend(backend::type type, Callback cb) {
switch(type) {
case backend::type::cuda: return cb(Template<backend::type::cuda>{});
case backend::type::generic: return cb(Template<backend::type::generic>{});
case backend::type::unknown: [[fallthrough]];
default: return cb(Template<backend::type::unknown>{});
}
}
} // namespace celerity::detail::backend_detail

namespace celerity::detail::backend {

/**
* Returns the detected backend type for this SYCL device.
*
* Returns either a specialized backend or 'unknown', never 'generic'.
*/
type get_type(const sycl::device& device);

/**
* Returns the effective backend type for this SYCL device, depending on the detected
* backend type and which backend modules have been compiled.
*
* Returns either a specialized backend or 'generic', never 'unknown'.
*/
type get_effective_type(const sycl::device& device);

inline std::string_view get_name(type type) {
return backend_detail::specialize_for_backend<backend_detail::name>(type, [](auto op) { return decltype(op)::value; });
}

template <int Dims>
void memcpy_strided_device(sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size, const sycl::range<Dims>& source_range,
const sycl::id<Dims>& source_offset, const sycl::range<Dims>& target_range, const sycl::id<Dims>& target_offset, const sycl::range<Dims>& copy_range) {
backend_detail::specialize_for_backend<backend_detail::backend_operations>(get_effective_type(queue.get_device()), [&](auto op) {
decltype(op)::memcpy_strided_device(
queue, source_base_ptr, target_base_ptr, elem_size, source_range, source_offset, target_range, target_offset, copy_range);
});
}

} // namespace celerity::detail::backend
27 changes: 27 additions & 0 deletions include/backend/cuda_backend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#pragma once

#include <sycl/sycl.hpp>

#include "backend/operations.h"
#include "backend/type.h"

namespace celerity::detail::backend_detail {

void memcpy_strided_device_cuda(sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size, const sycl::range<1>& source_range,
const sycl::id<1>& source_offset, const sycl::range<1>& target_range, const sycl::id<1>& target_offset, const sycl::range<1>& copy_range);

void memcpy_strided_device_cuda(sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size, const sycl::range<2>& source_range,
const sycl::id<2>& source_offset, const sycl::range<2>& target_range, const sycl::id<2>& target_offset, const sycl::range<2>& copy_range);

void memcpy_strided_device_cuda(sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size, const sycl::range<3>& source_range,
const sycl::id<3>& source_offset, const sycl::range<3>& target_range, const sycl::id<3>& target_offset, const sycl::range<3>& copy_range);

template <>
struct backend_operations<backend::type::cuda> {
template <typename... Args>
static void memcpy_strided_device(Args&&... args) {
memcpy_strided_device_cuda(args...);
}
};

} // namespace celerity::detail::backend_detail
27 changes: 27 additions & 0 deletions include/backend/generic_backend.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#pragma once

#include <sycl/sycl.hpp>

#include "backend/operations.h"
#include "backend/type.h"

namespace celerity::detail::backend_detail {

void memcpy_strided_device_generic(sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size, const sycl::range<1>& source_range,
const sycl::id<1>& source_offset, const sycl::range<1>& target_range, const sycl::id<1>& target_offset, const sycl::range<1>& copy_range);

void memcpy_strided_device_generic(sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size, const sycl::range<2>& source_range,
const sycl::id<2>& source_offset, const sycl::range<2>& target_range, const sycl::id<2>& target_offset, const sycl::range<2>& copy_range);

void memcpy_strided_device_generic(sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size, const sycl::range<3>& source_range,
const sycl::id<3>& source_offset, const sycl::range<3>& target_range, const sycl::id<3>& target_offset, const sycl::range<3>& copy_range);

template <>
struct backend_operations<backend::type::generic> {
template <typename... Args>
static void memcpy_strided_device(Args&&... args) {
memcpy_strided_device_generic(args...);
}
};

} // namespace celerity::detail::backend_detail
17 changes: 17 additions & 0 deletions include/backend/operations.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#pragma once

#include <stdexcept>

#include "backend/type.h"

namespace celerity::detail::backend_detail {

template <backend::type Type>
struct backend_operations {
template <typename... Args>
static void memcpy_strided_device(Args&&... args) {
throw std::runtime_error{"Invalid backend"};
}
};

} // namespace celerity::detail::backend_detail
41 changes: 41 additions & 0 deletions include/backend/traits.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#pragma once

#include <type_traits>

#include "backend/type.h"

namespace celerity::detail::backend_detail {

template <backend::type Type>
struct is_enabled : public std::false_type {};

template <backend::type Type>
constexpr bool is_enabled_v = is_enabled<Type>::value;

template <backend::type Type>
struct name {
static constexpr const char* value = "(unknown)";
};

template <backend::type Type>
constexpr const char* name_v = name<Type>::value;

template <>
struct is_enabled<backend::type::generic> : public std::true_type {};

template <>
struct name<backend::type::generic> {
static constexpr const char* value = "generic";
};

#if CELERITY_DETAIL_BACKEND_CUDA_ENABLED
template <>
struct is_enabled<backend::type::cuda> : public std::true_type {};
#endif

template <>
struct name<backend::type::cuda> {
static constexpr const char* value = "CUDA";
};

} // namespace celerity::detail::backend_detail
5 changes: 5 additions & 0 deletions include/backend/type.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
#pragma once

namespace celerity::detail::backend {
enum class type { generic, cuda, unknown };
} // namespace celerity::detail::backend
22 changes: 5 additions & 17 deletions include/buffer_storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <CL/sycl.hpp>

#include "backend/backend.h"
#include "payload.h"
#include "ranges.h"
#include "workaround.h"
Expand All @@ -25,19 +26,6 @@ namespace detail {
const cl::sycl::id<3>& source_offset, const cl::sycl::range<3>& target_range, const cl::sycl::id<3>& target_offset,
const cl::sycl::range<3>& copy_range);

// SYCL 2020 doesn't include any strided overloads for memcpy, so much like on the host, we have to roll our own.
void memcpy_strided_device(cl::sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size,
const cl::sycl::range<1>& source_range, const cl::sycl::id<1>& source_offset, const cl::sycl::range<1>& target_range,
const cl::sycl::id<1>& target_offset, const cl::sycl::range<1>& copy_range);

void memcpy_strided_device(cl::sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size,
const cl::sycl::range<2>& source_range, const cl::sycl::id<2>& source_offset, const cl::sycl::range<2>& target_range,
const cl::sycl::id<2>& target_offset, const cl::sycl::range<2>& copy_range);

void memcpy_strided_device(cl::sycl::queue& queue, const void* source_base_ptr, void* target_base_ptr, size_t elem_size,
const cl::sycl::range<3>& source_range, const cl::sycl::id<3>& source_offset, const cl::sycl::range<3>& target_range,
const cl::sycl::id<3>& target_offset, const cl::sycl::range<3>& copy_range);

void linearize_subrange(const void* source_base_ptr, void* target_ptr, size_t elem_size, const range<3>& source_range, const subrange<3>& copy_sr);

template <typename DataT, int Dims>
Expand Down Expand Up @@ -158,8 +146,8 @@ namespace detail {
assert_copy_is_in_range(range_cast<3>(m_device_buf.get_range()), sr.range, sr.offset, id<3>{}, sr.range);

// TODO: Ideally we'd make this non-blocking and return some sort of async handle that can be waited upon
memcpy_strided_device(m_owning_queue, m_device_buf.get_pointer(), out_linearized, sizeof(DataT), m_device_buf.get_range(), id_cast<Dims>(sr.offset),
range_cast<Dims>(sr.range), id<Dims>{}, range_cast<Dims>(sr.range));
backend::memcpy_strided_device(m_owning_queue, m_device_buf.get_pointer(), out_linearized, sizeof(DataT), m_device_buf.get_range(),
id_cast<Dims>(sr.offset), range_cast<Dims>(sr.range), id<Dims>{}, range_cast<Dims>(sr.range));
}

void set_data(const subrange<3>& sr, const void* in_linearized) override {
Expand All @@ -168,7 +156,7 @@ namespace detail {
assert_copy_is_in_range(sr.range, range_cast<3>(m_device_buf.get_range()), id<3>{}, sr.offset, sr.range);

// TODO: Ideally we'd make this non-blocking and return some sort of async handle that can be waited upon
memcpy_strided_device(m_owning_queue, in_linearized, m_device_buf.get_pointer(), sizeof(DataT), range_cast<Dims>(sr.range), id<Dims>{},
backend::memcpy_strided_device(m_owning_queue, in_linearized, m_device_buf.get_pointer(), sizeof(DataT), range_cast<Dims>(sr.range), id<Dims>{},
m_device_buf.get_range(), id_cast<Dims>(sr.offset), range_cast<Dims>(sr.range));
}

Expand Down Expand Up @@ -225,7 +213,7 @@ namespace detail {

if(source.get_type() == buffer_type::device_buffer) {
auto& device_source = dynamic_cast<const device_buffer_storage<DataT, Dims>&>(source);
memcpy_strided_device(m_owning_queue, device_source.m_device_buf.get_pointer(), m_device_buf.get_pointer(), sizeof(DataT),
backend::memcpy_strided_device(m_owning_queue, device_source.m_device_buf.get_pointer(), m_device_buf.get_pointer(), sizeof(DataT),
device_source.m_device_buf.get_range(), id_cast<Dims>(source_offset), m_device_buf.get_range(), id_cast<Dims>(target_offset),
range_cast<Dims>(copy_range));
}
Expand Down
17 changes: 17 additions & 0 deletions include/device_queue.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

#include <CL/sycl.hpp>

#include "backend/backend.h"
#include "config.h"
#include "log.h"
#include "workaround.h"
Expand Down Expand Up @@ -245,6 +246,22 @@ namespace detail {
const auto device_name = device.template get_info<sycl::info::device::name>();
CELERITY_INFO("Using platform '{}', device '{}' ({})", platform_name, device_name, how_selected);

if constexpr(std::is_same_v<DeviceT, sycl::device>) {
if(backend::get_effective_type(device) == backend::type::generic) {
if(backend::get_type(device) == backend::type::unknown) {
CELERITY_WARN("No backend specialization available for selected platform '{}', falling back to generic. Performance may be degraded.",
device.get_platform().template get_info<sycl::info::platform::name>());
} else {
CELERITY_WARN(
"Selected platform '{}' is compatible with specialized {} backend, but it has not been compiled. Performance may be degraded.",
device.get_platform().template get_info<sycl::info::platform::name>(), backend::get_name(backend::get_type(device)));
}
} else {
CELERITY_DEBUG("Using {} backend for selected platform '{}'.", backend::get_name(backend::get_effective_type(device)),
device.get_platform().template get_info<sycl::info::platform::name>());
}
}

return device;
}

Expand Down
23 changes: 23 additions & 0 deletions src/backend/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
find_package(CUDAToolkit QUIET)
# find_package(LevelZero QUIET) # TODO: Need find module?
# find_package(ROCM QUIET) # TODO: Need find module?

# TODO: Make conditional on CELERITY_SYCL_IMPL?
option(CELERITY_ENABLE_CUDA_BACKEND "Enable optimized code paths for CUDA backends" ${CUDAToolkit_FOUND})
if(CELERITY_ENABLE_CUDA_BACKEND AND NOT CUDAToolkit_FOUND)
# Run find_package again to emit error message
find_package(CUDAToolkit REQUIRED)
endif()

add_library(celerity_backends STATIC backend.cc generic_backend.cc)
set_property(TARGET celerity_backends PROPERTY CXX_STANDARD 17)
# We link against the RT here to get all of its transitive properties (circular linking is allowed for static libraries).
target_link_libraries(celerity_backends PRIVATE celerity_runtime)
add_sycl_to_target(TARGET celerity_backends SOURCES)

if(CELERITY_ENABLE_CUDA_BACKEND)
target_sources(celerity_backends PRIVATE cuda_backend.cc)
target_link_libraries(celerity_backends PUBLIC CUDA::cudart)
target_compile_definitions(celerity_backends PUBLIC "CELERITY_DETAIL_BACKEND_CUDA_ENABLED=1")
message(STATUS "CUDA backend enabled")
endif()
25 changes: 25 additions & 0 deletions src/backend/backend.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#include "backend/backend.h"

namespace celerity::detail::backend {

type get_type(const sycl::device& device) {
#if defined(__HIPSYCL__) && defined(SYCL_EXT_HIPSYCL_BACKEND_CUDA)
if(device.get_backend() == sycl::backend::cuda) { return type::cuda; }
#endif
#if defined(__SYCL_COMPILER_VERSION) // DPC++ (TODO: This may break when using OpenSYCL w/ DPC++ as compiler)
if(device.get_backend() == sycl::backend::ext_oneapi_cuda) { return type::cuda; }
#endif
return type::unknown;
}

type get_effective_type(const sycl::device& device) {
[[maybe_unused]] const auto b = get_type(device);

#if defined(CELERITY_DETAIL_BACKEND_CUDA_ENABLED)
if(b == type::cuda) return b;
#endif

return type::generic;
}

} // namespace celerity::detail::backend
Loading

0 comments on commit 750f32a

Please sign in to comment.