Skip to content

Commit

Permalink
Lift restrictions on reductions for DPC++
Browse files Browse the repository at this point in the history
DPC++ now supports passing multiple reductions into a kernel. The
restriction on 1-dimensional buffers can also be lifted as we now use
pointer-based reductions internally.
  • Loading branch information
psalz committed May 23, 2023
1 parent 0a96d15 commit efff21b
Show file tree
Hide file tree
Showing 10 changed files with 54 additions and 69 deletions.
11 changes: 5 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -134,17 +134,12 @@ endif()

if(CELERITY_SYCL_IMPL STREQUAL hipSYCL AND HIPSYCL_SUPPORTS_SYCL_2020_REDUCTIONS)
set(CELERITY_FEATURE_SCALAR_REDUCTIONS ON)
set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ON)
elseif(CELERITY_SYCL_IMPL STREQUAL "DPC++")
set(CELERITY_FEATURE_SCALAR_REDUCTIONS OFF)
set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ON)
set(CELERITY_FEATURE_SCALAR_REDUCTIONS ON)
else()
set(CELERITY_FEATURE_SCALAR_REDUCTIONS OFF)
set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS OFF)
endif()

set(CELERITY_FEATURE_LOCAL_ACCESSOR ON)

if(NOT CELERITY_SYCL_IMPL STREQUAL ComputeCpp)
set(CELERITY_FEATURE_UNNAMED_KERNELS ON)
else()
Expand Down Expand Up @@ -291,6 +286,10 @@ target_link_libraries(celerity_runtime PUBLIC
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/backend)
target_link_libraries(celerity_runtime PUBLIC celerity_backends)

# Deprecated feature flags
set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ${CELERITY_FEATURE_SCALAR_REDUCTIONS})
set(CELERITY_FEATURE_LOCAL_ACCESSOR ON)

# For debug builds, we set the CELERITY_DETAIL_ENABLE_DEBUG preprocessor flag,
# which allows Celerity to control debug functionality within headers regardless
# of a user target's build type. (This flag is not intended to be modified by
Expand Down
24 changes: 6 additions & 18 deletions docs/reductions.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,22 +45,10 @@ higher-dimensional reduction outputs will only become available once SYCL suppor

### No Broad Support Across SYCL Implementations

Only hipSYCL provides a complete implementation of SYCL 2020 reduction variables at the moment, but
requires [a patch](https://github.com/illuhad/hipSYCL/pull/578). Installing this version of hipSYCL will
enable you to run the `reduction` Celerity example.
Only DPC++ provides a complete implementation of SYCL 2020 reduction variables at the moment,
while hipSYCL requires [a patch](https://github.com/illuhad/hipSYCL/pull/578).
Installing this version of hipSYCL will enable you to run the `reduction` Celerity example.

DPC++ currently implements an incompatible version of reductions from an earlier Intel proposal.
Celerity can partially work around this API difference, but not without limitations:

- Reduction output buffers can only be 1-dimensional
- Calls to `parallel_for` can receive at most one reduction

ComputeCpp does not support reductions at all as of version 2.6.0, so Celerity does not expose them for this backend.

Celerity provides feature-detection macros for reduction support, both in CMake (`ON` or `OFF`) and
as C++ macros (always defined to `0` or `1`):

- `CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS` for (at least) the limited reduction support provided
by DPC++.
- `CELERITY_FEATURE_SCALAR_REDUCTIONS` for the full reduction support provided by a 2020-conformant
SYCL implementation. Implies `CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS`.
Celerity provides the `CELERITY_FEATURE_SCALAR_REDUCTIONS` feature-detection
macro for reduction support, both in CMake (`ON` or `OFF`) and as C++ macros
(always defined to `0` or `1`).
2 changes: 1 addition & 1 deletion examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ add_example(matmul)
add_example(syncing)
add_example(wave_sim)

if(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS)
if(CELERITY_FEATURE_SCALAR_REDUCTIONS)
add_example(reduction)
endif()

Expand Down
2 changes: 1 addition & 1 deletion examples/reduction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.13)
project(syncing LANGUAGES CXX)

find_package(Celerity 0.3.2 REQUIRED)
if (NOT CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS)
if (NOT CELERITY_FEATURE_SCALAR_REDUCTIONS)
message(SEND_ERROR "Your Celerity installation does not support reductions. Skip this example.")
endif ()

Expand Down
15 changes: 5 additions & 10 deletions include/handler.h
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ namespace detail {

template <typename DataT, int Dims, typename BinaryOperation, bool WithExplicitIdentity>
auto make_sycl_reduction(const reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>& d) {
#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
#if !CELERITY_FEATURE_SCALAR_REDUCTIONS
static_assert(detail::constexpr_false<BinaryOperation>, "Reductions are not supported by your SYCL implementation");
#else
cl::sycl::property_list props;
Expand Down Expand Up @@ -546,7 +546,7 @@ namespace detail {

template <bool WithExplicitIdentity, typename DataT, int Dims, typename BinaryOperation>
auto make_reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation op, DataT identity, const cl::sycl::property_list& prop_list) {
#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
#if !CELERITY_FEATURE_SCALAR_REDUCTIONS
static_assert(detail::constexpr_false<BinaryOperation>, "Reductions are not supported by your SYCL implementation");
#else
if(vars.get_range().size() != 1) {
Expand Down Expand Up @@ -599,10 +599,8 @@ void handler::parallel_for_kernel_and_reductions(range<Dims> global_range, id<Di
device_handler.submit_to_sycl([&](cl::sycl::handler& cgh) {
constexpr int sycl_dims = std::max(1, Dims);

if constexpr(!CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) {
if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) {
static_assert(detail::constexpr_false<Kernel>, "Reductions are not supported by your SYCL implementation");
} else if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 1) {
static_assert(detail::constexpr_false<Kernel>, "DPC++ currently does not support more than one reduction variable per kernel");
} else if constexpr(std::is_same_v<KernelFlavor, detail::simple_kernel_flavor>) {
const auto sycl_global_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(chunk_range));
detail::invoke_sycl_parallel_for<KernelName>(cgh, sycl_global_range, detail::make_sycl_reduction(reductions)...,
Expand Down Expand Up @@ -649,12 +647,9 @@ void handler::host_task(range<Dims> global_range, id<Dims> global_offset, Functo

template <typename DataT, int Dims, typename BinaryOperation>
auto reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
#if !CELERITY_FEATURE_SCALAR_REDUCTIONS
static_assert(detail::constexpr_false<BinaryOperation>, "Reductions are not supported by your SYCL implementation");
#else
#if CELERITY_WORKAROUND(DPCPP)
static_assert(Dims == 1, "DPC++ currently does not support reductions to buffers with dimensionality != 1");
#endif
static_assert(cl::sycl::has_known_identity_v<BinaryOperation, DataT>,
"Celerity does not currently support reductions without an identity. Either specialize "
"cl::sycl::known_identity or use the reduction() overload taking an identity at runtime");
Expand All @@ -664,7 +659,7 @@ auto reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation co

template <typename DataT, int Dims, typename BinaryOperation>
auto reduction(const buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
#if !CELERITY_FEATURE_SCALAR_REDUCTIONS
static_assert(detail::constexpr_false<BinaryOperation>, "Reductions are not supported by your SYCL implementation");
#else
static_assert(!cl::sycl::has_known_identity_v<BinaryOperation, DataT>, "Identity is known to SYCL, remove the identity parameter from reduction()");
Expand Down
2 changes: 1 addition & 1 deletion include/sycl_wrappers.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace property {
using cl::sycl::property::no_init;
#endif

#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
namespace reduction {
using cl::sycl::property::reduction::initialize_to_identity;
}
Expand Down
21 changes: 9 additions & 12 deletions test/runtime_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -650,9 +650,8 @@ namespace detail {
});
}

#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS

TEST_CASE_METHOD(test_utils::runtime_fixture, "attempting a reduction on buffers with size != 1 throws", "[task-manager]") {
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
runtime::init(nullptr, nullptr);
auto& tm = runtime::get_instance().get_task_manager();

Expand All @@ -666,8 +665,6 @@ namespace detail {
cgh.parallel_for<class UKN(ok_size_1)>(range<1>{1}, reduction(buf_4, cgh, cl::sycl::plus<float>{}), [=](celerity::item<1>, auto&) {});
}));

#if CELERITY_FEATURE_SCALAR_REDUCTIONS

buffer<float, 2> buf_2{range<2>{1, 2}};
CHECK_THROWS(tm.submit_command_group([&](handler& cgh) { //
cgh.parallel_for<class UKN(wrong_size_2)>(range<2>{1, 1}, reduction(buf_2, cgh, cl::sycl::plus<float>{}), [=](celerity::item<2>, auto&) {});
Expand All @@ -687,11 +684,11 @@ namespace detail {
CHECK_NOTHROW(tm.submit_command_group([&](handler& cgh) { //
cgh.parallel_for<class UKN(ok_size_3)>(range<3>{1, 1, 1}, reduction(buf_6, cgh, cl::sycl::plus<float>{}), [=](celerity::item<3>, auto&) {});
}));
#else
SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
#endif
}

#endif

TEST_CASE_METHOD(test_utils::runtime_fixture, "handler::parallel_for accepts nd_range", "[handler]") {
distr_queue q;

Expand Down Expand Up @@ -760,19 +757,19 @@ namespace detail {
});
}

#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS

TEST_CASE_METHOD(test_utils::runtime_fixture, "reductions can be passed into nd_range kernels", "[handler]") {
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
// Note: We assume a local range size of 16 here, this should be supported by most devices.

buffer<int, 1> b{range<1>{1}};
distr_queue{}.submit([=](handler& cgh) {
cgh.parallel_for<class UKN(kernel)>(celerity::nd_range{range<2>{8, 8}, range<2>{4, 4}}, reduction(b, cgh, cl::sycl::plus<>{}),
[](nd_item<2> item, auto& sum) { sum += item.get_global_linear_id(); });
});
}

#else
SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
#endif
}

#if CELERITY_FEATURE_UNNAMED_KERNELS

Expand All @@ -784,7 +781,7 @@ namespace detail {
// without name
q.submit([](handler& cgh) { cgh.parallel_for(range<1>{64}, [](item<1> item) {}); });
q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [](nd_item<1> item) {}); });
#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
buffer<int> b{{1}};
q.submit([=](handler& cgh) {
cgh.parallel_for(
Expand All @@ -799,7 +796,7 @@ namespace detail {
// with name
q.submit([=](handler& cgh) { cgh.parallel_for<class UKN(simple_kernel_with_name)>(range<1>{64}, [=](item<1> item) {}); });
q.submit([=](handler& cgh) { cgh.parallel_for<class UKN(nd_range_kernel_with_name)>(celerity::nd_range<1>{64, 32}, [=](nd_item<1> item) {}); });
#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
q.submit([=](handler& cgh) {
cgh.parallel_for<class UKN(simple_kernel_with_name_and_reductions)>(
range<1>{64}, reduction(b, cgh, cl::sycl::plus<int>{}), [=](item<1> item, auto& r) { r += static_cast<int>(item.get_linear_id()); });
Expand Down
8 changes: 4 additions & 4 deletions test/sycl_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,9 @@ static auto make_device_accessor(sycl::buffer<int, 1>& buf, sycl::handler& cgh,
}
}

#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS

// If this test fails, celerity can't reliably support reductions on the user's combination of backend and hardware
TEST_CASE_METHOD(test_utils::device_queue_fixture, "SYCL has working simple scalar reductions", "[sycl][reductions]") {
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
const size_t N = GENERATE(64, 512, 1024, 4096);
CAPTURE(N);

Expand All @@ -44,9 +43,10 @@ TEST_CASE_METHOD(test_utils::device_queue_fixture, "SYCL has working simple scal

sycl::host_accessor acc{buf};
CHECK(static_cast<size_t>(acc[0]) == N);
}

#else
SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
#endif
}

TEST_CASE("SYCL implements by-value equality-comparison of device information", "[sycl][device-selection][!mayfail]") {
constexpr static auto get_devices = [] {
Expand Down
36 changes: 20 additions & 16 deletions test/system/distr_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,32 +15,20 @@
namespace celerity {
namespace detail {

#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS

template <typename T>
struct unknown_identity_maximum {
T operator()(T a, T b) const { return a < b ? b : a; }
};

TEST_CASE_METHOD(test_utils::runtime_fixture, "simple reductions produce the expected results", "[reductions]") {
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
size_t N = 1000;
buffer<size_t, 1> sum_buf{{1}};
buffer<size_t, 1> max_buf{{1}};

distr_queue q;
const auto initialize_to_identity = cl::sycl::property::reduction::initialize_to_identity{};

#if !CELERITY_FEATURE_SCALAR_REDUCTIONS // DPC++ can handle at most 1 reduction variable per kernel
q.submit([=](handler& cgh) {
auto sum_r = reduction(sum_buf, cgh, cl::sycl::plus<size_t>{}, initialize_to_identity);
cgh.parallel_for<class UKN(kernel)>(range{N}, id{1}, sum_r, [=](celerity::item<1> item, auto& sum) { sum += item.get_id(0); });
});

q.submit([=](handler& cgh) {
auto max_r = reduction(max_buf, cgh, size_t{0}, unknown_identity_maximum<size_t>{}, initialize_to_identity);
cgh.parallel_for<class UKN(kernel)>(range{N}, id{1}, max_r, [=](celerity::item<1> item, auto& max) { max.combine(item.get_id(0)); });
});
#else
q.submit([=](handler& cgh) {
auto sum_r = reduction(sum_buf, cgh, cl::sycl::plus<size_t>{}, initialize_to_identity);
auto max_r = reduction(max_buf, cgh, size_t{0}, unknown_identity_maximum<size_t>{}, initialize_to_identity);
Expand All @@ -49,7 +37,6 @@ namespace detail {
max.combine(item.get_id(0));
});
});
#endif

q.submit([=](handler& cgh) {
accessor sum_acc{sum_buf, cgh, celerity::access::all{}, celerity::read_only_host_task};
Expand All @@ -59,11 +46,15 @@ namespace detail {
CHECK(max_acc[0] == N);
});
});
#else
SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
#endif
}

// Regression test: The host -> device transfer previously caused an illegal nested sycl::queue::submit call which deadlocks
// Distributed test, since the single-node case optimizes the reduction command away
TEST_CASE_METHOD(test_utils::runtime_fixture, "reduction commands perform host -> device transfers if necessary", "[reductions]") {
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
distr_queue q;

REQUIRE(runtime::get_instance().get_num_nodes() > 1);
Expand All @@ -80,9 +71,13 @@ namespace detail {
accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task};
cgh.host_task(on_master_node, [=] { CHECK(acc[0] == N + init); });
});
#else
SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
#endif
}

TEST_CASE_METHOD(test_utils::runtime_fixture, "multiple chained reductions produce correct results", "[reductions]") {
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
distr_queue q;

const int N = 1000;
Expand All @@ -102,10 +97,14 @@ namespace detail {
accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task};
cgh.host_task(on_master_node, [=] { CHECK(acc[0] == 3 * N); });
});
#else
SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
#endif
}

TEST_CASE_METHOD(
test_utils::runtime_fixture, "subsequently requiring reduction results on different subsets of nodes produces correct data flow", "[reductions]") {
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
distr_queue q;

const int N = 1000;
Expand All @@ -130,10 +129,14 @@ namespace detail {
CHECK(acc[0] == expected);
});
});
#else
SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
#endif
}

TEST_CASE_METHOD(
test_utils::runtime_fixture, "runtime-shutdown graph printing works in the presence of a finished reduction", "[reductions][print_graph][smoke-test]") {
#if CELERITY_FEATURE_SCALAR_REDUCTIONS
// init runtime early so the distr_queue ctor doesn't override the log level set by log_capture
runtime::init(nullptr, nullptr);
const bool is_master_node = runtime::get_instance().is_master_node();
Expand Down Expand Up @@ -161,10 +164,11 @@ namespace detail {
CHECK_THAT(log, ContainsSubstring("(R1) <b>await push</b> from N1"));
CHECK_THAT(log, ContainsSubstring("<b>reduction</b> R1<br/> B0 {[[0,0,0] - [1,1,1]]}"));
}
#else
SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
#endif
}

#endif // CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS

template <int Dims>
class kernel_name_nd_geometry;

Expand Down
2 changes: 2 additions & 0 deletions test/test_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@
*/
#define REQUIRE_LOOP(...) CELERITY_DETAIL_REQUIRE_LOOP(__VA_ARGS__)

#define SKIP_BECAUSE_NO_SCALAR_REDUCTIONS SKIP("CELERITY_FEATURE_SCALAR_REDUCTIONS == 0");

namespace celerity {
namespace detail {

Expand Down

0 comments on commit efff21b

Please sign in to comment.