diff --git a/CMakeLists.txt b/CMakeLists.txt index 777c246ee..cd3c94b39 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -134,17 +134,12 @@ endif() if(CELERITY_SYCL_IMPL STREQUAL hipSYCL AND HIPSYCL_SUPPORTS_SYCL_2020_REDUCTIONS) set(CELERITY_FEATURE_SCALAR_REDUCTIONS ON) - set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ON) elseif(CELERITY_SYCL_IMPL STREQUAL "DPC++") - set(CELERITY_FEATURE_SCALAR_REDUCTIONS OFF) - set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ON) + set(CELERITY_FEATURE_SCALAR_REDUCTIONS ON) else() set(CELERITY_FEATURE_SCALAR_REDUCTIONS OFF) - set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS OFF) endif() -set(CELERITY_FEATURE_LOCAL_ACCESSOR ON) - if(NOT CELERITY_SYCL_IMPL STREQUAL ComputeCpp) set(CELERITY_FEATURE_UNNAMED_KERNELS ON) else() @@ -291,6 +286,10 @@ target_link_libraries(celerity_runtime PUBLIC add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/backend) target_link_libraries(celerity_runtime PUBLIC celerity_backends) +# Deprecated feature flags +set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ${CELERITY_FEATURE_SCALAR_REDUCTIONS}) +set(CELERITY_FEATURE_LOCAL_ACCESSOR ON) + # For debug builds, we set the CELERITY_DETAIL_ENABLE_DEBUG preprocessor flag, # which allows Celerity to control debug functionality within headers regardless # of a user target's build type. (This flag is not intended to be modified by diff --git a/docs/reductions.md b/docs/reductions.md index b2a5a0e1c..0429f0cc2 100644 --- a/docs/reductions.md +++ b/docs/reductions.md @@ -45,22 +45,10 @@ higher-dimensional reduction outputs will only become available once SYCL suppor ### No Broad Support Across SYCL Implementations -Only hipSYCL provides a complete implementation of SYCL 2020 reduction variables at the moment, but -requires [a patch](https://github.com/illuhad/hipSYCL/pull/578). Installing this version of hipSYCL will -enable you to run the `reduction` Celerity example. +Only DPC++ provides a complete implementation of SYCL 2020 reduction variables at the moment, +while hipSYCL requires [a patch](https://github.com/illuhad/hipSYCL/pull/578). +Installing this version of hipSYCL will enable you to run the `reduction` Celerity example. -DPC++ currently implements an incompatible version of reductions from an earlier Intel proposal. -Celerity can partially work around this API difference, but not without limitations: - -- Reduction output buffers can only be 1-dimensional -- Calls to `parallel_for` can receive at most one reduction - -ComputeCpp does not support reductions at all as of version 2.6.0, so Celerity does not expose them for this backend. - -Celerity provides feature-detection macros for reduction support, both in CMake (`ON` or `OFF`) and -as C++ macros (always defined to `0` or `1`): - -- `CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS` for (at least) the limited reduction support provided - by DPC++. -- `CELERITY_FEATURE_SCALAR_REDUCTIONS` for the full reduction support provided by a 2020-conformant - SYCL implementation. Implies `CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS`. +Celerity provides the `CELERITY_FEATURE_SCALAR_REDUCTIONS` feature-detection +macro for reduction support, both in CMake (`ON` or `OFF`) and as C++ macros +(always defined to `0` or `1`). diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 5a94fb40d..b79c2761a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -27,7 +27,7 @@ add_example(matmul) add_example(syncing) add_example(wave_sim) -if(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS) +if(CELERITY_FEATURE_SCALAR_REDUCTIONS) add_example(reduction) endif() diff --git a/examples/reduction/CMakeLists.txt b/examples/reduction/CMakeLists.txt index e3692d110..33a929c49 100644 --- a/examples/reduction/CMakeLists.txt +++ b/examples/reduction/CMakeLists.txt @@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.13) project(syncing LANGUAGES CXX) find_package(Celerity 0.3.2 REQUIRED) -if (NOT CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS) +if (NOT CELERITY_FEATURE_SCALAR_REDUCTIONS) message(SEND_ERROR "Your Celerity installation does not support reductions. Skip this example.") endif () diff --git a/include/handler.h b/include/handler.h index 048e61f93..2ed53e644 100644 --- a/include/handler.h +++ b/include/handler.h @@ -500,7 +500,7 @@ namespace detail { template auto make_sycl_reduction(const reduction_descriptor& d) { -#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS +#if !CELERITY_FEATURE_SCALAR_REDUCTIONS static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); #else cl::sycl::property_list props; @@ -546,7 +546,7 @@ namespace detail { template auto make_reduction(const buffer& vars, handler& cgh, BinaryOperation op, DataT identity, const cl::sycl::property_list& prop_list) { -#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS +#if !CELERITY_FEATURE_SCALAR_REDUCTIONS static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); #else if(vars.get_range().size() != 1) { @@ -599,10 +599,8 @@ void handler::parallel_for_kernel_and_reductions(range global_range, id 0) { + if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) { static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); - } else if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 1) { - static_assert(detail::constexpr_false, "DPC++ currently does not support more than one reduction variable per kernel"); } else if constexpr(std::is_same_v) { const auto sycl_global_range = sycl::range(detail::range_cast(chunk_range)); detail::invoke_sycl_parallel_for(cgh, sycl_global_range, detail::make_sycl_reduction(reductions)..., @@ -649,12 +647,9 @@ void handler::host_task(range global_range, id global_offset, Functo template auto reduction(const buffer& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { -#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS +#if !CELERITY_FEATURE_SCALAR_REDUCTIONS static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); #else -#if CELERITY_WORKAROUND(DPCPP) - static_assert(Dims == 1, "DPC++ currently does not support reductions to buffers with dimensionality != 1"); -#endif static_assert(cl::sycl::has_known_identity_v, "Celerity does not currently support reductions without an identity. Either specialize " "cl::sycl::known_identity or use the reduction() overload taking an identity at runtime"); @@ -664,7 +659,7 @@ auto reduction(const buffer& vars, handler& cgh, BinaryOperation co template auto reduction(const buffer& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) { -#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS +#if !CELERITY_FEATURE_SCALAR_REDUCTIONS static_assert(detail::constexpr_false, "Reductions are not supported by your SYCL implementation"); #else static_assert(!cl::sycl::has_known_identity_v, "Identity is known to SYCL, remove the identity parameter from reduction()"); diff --git a/include/sycl_wrappers.h b/include/sycl_wrappers.h index 05c34f62f..d56c407f5 100644 --- a/include/sycl_wrappers.h +++ b/include/sycl_wrappers.h @@ -44,7 +44,7 @@ namespace property { using cl::sycl::property::no_init; #endif -#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS +#if CELERITY_FEATURE_SCALAR_REDUCTIONS namespace reduction { using cl::sycl::property::reduction::initialize_to_identity; } diff --git a/test/runtime_tests.cc b/test/runtime_tests.cc index 7e5b11089..f8d653c0b 100644 --- a/test/runtime_tests.cc +++ b/test/runtime_tests.cc @@ -650,9 +650,8 @@ namespace detail { }); } -#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS - TEST_CASE_METHOD(test_utils::runtime_fixture, "attempting a reduction on buffers with size != 1 throws", "[task-manager]") { +#if CELERITY_FEATURE_SCALAR_REDUCTIONS runtime::init(nullptr, nullptr); auto& tm = runtime::get_instance().get_task_manager(); @@ -666,8 +665,6 @@ namespace detail { cgh.parallel_for(range<1>{1}, reduction(buf_4, cgh, cl::sycl::plus{}), [=](celerity::item<1>, auto&) {}); })); -#if CELERITY_FEATURE_SCALAR_REDUCTIONS - buffer buf_2{range<2>{1, 2}}; CHECK_THROWS(tm.submit_command_group([&](handler& cgh) { // cgh.parallel_for(range<2>{1, 1}, reduction(buf_2, cgh, cl::sycl::plus{}), [=](celerity::item<2>, auto&) {}); @@ -687,11 +684,11 @@ namespace detail { CHECK_NOTHROW(tm.submit_command_group([&](handler& cgh) { // cgh.parallel_for(range<3>{1, 1, 1}, reduction(buf_6, cgh, cl::sycl::plus{}), [=](celerity::item<3>, auto&) {}); })); +#else + SKIP_BECAUSE_NO_SCALAR_REDUCTIONS #endif } -#endif - TEST_CASE_METHOD(test_utils::runtime_fixture, "handler::parallel_for accepts nd_range", "[handler]") { distr_queue q; @@ -760,9 +757,8 @@ namespace detail { }); } -#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS - TEST_CASE_METHOD(test_utils::runtime_fixture, "reductions can be passed into nd_range kernels", "[handler]") { +#if CELERITY_FEATURE_SCALAR_REDUCTIONS // Note: We assume a local range size of 16 here, this should be supported by most devices. buffer b{range<1>{1}}; @@ -770,9 +766,10 @@ namespace detail { cgh.parallel_for(celerity::nd_range{range<2>{8, 8}, range<2>{4, 4}}, reduction(b, cgh, cl::sycl::plus<>{}), [](nd_item<2> item, auto& sum) { sum += item.get_global_linear_id(); }); }); - } - +#else + SKIP_BECAUSE_NO_SCALAR_REDUCTIONS #endif + } #if CELERITY_FEATURE_UNNAMED_KERNELS @@ -784,7 +781,7 @@ namespace detail { // without name q.submit([](handler& cgh) { cgh.parallel_for(range<1>{64}, [](item<1> item) {}); }); q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [](nd_item<1> item) {}); }); -#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS +#if CELERITY_FEATURE_SCALAR_REDUCTIONS buffer b{{1}}; q.submit([=](handler& cgh) { cgh.parallel_for( @@ -799,7 +796,7 @@ namespace detail { // with name q.submit([=](handler& cgh) { cgh.parallel_for(range<1>{64}, [=](item<1> item) {}); }); q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [=](nd_item<1> item) {}); }); -#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS +#if CELERITY_FEATURE_SCALAR_REDUCTIONS q.submit([=](handler& cgh) { cgh.parallel_for( range<1>{64}, reduction(b, cgh, cl::sycl::plus{}), [=](item<1> item, auto& r) { r += static_cast(item.get_linear_id()); }); diff --git a/test/sycl_tests.cc b/test/sycl_tests.cc index d72336104..bc0a8a376 100644 --- a/test/sycl_tests.cc +++ b/test/sycl_tests.cc @@ -28,10 +28,9 @@ static auto make_device_accessor(sycl::buffer& buf, sycl::handler& cgh, } } -#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS - // If this test fails, celerity can't reliably support reductions on the user's combination of backend and hardware TEST_CASE_METHOD(test_utils::device_queue_fixture, "SYCL has working simple scalar reductions", "[sycl][reductions]") { +#if CELERITY_FEATURE_SCALAR_REDUCTIONS const size_t N = GENERATE(64, 512, 1024, 4096); CAPTURE(N); @@ -44,9 +43,10 @@ TEST_CASE_METHOD(test_utils::device_queue_fixture, "SYCL has working simple scal sycl::host_accessor acc{buf}; CHECK(static_cast(acc[0]) == N); -} - +#else + SKIP_BECAUSE_NO_SCALAR_REDUCTIONS #endif +} TEST_CASE("SYCL implements by-value equality-comparison of device information", "[sycl][device-selection][!mayfail]") { constexpr static auto get_devices = [] { diff --git a/test/system/distr_tests.cc b/test/system/distr_tests.cc index 5d8508d72..f38bd8b68 100644 --- a/test/system/distr_tests.cc +++ b/test/system/distr_tests.cc @@ -15,14 +15,13 @@ namespace celerity { namespace detail { -#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS - template struct unknown_identity_maximum { T operator()(T a, T b) const { return a < b ? b : a; } }; TEST_CASE_METHOD(test_utils::runtime_fixture, "simple reductions produce the expected results", "[reductions]") { +#if CELERITY_FEATURE_SCALAR_REDUCTIONS size_t N = 1000; buffer sum_buf{{1}}; buffer max_buf{{1}}; @@ -30,17 +29,6 @@ namespace detail { distr_queue q; const auto initialize_to_identity = cl::sycl::property::reduction::initialize_to_identity{}; -#if !CELERITY_FEATURE_SCALAR_REDUCTIONS // DPC++ can handle at most 1 reduction variable per kernel - q.submit([=](handler& cgh) { - auto sum_r = reduction(sum_buf, cgh, cl::sycl::plus{}, initialize_to_identity); - cgh.parallel_for(range{N}, id{1}, sum_r, [=](celerity::item<1> item, auto& sum) { sum += item.get_id(0); }); - }); - - q.submit([=](handler& cgh) { - auto max_r = reduction(max_buf, cgh, size_t{0}, unknown_identity_maximum{}, initialize_to_identity); - cgh.parallel_for(range{N}, id{1}, max_r, [=](celerity::item<1> item, auto& max) { max.combine(item.get_id(0)); }); - }); -#else q.submit([=](handler& cgh) { auto sum_r = reduction(sum_buf, cgh, cl::sycl::plus{}, initialize_to_identity); auto max_r = reduction(max_buf, cgh, size_t{0}, unknown_identity_maximum{}, initialize_to_identity); @@ -49,7 +37,6 @@ namespace detail { max.combine(item.get_id(0)); }); }); -#endif q.submit([=](handler& cgh) { accessor sum_acc{sum_buf, cgh, celerity::access::all{}, celerity::read_only_host_task}; @@ -59,11 +46,15 @@ namespace detail { CHECK(max_acc[0] == N); }); }); +#else + SKIP_BECAUSE_NO_SCALAR_REDUCTIONS +#endif } // Regression test: The host -> device transfer previously caused an illegal nested sycl::queue::submit call which deadlocks // Distributed test, since the single-node case optimizes the reduction command away TEST_CASE_METHOD(test_utils::runtime_fixture, "reduction commands perform host -> device transfers if necessary", "[reductions]") { +#if CELERITY_FEATURE_SCALAR_REDUCTIONS distr_queue q; REQUIRE(runtime::get_instance().get_num_nodes() > 1); @@ -80,9 +71,13 @@ namespace detail { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { CHECK(acc[0] == N + init); }); }); +#else + SKIP_BECAUSE_NO_SCALAR_REDUCTIONS +#endif } TEST_CASE_METHOD(test_utils::runtime_fixture, "multiple chained reductions produce correct results", "[reductions]") { +#if CELERITY_FEATURE_SCALAR_REDUCTIONS distr_queue q; const int N = 1000; @@ -102,10 +97,14 @@ namespace detail { accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task}; cgh.host_task(on_master_node, [=] { CHECK(acc[0] == 3 * N); }); }); +#else + SKIP_BECAUSE_NO_SCALAR_REDUCTIONS +#endif } TEST_CASE_METHOD( test_utils::runtime_fixture, "subsequently requiring reduction results on different subsets of nodes produces correct data flow", "[reductions]") { +#if CELERITY_FEATURE_SCALAR_REDUCTIONS distr_queue q; const int N = 1000; @@ -130,10 +129,14 @@ namespace detail { CHECK(acc[0] == expected); }); }); +#else + SKIP_BECAUSE_NO_SCALAR_REDUCTIONS +#endif } TEST_CASE_METHOD( test_utils::runtime_fixture, "runtime-shutdown graph printing works in the presence of a finished reduction", "[reductions][print_graph][smoke-test]") { +#if CELERITY_FEATURE_SCALAR_REDUCTIONS // init runtime early so the distr_queue ctor doesn't override the log level set by log_capture runtime::init(nullptr, nullptr); const bool is_master_node = runtime::get_instance().is_master_node(); @@ -161,10 +164,11 @@ namespace detail { CHECK_THAT(log, ContainsSubstring("(R1) await push from N1")); CHECK_THAT(log, ContainsSubstring("reduction R1
B0 {[[0,0,0] - [1,1,1]]}")); } +#else + SKIP_BECAUSE_NO_SCALAR_REDUCTIONS +#endif } -#endif // CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS - template class kernel_name_nd_geometry; diff --git a/test/test_utils.h b/test/test_utils.h index 3eec78906..0ebdff599 100644 --- a/test/test_utils.h +++ b/test/test_utils.h @@ -43,6 +43,8 @@ */ #define REQUIRE_LOOP(...) CELERITY_DETAIL_REQUIRE_LOOP(__VA_ARGS__) +#define SKIP_BECAUSE_NO_SCALAR_REDUCTIONS SKIP("CELERITY_FEATURE_SCALAR_REDUCTIONS == 0"); + namespace celerity { namespace detail {