Lift restrictions on reductions for DPC++

DPC++ now supports passing multiple reductions into a kernel. The restriction on 1-dimensional buffers can also be lifted as we now use pointer-based reductions internally.
celerity · May 23, 2023 · efff21b · efff21b
1 parent 0a96d15
commit efff21b
Show file tree

Hide file tree

Showing 10 changed files with 54 additions and 69 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -134,17 +134,12 @@ endif()
 
 if(CELERITY_SYCL_IMPL STREQUAL hipSYCL AND HIPSYCL_SUPPORTS_SYCL_2020_REDUCTIONS)
   set(CELERITY_FEATURE_SCALAR_REDUCTIONS ON)
-  set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ON)
 elseif(CELERITY_SYCL_IMPL STREQUAL "DPC++")
-  set(CELERITY_FEATURE_SCALAR_REDUCTIONS OFF)
-  set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ON)
+  set(CELERITY_FEATURE_SCALAR_REDUCTIONS ON)
 else()
   set(CELERITY_FEATURE_SCALAR_REDUCTIONS OFF)
-  set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS OFF)
 endif()
 
-set(CELERITY_FEATURE_LOCAL_ACCESSOR ON)
-
 if(NOT CELERITY_SYCL_IMPL STREQUAL ComputeCpp)
   set(CELERITY_FEATURE_UNNAMED_KERNELS ON)
 else()
@@ -291,6 +286,10 @@ target_link_libraries(celerity_runtime PUBLIC
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/src/backend)
 target_link_libraries(celerity_runtime PUBLIC celerity_backends)
 
+# Deprecated feature flags
+set(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS ${CELERITY_FEATURE_SCALAR_REDUCTIONS})
+set(CELERITY_FEATURE_LOCAL_ACCESSOR ON)
+
 # For debug builds, we set the CELERITY_DETAIL_ENABLE_DEBUG preprocessor flag,
 # which allows Celerity to control debug functionality within headers regardless
 # of a user target's build type. (This flag is not intended to be modified by

diff --git a/docs/reductions.md b/docs/reductions.md
@@ -45,22 +45,10 @@ higher-dimensional reduction outputs will only become available once SYCL suppor
 
 ### No Broad Support Across SYCL Implementations
 
-Only hipSYCL provides a complete implementation of SYCL 2020 reduction variables at the moment, but
-requires [a patch](https://github.com/illuhad/hipSYCL/pull/578). Installing this version of hipSYCL will
-enable you to run the `reduction` Celerity example.
+Only DPC++ provides a complete implementation of SYCL 2020 reduction variables at the moment,
+while hipSYCL requires [a patch](https://github.com/illuhad/hipSYCL/pull/578).
+Installing this version of hipSYCL will enable you to run the `reduction` Celerity example.
 
-DPC++ currently implements an incompatible version of reductions from an earlier Intel proposal.
-Celerity can partially work around this API difference, but not without limitations:
-
-- Reduction output buffers can only be 1-dimensional
-- Calls to `parallel_for` can receive at most one reduction
-
-ComputeCpp does not support reductions at all as of version 2.6.0, so Celerity does not expose them for this backend.
-
-Celerity provides feature-detection macros for reduction support, both in CMake (`ON` or `OFF`) and
-as C++ macros (always defined to `0` or `1`):
-
-- `CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS` for (at least) the limited reduction support provided
-  by DPC++.
-- `CELERITY_FEATURE_SCALAR_REDUCTIONS` for the full reduction support provided by a 2020-conformant
-  SYCL implementation. Implies `CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS`.
+Celerity provides the `CELERITY_FEATURE_SCALAR_REDUCTIONS` feature-detection
+macro for reduction support, both in CMake (`ON` or `OFF`) and as C++ macros
+(always defined to `0` or `1`).
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
@@ -27,7 +27,7 @@ add_example(matmul)
 add_example(syncing)
 add_example(wave_sim)
 
-if(CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS)
+if(CELERITY_FEATURE_SCALAR_REDUCTIONS)
   add_example(reduction)
 endif()
 

diff --git a/examples/reduction/CMakeLists.txt b/examples/reduction/CMakeLists.txt
@@ -2,7 +2,7 @@ cmake_minimum_required(VERSION 3.13)
 project(syncing LANGUAGES CXX)
 
 find_package(Celerity 0.3.2 REQUIRED)
-if (NOT CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS)
+if (NOT CELERITY_FEATURE_SCALAR_REDUCTIONS)
   message(SEND_ERROR "Your Celerity installation does not support reductions. Skip this example.")
 endif ()
 

diff --git a/include/handler.h b/include/handler.h
@@ -500,7 +500,7 @@ namespace detail {
 
 	template <typename DataT, int Dims, typename BinaryOperation, bool WithExplicitIdentity>
 	auto make_sycl_reduction(const reduction_descriptor<DataT, Dims, BinaryOperation, WithExplicitIdentity>& d) {
-#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
+#if !CELERITY_FEATURE_SCALAR_REDUCTIONS
 		static_assert(detail::constexpr_false<BinaryOperation>, "Reductions are not supported by your SYCL implementation");
 #else
 		cl::sycl::property_list props;
@@ -546,7 +546,7 @@ namespace detail {
 
 	template <bool WithExplicitIdentity, typename DataT, int Dims, typename BinaryOperation>
 	auto make_reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation op, DataT identity, const cl::sycl::property_list& prop_list) {
-#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
+#if !CELERITY_FEATURE_SCALAR_REDUCTIONS
 		static_assert(detail::constexpr_false<BinaryOperation>, "Reductions are not supported by your SYCL implementation");
 #else
 		if(vars.get_range().size() != 1) {
@@ -599,10 +599,8 @@ void handler::parallel_for_kernel_and_reductions(range<Dims> global_range, id<Di
 	device_handler.submit_to_sycl([&](cl::sycl::handler& cgh) {
 		constexpr int sycl_dims = std::max(1, Dims);
 
-		if constexpr(!CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) {
+		if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 0) {
 			static_assert(detail::constexpr_false<Kernel>, "Reductions are not supported by your SYCL implementation");
-		} else if constexpr(!CELERITY_FEATURE_SCALAR_REDUCTIONS && sizeof...(reductions) > 1) {
-			static_assert(detail::constexpr_false<Kernel>, "DPC++ currently does not support more than one reduction variable per kernel");
 		} else if constexpr(std::is_same_v<KernelFlavor, detail::simple_kernel_flavor>) {
 			const auto sycl_global_range = sycl::range<sycl_dims>(detail::range_cast<sycl_dims>(chunk_range));
 			detail::invoke_sycl_parallel_for<KernelName>(cgh, sycl_global_range, detail::make_sycl_reduction(reductions)...,
@@ -649,12 +647,9 @@ void handler::host_task(range<Dims> global_range, id<Dims> global_offset, Functo
 
 template <typename DataT, int Dims, typename BinaryOperation>
 auto reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
-#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
+#if !CELERITY_FEATURE_SCALAR_REDUCTIONS
 	static_assert(detail::constexpr_false<BinaryOperation>, "Reductions are not supported by your SYCL implementation");
 #else
-#if CELERITY_WORKAROUND(DPCPP)
-	static_assert(Dims == 1, "DPC++ currently does not support reductions to buffers with dimensionality != 1");
-#endif
 	static_assert(cl::sycl::has_known_identity_v<BinaryOperation, DataT>,
 	    "Celerity does not currently support reductions without an identity. Either specialize "
 	    "cl::sycl::known_identity or use the reduction() overload taking an identity at runtime");
@@ -664,7 +659,7 @@ auto reduction(const buffer<DataT, Dims>& vars, handler& cgh, BinaryOperation co
 
 template <typename DataT, int Dims, typename BinaryOperation>
 auto reduction(const buffer<DataT, Dims>& vars, handler& cgh, const DataT identity, BinaryOperation combiner, const cl::sycl::property_list& prop_list = {}) {
-#if !CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
+#if !CELERITY_FEATURE_SCALAR_REDUCTIONS
 	static_assert(detail::constexpr_false<BinaryOperation>, "Reductions are not supported by your SYCL implementation");
 #else
 	static_assert(!cl::sycl::has_known_identity_v<BinaryOperation, DataT>, "Identity is known to SYCL, remove the identity parameter from reduction()");

diff --git a/include/sycl_wrappers.h b/include/sycl_wrappers.h
@@ -44,7 +44,7 @@ namespace property {
 	using cl::sycl::property::no_init;
 #endif
 
-#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 	namespace reduction {
 		using cl::sycl::property::reduction::initialize_to_identity;
 	}

diff --git a/test/runtime_tests.cc b/test/runtime_tests.cc
@@ -650,9 +650,8 @@ namespace detail {
 		});
 	}
 
-#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
-
 	TEST_CASE_METHOD(test_utils::runtime_fixture, "attempting a reduction on buffers with size != 1 throws", "[task-manager]") {
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		runtime::init(nullptr, nullptr);
 		auto& tm = runtime::get_instance().get_task_manager();
 
@@ -666,8 +665,6 @@ namespace detail {
 			cgh.parallel_for<class UKN(ok_size_1)>(range<1>{1}, reduction(buf_4, cgh, cl::sycl::plus<float>{}), [=](celerity::item<1>, auto&) {});
 		}));
 
-#if CELERITY_FEATURE_SCALAR_REDUCTIONS
-
 		buffer<float, 2> buf_2{range<2>{1, 2}};
 		CHECK_THROWS(tm.submit_command_group([&](handler& cgh) { //
 			cgh.parallel_for<class UKN(wrong_size_2)>(range<2>{1, 1}, reduction(buf_2, cgh, cl::sycl::plus<float>{}), [=](celerity::item<2>, auto&) {});
@@ -687,11 +684,11 @@ namespace detail {
 		CHECK_NOTHROW(tm.submit_command_group([&](handler& cgh) { //
 			cgh.parallel_for<class UKN(ok_size_3)>(range<3>{1, 1, 1}, reduction(buf_6, cgh, cl::sycl::plus<float>{}), [=](celerity::item<3>, auto&) {});
 		}));
+#else
+		SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
 #endif
 	}
 
-#endif
-
 	TEST_CASE_METHOD(test_utils::runtime_fixture, "handler::parallel_for accepts nd_range", "[handler]") {
 		distr_queue q;
 
@@ -760,19 +757,19 @@ namespace detail {
 		});
 	}
 
-#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
-
 	TEST_CASE_METHOD(test_utils::runtime_fixture, "reductions can be passed into nd_range kernels", "[handler]") {
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		// Note: We assume a local range size of 16 here, this should be supported by most devices.
 
 		buffer<int, 1> b{range<1>{1}};
 		distr_queue{}.submit([=](handler& cgh) {
 			cgh.parallel_for<class UKN(kernel)>(celerity::nd_range{range<2>{8, 8}, range<2>{4, 4}}, reduction(b, cgh, cl::sycl::plus<>{}),
 			    [](nd_item<2> item, auto& sum) { sum += item.get_global_linear_id(); });
 		});
-	}
-
+#else
+		SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
 #endif
+	}
 
 #if CELERITY_FEATURE_UNNAMED_KERNELS
 
@@ -784,7 +781,7 @@ namespace detail {
 		// without name
 		q.submit([](handler& cgh) { cgh.parallel_for(range<1>{64}, [](item<1> item) {}); });
 		q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [](nd_item<1> item) {}); });
-#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		buffer<int> b{{1}};
 		q.submit([=](handler& cgh) {
 			cgh.parallel_for(
@@ -799,7 +796,7 @@ namespace detail {
 		// with name
 		q.submit([=](handler& cgh) { cgh.parallel_for<class UKN(simple_kernel_with_name)>(range<1>{64}, [=](item<1> item) {}); });
 		q.submit([=](handler& cgh) { cgh.parallel_for<class UKN(nd_range_kernel_with_name)>(celerity::nd_range<1>{64, 32}, [=](nd_item<1> item) {}); });
-#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		q.submit([=](handler& cgh) {
 			cgh.parallel_for<class UKN(simple_kernel_with_name_and_reductions)>(
 			    range<1>{64}, reduction(b, cgh, cl::sycl::plus<int>{}), [=](item<1> item, auto& r) { r += static_cast<int>(item.get_linear_id()); });

diff --git a/test/sycl_tests.cc b/test/sycl_tests.cc
@@ -28,10 +28,9 @@ static auto make_device_accessor(sycl::buffer<int, 1>& buf, sycl::handler& cgh,
 	}
 }
 
-#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
-
 // If this test fails, celerity can't reliably support reductions on the user's combination of backend and hardware
 TEST_CASE_METHOD(test_utils::device_queue_fixture, "SYCL has working simple scalar reductions", "[sycl][reductions]") {
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 	const size_t N = GENERATE(64, 512, 1024, 4096);
 	CAPTURE(N);
 
@@ -44,9 +43,10 @@ TEST_CASE_METHOD(test_utils::device_queue_fixture, "SYCL has working simple scal
 
 	sycl::host_accessor acc{buf};
 	CHECK(static_cast<size_t>(acc[0]) == N);
-}
-
+#else
+	SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
 #endif
+}
 
 TEST_CASE("SYCL implements by-value equality-comparison of device information", "[sycl][device-selection][!mayfail]") {
 	constexpr static auto get_devices = [] {

diff --git a/test/system/distr_tests.cc b/test/system/distr_tests.cc
@@ -15,32 +15,20 @@
 namespace celerity {
 namespace detail {
 
-#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
-
 	template <typename T>
 	struct unknown_identity_maximum {
 		T operator()(T a, T b) const { return a < b ? b : a; }
 	};
 
 	TEST_CASE_METHOD(test_utils::runtime_fixture, "simple reductions produce the expected results", "[reductions]") {
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		size_t N = 1000;
 		buffer<size_t, 1> sum_buf{{1}};
 		buffer<size_t, 1> max_buf{{1}};
 
 		distr_queue q;
 		const auto initialize_to_identity = cl::sycl::property::reduction::initialize_to_identity{};
 
-#if !CELERITY_FEATURE_SCALAR_REDUCTIONS // DPC++ can handle at most 1 reduction variable per kernel
-		q.submit([=](handler& cgh) {
-			auto sum_r = reduction(sum_buf, cgh, cl::sycl::plus<size_t>{}, initialize_to_identity);
-			cgh.parallel_for<class UKN(kernel)>(range{N}, id{1}, sum_r, [=](celerity::item<1> item, auto& sum) { sum += item.get_id(0); });
-		});
-
-		q.submit([=](handler& cgh) {
-			auto max_r = reduction(max_buf, cgh, size_t{0}, unknown_identity_maximum<size_t>{}, initialize_to_identity);
-			cgh.parallel_for<class UKN(kernel)>(range{N}, id{1}, max_r, [=](celerity::item<1> item, auto& max) { max.combine(item.get_id(0)); });
-		});
-#else
 		q.submit([=](handler& cgh) {
 			auto sum_r = reduction(sum_buf, cgh, cl::sycl::plus<size_t>{}, initialize_to_identity);
 			auto max_r = reduction(max_buf, cgh, size_t{0}, unknown_identity_maximum<size_t>{}, initialize_to_identity);
@@ -49,7 +37,6 @@ namespace detail {
 				max.combine(item.get_id(0));
 			});
 		});
-#endif
 
 		q.submit([=](handler& cgh) {
 			accessor sum_acc{sum_buf, cgh, celerity::access::all{}, celerity::read_only_host_task};
@@ -59,11 +46,15 @@ namespace detail {
 				CHECK(max_acc[0] == N);
 			});
 		});
+#else
+		SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
+#endif
 	}
 
 	// Regression test: The host -> device transfer previously caused an illegal nested sycl::queue::submit call which deadlocks
 	// Distributed test, since the single-node case optimizes the reduction command away
 	TEST_CASE_METHOD(test_utils::runtime_fixture, "reduction commands perform host -> device transfers if necessary", "[reductions]") {
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		distr_queue q;
 
 		REQUIRE(runtime::get_instance().get_num_nodes() > 1);
@@ -80,9 +71,13 @@ namespace detail {
 			accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task};
 			cgh.host_task(on_master_node, [=] { CHECK(acc[0] == N + init); });
 		});
+#else
+		SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
+#endif
 	}
 
 	TEST_CASE_METHOD(test_utils::runtime_fixture, "multiple chained reductions produce correct results", "[reductions]") {
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		distr_queue q;
 
 		const int N = 1000;
@@ -102,10 +97,14 @@ namespace detail {
 			accessor acc{sum, cgh, celerity::access::all{}, celerity::read_only_host_task};
 			cgh.host_task(on_master_node, [=] { CHECK(acc[0] == 3 * N); });
 		});
+#else
+		SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
+#endif
 	}
 
 	TEST_CASE_METHOD(
 	    test_utils::runtime_fixture, "subsequently requiring reduction results on different subsets of nodes produces correct data flow", "[reductions]") {
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		distr_queue q;
 
 		const int N = 1000;
@@ -130,10 +129,14 @@ namespace detail {
 				CHECK(acc[0] == expected);
 			});
 		});
+#else
+		SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
+#endif
 	}
 
 	TEST_CASE_METHOD(
 	    test_utils::runtime_fixture, "runtime-shutdown graph printing works in the presence of a finished reduction", "[reductions][print_graph][smoke-test]") {
+#if CELERITY_FEATURE_SCALAR_REDUCTIONS
 		// init runtime early so the distr_queue ctor doesn't override the log level set by log_capture
 		runtime::init(nullptr, nullptr);
 		const bool is_master_node = runtime::get_instance().is_master_node();
@@ -161,10 +164,11 @@ namespace detail {
 			CHECK_THAT(log, ContainsSubstring("(R1) <b>await push</b> from N1"));
 			CHECK_THAT(log, ContainsSubstring("<b>reduction</b> R1<br/> B0 {[[0,0,0] - [1,1,1]]}"));
 		}
+#else
+		SKIP_BECAUSE_NO_SCALAR_REDUCTIONS
+#endif
 	}
 
-#endif // CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS
-
 	template <int Dims>
 	class kernel_name_nd_geometry;
 

diff --git a/test/test_utils.h b/test/test_utils.h
@@ -43,6 +43,8 @@
  */
 #define REQUIRE_LOOP(...) CELERITY_DETAIL_REQUIRE_LOOP(__VA_ARGS__)
 
+#define SKIP_BECAUSE_NO_SCALAR_REDUCTIONS SKIP("CELERITY_FEATURE_SCALAR_REDUCTIONS == 0");
+
 namespace celerity {
 namespace detail {