
Reduce local range size in tests to improve device compatibility
Markus Wippler authored and fknorr committed Feb 17, 2022
1 parent 39dacdf commit f0cf3f4
Showing 3 changed files with 12 additions and 3 deletions.
1 change: 1 addition & 0 deletions examples/matmul/matmul.cc
@@ -20,6 +20,7 @@ void multiply(celerity::distr_queue queue, celerity::buffer<T, 2> mat_a, celerit
celerity::accessor c{mat_c, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};

// Use local-memory tiling to avoid waiting on global memory too often
+ // Note: We assume a local range size of 64 here; this should be supported by most devices.
const size_t GROUP_SIZE = 8;
celerity::local_accessor<T, 2> scratch_a{{GROUP_SIZE, GROUP_SIZE}, cgh};
celerity::local_accessor<T, 2> scratch_b{{GROUP_SIZE, GROUP_SIZE}, cgh};
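For context, below is a minimal sketch of how such scratch tiles are typically consumed inside the nd_range kernel. It is not the exact kernel from matmul.cc: the accessors a and b, the mat_size variable, and the kernel name are assumptions, and the group barrier is written as in the nd_range tests below.

cgh.parallel_for<class MatmulTiled>(
    celerity::nd_range<2>{{mat_size, mat_size}, {GROUP_SIZE, GROUP_SIZE}},
    [=](celerity::nd_item<2> item) {
        T sum{};
        const auto lid = item.get_local_id();
        for(size_t t = 0; t < mat_size; t += GROUP_SIZE) {
            // Each work item copies one element of the current A and B tiles into local memory.
            scratch_a[lid] = a[{item.get_global_id(0), t + lid[1]}];
            scratch_b[lid] = b[{t + lid[0], item.get_global_id(1)}];
            group_barrier(item.get_group()); // wait until both tiles are fully loaded
            for(size_t k = 0; k < GROUP_SIZE; ++k) {
                sum += scratch_a[{lid[0], k}] * scratch_b[{k, lid[1]}];
            }
            group_barrier(item.get_group()); // keep the tiles intact until everyone has read them
        }
        c[item.get_global_id()] = sum;
    });

With GROUP_SIZE = 8, each work group uses 8 * 8 = 64 work items, which is the local range size the new comment refers to.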
12 changes: 9 additions & 3 deletions test/runtime_tests.cc
@@ -2163,7 +2163,7 @@ namespace detail {
TEST_CASE("handler::parallel_for accepts nd_range", "[handler]") {
distr_queue q;

- // Note: be careful about local range sizes here, not all devices support work groups with > 256 elements.
+ // Note: We assume a local range size of 64 here; this should be supported by most devices.

CHECK_NOTHROW(q.submit([&](handler& cgh) {
cgh.parallel_for<class UKN(nd_range_1)>(celerity::nd_range<1>{{256}, {64}}, [](nd_item<1> item) {
@@ -2175,10 +2175,10 @@
}));

CHECK_NOTHROW(q.submit([&](handler& cgh) {
- cgh.parallel_for<class UKN(nd_range_2)>(celerity::nd_range<2>{{64, 64}, {16, 16}}, [](nd_item<2> item) {
+ cgh.parallel_for<class UKN(nd_range_2)>(celerity::nd_range<2>{{64, 64}, {8, 8}}, [](nd_item<2> item) {
group_barrier(item.get_group());
#if !WORKAROUND_COMPUTECPP // no group primitives
- group_broadcast(item.get_group(), 42, 99);
+ group_broadcast(item.get_group(), 42, 25);
#endif
});
}));
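The 2-D local range in the test above shrinks from {16, 16} (256 work items) to {8, 8} (64 work items) because not every device accepts work groups of 256 elements; the broadcast origin likewise moves from local id 99 to 25, since a 64-item group has no work item with linear id 99. As a sketch only (the queue and the printf are assumptions, not part of the test), the actual per-device limit can be queried through the standard SYCL device info descriptor:

#include <CL/sycl.hpp>
#include <cstdio>

int main() {
    cl::sycl::queue q; // any queue; the limit applies to its device
    const auto dev = q.get_device();
    // Upper bound on the total number of work items per work group on this device.
    const size_t max_wg = dev.get_info<cl::sycl::info::device::max_work_group_size>();
    std::printf("max work-group size: %zu\n", max_wg);
    // A local range of {8, 8} is valid as long as 8 * 8 <= max_wg.
    return 0;
}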
@@ -2206,6 +2208,8 @@ namespace detail {
distr_queue q;
buffer<int, 1> out{64};

+ // Note: We assume a local range size of 32 here; this should be supported by most devices.

q.submit([=](handler& cgh) {
local_accessor<int> la{32, cgh};
accessor ga{out, cgh, celerity::access::one_to_one{}, write_only};
@@ -2229,6 +2231,8 @@
#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS

TEST_CASE("reductions can be passed into nd_range kernels", "[handler]") {
+ // Note: We assume a local range size of 16 here; this should be supported by most devices.

buffer<int, 1> b{cl::sycl::range<1>{1}};
distr_queue{}.submit([=](handler& cgh) {
cgh.parallel_for<class UKN(kernel)>(celerity::nd_range{cl::sycl::range<2>{8, 8}, cl::sycl::range<2>{4, 4}}, reduction(b, cgh, cl::sycl::plus<>{}),
@@ -2243,6 +2247,8 @@
TEST_CASE("handler::parallel_for kernel names are optional", "[handler]") {
distr_queue q;

+ // Note: We assume a local range size of 32 here; this should be supported by most devices.

// without name
q.submit([](handler& cgh) { cgh.parallel_for(cl::sycl::range<1>{64}, [](item<1> item) {}); });
q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [](nd_item<1> item) {}); });
2 changes: 2 additions & 0 deletions test/system/distr_tests.cc
@@ -163,6 +163,8 @@ namespace detail {
distr_queue q;
auto n = runtime::get_instance().get_num_nodes();

+ // Note: We assume a local range size of 165 here; this may not be supported by all devices.

auto global_range = range_cast<Dims>(cl::sycl::range<3>{n * 4 * 3, 3 * 5, 2 * 11});
auto local_range = range_cast<Dims>(cl::sycl::range<3>{3, 5, 11});
auto group_range = global_range / local_range;
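Worked out for the Dims == 3 case shown above: the local range {3, 5, 11} holds 3 * 5 * 11 = 165 work items per group, which is why the new note warns that not all devices will accept it, and the group grid becomes global_range / local_range = {n * 4, 3, 2}.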
