
Reduce local range size in tests to improve device compatibility
Markus Wippler authored and fknorr committed Feb 17, 2022
1 parent 39dacdf commit f0cf3f4
Showing 3 changed files with 12 additions and 3 deletions.
1 change: 1 addition & 0 deletions examples/matmul/matmul.cc
@@ -20,6 +20,7 @@ void multiply(celerity::distr_queue queue, celerity::buffer<T, 2> mat_a, celerit
celerity::accessor c{mat_c, cgh, celerity::access::one_to_one{}, celerity::write_only, celerity::no_init};

// Use local-memory tiling to avoid waiting on global memory too often
+ // Note: We assume a local range size of 64 here; this should be supported by most devices.
const size_t GROUP_SIZE = 8;
celerity::local_accessor<T, 2> scratch_a{{GROUP_SIZE, GROUP_SIZE}, cgh};
celerity::local_accessor<T, 2> scratch_b{{GROUP_SIZE, GROUP_SIZE}, cgh};
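For context, below is a minimal sketch of how such scratch tiles are typically consumed inside the nd_range kernel. It is not the exact kernel from matmul.cc: the accessors a and b, the mat_size variable, and the kernel name are assumptions, and the group barrier is written as in the nd_range tests below.

cgh.parallel_for<class MatmulTiled>(
    celerity::nd_range<2>{{mat_size, mat_size}, {GROUP_SIZE, GROUP_SIZE}},
    [=](celerity::nd_item<2> item) {
        T sum{};
        const auto lid = item.get_local_id();
        for(size_t t = 0; t < mat_size; t += GROUP_SIZE) {
            // Each work item copies one element of the current A and B tiles into local memory.
            scratch_a[lid] = a[{item.get_global_id(0), t + lid[1]}];
            scratch_b[lid] = b[{t + lid[0], item.get_global_id(1)}];
            group_barrier(item.get_group()); // wait until both tiles are fully loaded
            for(size_t k = 0; k < GROUP_SIZE; ++k) {
                sum += scratch_a[{lid[0], k}] * scratch_b[{k, lid[1]}];
            }
            group_barrier(item.get_group()); // keep the tiles intact until everyone has read them
        }
        c[item.get_global_id()] = sum;
    });

With GROUP_SIZE = 8, each work group uses 8 * 8 = 64 work items, which is the local range size the new comment refers to.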
12 changes: 9 additions & 3 deletions test/runtime_tests.cc
@@ -2163,7 +2163,7 @@ namespace detail {
TEST_CASE("handler::parallel_for accepts nd_range", "[handler]") {
distr_queue q;

- // Note: be careful about local range sizes here, not all devices support work groups with > 256 elements.
+ // Note: We assume a local range size of 64 here; this should be supported by most devices.

CHECK_NOTHROW(q.submit([&](handler& cgh) {
cgh.parallel_for<class UKN(nd_range_1)>(celerity::nd_range<1>{{256}, {64}}, [](nd_item<1> item) {
@@ -2175,10 +2175,10 @@
}));

CHECK_NOTHROW(q.submit([&](handler& cgh) {
- cgh.parallel_for<class UKN(nd_range_2)>(celerity::nd_range<2>{{64, 64}, {16, 16}}, [](nd_item<2> item) {
+ cgh.parallel_for<class UKN(nd_range_2)>(celerity::nd_range<2>{{64, 64}, {8, 8}}, [](nd_item<2> item) {
group_barrier(item.get_group());
#if !WORKAROUND_COMPUTECPP // no group primitives
- group_broadcast(item.get_group(), 42, 99);
+ group_broadcast(item.get_group(), 42, 25);
#endif
});
}));
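The 2-D local range in the test above shrinks from {16, 16} (256 work items) to {8, 8} (64 work items) because not every device accepts work groups of 256 elements; the broadcast origin likewise moves from local id 99 to 25, since a 64-item group has no work item with linear id 99. As a sketch only (the queue and the printf are assumptions, not part of the test), the actual per-device limit can be queried through the standard SYCL device info descriptor:

#include <CL/sycl.hpp>
#include <cstdio>

int main() {
    cl::sycl::queue q; // any queue; the limit applies to its device
    const auto dev = q.get_device();
    // Upper bound on the total number of work items per work group on this device.
    const size_t max_wg = dev.get_info<cl::sycl::info::device::max_work_group_size>();
    std::printf("max work-group size: %zu\n", max_wg);
    // A local range of {8, 8} is valid as long as 8 * 8 <= max_wg.
    return 0;
}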
@@ -2206,6 +2208,8 @@ namespace detail {
distr_queue q;
buffer<int, 1> out{64};

+ // Note: We assume a local range size of 32 here; this should be supported by most devices.

q.submit([=](handler& cgh) {
local_accessor<int> la{32, cgh};
accessor ga{out, cgh, celerity::access::one_to_one{}, write_only};
@@ -2229,6 +2231,8 @@
#if CELERITY_FEATURE_SIMPLE_SCALAR_REDUCTIONS

TEST_CASE("reductions can be passed into nd_range kernels", "[handler]") {
+ // Note: We assume a local range size of 16 here; this should be supported by most devices.

buffer<int, 1> b{cl::sycl::range<1>{1}};
distr_queue{}.submit([=](handler& cgh) {
cgh.parallel_for<class UKN(kernel)>(celerity::nd_range{cl::sycl::range<2>{8, 8}, cl::sycl::range<2>{4, 4}}, reduction(b, cgh, cl::sycl::plus<>{}),
@@ -2243,6 +2247,8 @@
TEST_CASE("handler::parallel_for kernel names are optional", "[handler]") {
distr_queue q;

+ // Note: We assume a local range size of 32 here; this should be supported by most devices.

// without name
q.submit([](handler& cgh) { cgh.parallel_for(cl::sycl::range<1>{64}, [](item<1> item) {}); });
q.submit([=](handler& cgh) { cgh.parallel_for(celerity::nd_range<1>{64, 32}, [](nd_item<1> item) {}); });
2 changes: 2 additions & 0 deletions test/system/distr_tests.cc
@@ -163,6 +163,8 @@ namespace detail {
distr_queue q;
auto n = runtime::get_instance().get_num_nodes();

+ // Note: We assume a local range size of 165 here; this may not be supported by all devices.

auto global_range = range_cast<Dims>(cl::sycl::range<3>{n * 4 * 3, 3 * 5, 2 * 11});
auto local_range = range_cast<Dims>(cl::sycl::range<3>{3, 5, 11});
auto group_range = global_range / local_range;
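Worked out for the Dims == 3 case shown above: the local range {3, 5, 11} holds 3 * 5 * 11 = 165 work items per group, which is why the new note warns that not all devices will accept it, and the group grid becomes global_range / local_range = {n * 4, 3, 2}.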
