Skip to content

Commit

Permalink
Avoid creation of empty chunks
Browse files Browse the repository at this point in the history
So far, tasks with a global size that was smaller than the number of
nodes would still be split into N chunks, one for each node, with some
chunks having an empty range. This introduces unnecessary overhead in
scheduling and execution of commands that are effectively no-ops.
Additionally, since we currently do not support accessing empty buffer
ranges (which is a separate issue), empty chunks with e.g. a one-to-one
mapper would result in a crash.
  • Loading branch information
psalz committed Feb 11, 2022
1 parent 26ca089 commit 15fa929
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 23 deletions.
46 changes: 27 additions & 19 deletions src/transformers/naive_split.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace celerity {
namespace detail {

// We simply split in the first dimension for now
std::vector<chunk<3>> split_equal(const chunk<3>& full_chunk, const cl::sycl::range<3>& granularity, size_t num_chunks, int dims) {
static std::vector<chunk<3>> split_equal(const chunk<3>& full_chunk, const cl::sycl::range<3>& granularity, const size_t num_chunks, const int dims) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < dims; ++d) {
Expand All @@ -20,18 +20,23 @@ namespace detail {
}
#endif

// If global range is not divisible by (num_chunks * granularity), assign ceil(quotient) to the first few chunks and floor(quotient) to the remaining
const auto small_chunk_size_dim0 = full_chunk.range[0] / (num_chunks * granularity[0]) * granularity[0];
// Due to split granularity requirements or if num_workers > global_size[0],
// we may not be able to create the requested number of chunks.
const auto actual_num_chunks = std::min(num_chunks, full_chunk.range[0] / granularity[0]);

// If global range is not divisible by (actual_num_chunks * granularity),
// assign ceil(quotient) to the first few chunks and floor(quotient) to the remaining
const auto small_chunk_size_dim0 = full_chunk.range[0] / (actual_num_chunks * granularity[0]) * granularity[0];
const auto large_chunk_size_dim0 = small_chunk_size_dim0 + granularity[0];
const auto num_large_chunks = (full_chunk.range[0] - small_chunk_size_dim0 * num_chunks) / granularity[0];
assert(num_large_chunks * large_chunk_size_dim0 + (num_chunks - num_large_chunks) * small_chunk_size_dim0 == full_chunk.range[0]);
const auto num_large_chunks = (full_chunk.range[0] - small_chunk_size_dim0 * actual_num_chunks) / granularity[0];
assert(num_large_chunks * large_chunk_size_dim0 + (actual_num_chunks - num_large_chunks) * small_chunk_size_dim0 == full_chunk.range[0]);

std::vector<chunk<3>> result(num_chunks, {full_chunk.offset, full_chunk.range, full_chunk.global_size});
std::vector<chunk<3>> result(actual_num_chunks, {full_chunk.offset, full_chunk.range, full_chunk.global_size});
for(auto i = 0u; i < num_large_chunks; ++i) {
result[i].range[0] = large_chunk_size_dim0;
result[i].offset[0] += i * large_chunk_size_dim0;
}
for(auto i = num_large_chunks; i < num_chunks; ++i) {
for(auto i = num_large_chunks; i < actual_num_chunks; ++i) {
result[i].range[0] = small_chunk_size_dim0;
result[i].offset[0] += num_large_chunks * large_chunk_size_dim0 + (i - num_large_chunks) * small_chunk_size_dim0;
}
Expand All @@ -53,21 +58,13 @@ namespace detail {
}

naive_split_transformer::naive_split_transformer(size_t num_chunks, size_t num_workers) : num_chunks(num_chunks), num_workers(num_workers) {
assert(num_chunks >= num_workers);
assert(num_chunks > 0);
assert(num_workers > 0);
}

void naive_split_transformer::transform_task(const task& tsk, command_graph& cdag) {
if(!tsk.has_variable_split()) return;

// Assign each chunk to a node
std::vector<node_id> nodes(num_chunks);
// We assign chunks next to each other to the same worker (if there are more chunks than workers), as this is likely to produce fewer
// transfers between tasks than a round-robin assignment (for typical stencil codes).
const auto chunks_per_node = num_workers > 0 ? num_chunks / num_workers : num_chunks;
for(auto i = 0u; i < num_chunks; ++i) {
nodes[i] = (i / chunks_per_node) % num_workers;
}

auto& task_commands = cdag.task_commands(tsk.get_id());
assert(task_commands.size() == 1);

Expand All @@ -78,9 +75,20 @@ namespace detail {
assert(std::distance(original->get_dependents().begin(), original->get_dependents().end()) == 0);

chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};
auto chunks = split_equal(full_chunk, tsk.get_granularity(), num_chunks, tsk.get_dimensions());
const auto chunks = split_equal(full_chunk, tsk.get_granularity(), num_chunks, tsk.get_dimensions());
assert(chunks.size() <= num_chunks); // We may have created less than requested
assert(!chunks.empty());

// Assign each chunk to a node
// We assign chunks next to each other to the same worker (if there are more chunks than workers), as this is likely to produce fewer
// transfers between tasks than a round-robin assignment (for typical stencil codes).
// FIXME: This only works if the number of chunks is an integer multiple of the number of workers, e.g. 3 chunks for 2 workers degrades to RR.
const auto chunks_per_node = std::max<size_t>(1, chunks.size() / num_workers);

for(size_t i = 0; i < chunks.size(); ++i) {
cdag.create<task_command>(nodes[i], tsk.get_id(), subrange{chunks[i]});
assert(chunks[i].range.size() != 0);
const node_id nid = (i / chunks_per_node) % num_workers;
cdag.create<task_command>(nid, tsk.get_id(), subrange{chunks[i]});
}

// Remove original
Expand Down
54 changes: 50 additions & 4 deletions test/graph_generation_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1182,9 +1182,12 @@ namespace detail {
range);
test_utils::build_and_flush(ctx, num_nodes, tid_reduce);

const auto tid_consume = test_utils::add_compute_task<class UKN(task_consume)>(tm, [&](handler& cgh) {
buf_1.get_access<mode::read>(cgh, fixed<1>({0, 1}));
});
const auto tid_consume = test_utils::add_compute_task<class UKN(task_consume)>(
tm,
[&](handler& cgh) {
buf_1.get_access<mode::read>(cgh, fixed<1>({0, 1}));
},
range);
test_utils::build_and_flush(ctx, num_nodes, tid_consume);

CHECK(has_dependency(tm, tid_reduce, tid_initialize));
Expand Down Expand Up @@ -1300,7 +1303,8 @@ namespace detail {
auto buf_0 = mbf.create_buffer(cl::sycl::range<1>{1});

test_utils::build_and_flush(ctx, num_nodes,
test_utils::add_compute_task<class UKN(task_reduction)>(tm, [&](handler& cgh) { test_utils::add_reduction(cgh, rm, buf_0, false); }));
test_utils::add_compute_task<class UKN(task_reduction)>(
tm, [&](handler& cgh) { test_utils::add_reduction(cgh, rm, buf_0, false); }, {num_nodes, 1}));

test_utils::build_and_flush(ctx, num_nodes, test_utils::add_host_task(tm, on_master_node, [&](handler& cgh) {
buf_0.get_access<mode::read>(cgh, fixed<1>({0, 1}));
Expand Down Expand Up @@ -1459,5 +1463,47 @@ namespace detail {
maybe_print_graphs(ctx);
}

template <int Dims>
class simple_task;

template <int Dims>
class nd_range_task;

TEMPLATE_TEST_CASE_SIG("graph_generator does not create empty chunks", "[graph_generator]", ((int Dims), Dims), 1, 2, 3) {
const size_t num_nodes = 3;
test_utils::cdag_test_context ctx(num_nodes);
auto& tm = ctx.get_task_manager();

range<Dims> task_range = zero_range;
task_id tid = -1;

SECTION("for simple tasks") {
task_range = range_cast<Dims>(range<3>(2, 2, 2));
tid = test_utils::build_and_flush(ctx, num_nodes,
test_utils::add_compute_task<simple_task<Dims>>(
tm, [&](handler& cgh) {}, task_range));
}

SECTION("for nd-range tasks") {
task_range = range_cast<Dims>(range<3>(16, 2, 2));
const auto local_range = range_cast<Dims>(range<3>(8, 1, 1));
tid = test_utils::build_and_flush(ctx, num_nodes,
test_utils::add_nd_range_compute_task<nd_range_task<Dims>>(
tm, [&](handler& cgh) {}, nd_range<Dims>(task_range, local_range)));
}

auto& inspector = ctx.get_inspector();
auto& cdag = ctx.get_command_graph();

const auto cmds = inspector.get_commands(tid, std::nullopt, std::nullopt);
CHECK(cmds.size() == 2);
for(auto tid : cmds) {
auto* tcmd = dynamic_cast<task_command*>(cdag.get(tid));
auto split_range = range_cast<3>(task_range);
split_range[0] /= 2;
CHECK(tcmd->get_execution_range().range == split_range);
}
}

} // namespace detail
} // namespace celerity
19 changes: 19 additions & 0 deletions test/system/distr_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,25 @@ namespace detail {
}
}

TEST_CASE("nodes do not receive commands for empty chunks", "[command-graph]") {
distr_queue q;
auto n = runtime::get_instance().get_num_nodes();
REQUIRE(n > 1);

buffer<float, 2> buf{{1, 100}};

const auto chunk_check_rm = [buf_range = buf.get_range()](const chunk<2>& chnk) {
CHECK(chnk.range == buf_range);
return celerity::access::one_to_one{}(chnk);
};

q.submit([=](handler& cgh) {
accessor acc{buf, cgh, chunk_check_rm, write_only, no_init};
// The kernel has a size of 1 in dimension 0, so it will not be split into
// more than one chunk (assuming current naive split behavior).
cgh.parallel_for<class UKN(kernel)>(buf.get_range(), [=](item<2> it) { acc[it] = 0; });
});
}

} // namespace detail
} // namespace celerity

0 comments on commit 15fa929

Please sign in to comment.