Skip to content

Commit

Permalink
Avoid creation of empty chunks
Browse files Browse the repository at this point in the history
So far, tasks with a global size that was smaller than the number of
nodes would still be split into N chunks, one for each node, with some
chunks having an empty range. This introduces unnecessary overhead in
scheduling and execution of commands that are effectively no-ops.
Additionally, since we currently do not support accessing empty buffer
ranges (which is a separate issue), empty chunks with e.g. a one-to-one
mapper would result in a crash.
  • Loading branch information
psalz committed Feb 11, 2022
1 parent 26ca089 commit 15fa929
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 23 deletions.
46 changes: 27 additions & 19 deletions src/transformers/naive_split.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace celerity {
namespace detail {

// We simply split in the first dimension for now
std::vector<chunk<3>> split_equal(const chunk<3>& full_chunk, const cl::sycl::range<3>& granularity, size_t num_chunks, int dims) {
static std::vector<chunk<3>> split_equal(const chunk<3>& full_chunk, const cl::sycl::range<3>& granularity, const size_t num_chunks, const int dims) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < dims; ++d) {
Expand All @@ -20,18 +20,23 @@ namespace detail {
}
#endif

// If global range is not divisible by (num_chunks * granularity), assign ceil(quotient) to the first few chunks and floor(quotient) to the remaining
const auto small_chunk_size_dim0 = full_chunk.range[0] / (num_chunks * granularity[0]) * granularity[0];
// Due to split granularity requirements or if num_workers > global_size[0],
// we may not be able to create the requested number of chunks.
const auto actual_num_chunks = std::min(num_chunks, full_chunk.range[0] / granularity[0]);

// If global range is not divisible by (actual_num_chunks * granularity),
// assign ceil(quotient) to the first few chunks and floor(quotient) to the remaining
const auto small_chunk_size_dim0 = full_chunk.range[0] / (actual_num_chunks * granularity[0]) * granularity[0];
const auto large_chunk_size_dim0 = small_chunk_size_dim0 + granularity[0];
const auto num_large_chunks = (full_chunk.range[0] - small_chunk_size_dim0 * num_chunks) / granularity[0];
assert(num_large_chunks * large_chunk_size_dim0 + (num_chunks - num_large_chunks) * small_chunk_size_dim0 == full_chunk.range[0]);
const auto num_large_chunks = (full_chunk.range[0] - small_chunk_size_dim0 * actual_num_chunks) / granularity[0];
assert(num_large_chunks * large_chunk_size_dim0 + (actual_num_chunks - num_large_chunks) * small_chunk_size_dim0 == full_chunk.range[0]);

std::vector<chunk<3>> result(num_chunks, {full_chunk.offset, full_chunk.range, full_chunk.global_size});
std::vector<chunk<3>> result(actual_num_chunks, {full_chunk.offset, full_chunk.range, full_chunk.global_size});
for(auto i = 0u; i < num_large_chunks; ++i) {
result[i].range[0] = large_chunk_size_dim0;
result[i].offset[0] += i * large_chunk_size_dim0;
}
for(auto i = num_large_chunks; i < num_chunks; ++i) {
for(auto i = num_large_chunks; i < actual_num_chunks; ++i) {
result[i].range[0] = small_chunk_size_dim0;
result[i].offset[0] += num_large_chunks * large_chunk_size_dim0 + (i - num_large_chunks) * small_chunk_size_dim0;
}
Expand All @@ -53,21 +58,13 @@ namespace detail {
}

naive_split_transformer::naive_split_transformer(size_t num_chunks, size_t num_workers) : num_chunks(num_chunks), num_workers(num_workers) {
assert(num_chunks >= num_workers);
assert(num_chunks > 0);
assert(num_workers > 0);
}

void naive_split_transformer::transform_task(const task& tsk, command_graph& cdag) {
if(!tsk.has_variable_split()) return;

// Assign each chunk to a node
std::vector<node_id> nodes(num_chunks);
// We assign chunks next to each other to the same worker (if there are more chunks than workers), as this is likely to produce fewer
// transfers between tasks than a round-robin assignment (for typical stencil codes).
const auto chunks_per_node = num_workers > 0 ? num_chunks / num_workers : num_chunks;
for(auto i = 0u; i < num_chunks; ++i) {
nodes[i] = (i / chunks_per_node) % num_workers;
}

auto& task_commands = cdag.task_commands(tsk.get_id());
assert(task_commands.size() == 1);

Expand All @@ -78,9 +75,20 @@ namespace detail {
assert(std::distance(original->get_dependents().begin(), original->get_dependents().end()) == 0);

chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};
auto chunks = split_equal(full_chunk, tsk.get_granularity(), num_chunks, tsk.get_dimensions());
const auto chunks = split_equal(full_chunk, tsk.get_granularity(), num_chunks, tsk.get_dimensions());
assert(chunks.size() <= num_chunks); // We may have created less than requested
assert(!chunks.empty());

// Assign each chunk to a node
// We assign chunks next to each other to the same worker (if there are more chunks than workers), as this is likely to produce fewer
// transfers between tasks than a round-robin assignment (for typical stencil codes).
// FIXME: This only works if the number of chunks is an integer multiple of the number of workers, e.g. 3 chunks for 2 workers degrades to RR.
const auto chunks_per_node = std::max<size_t>(1, chunks.size() / num_workers);

for(size_t i = 0; i < chunks.size(); ++i) {
cdag.create<task_command>(nodes[i], tsk.get_id(), subrange{chunks[i]});
assert(chunks[i].range.size() != 0);
const node_id nid = (i / chunks_per_node) % num_workers;
cdag.create<task_command>(nid, tsk.get_id(), subrange{chunks[i]});
}

// Remove original
Expand Down
54 changes: 50 additions & 4 deletions test/graph_generation_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1182,9 +1182,12 @@ namespace detail {
range);
test_utils::build_and_flush(ctx, num_nodes, tid_reduce);

const auto tid_consume = test_utils::add_compute_task<class UKN(task_consume)>(tm, [&](handler& cgh) {
buf_1.get_access<mode::read>(cgh, fixed<1>({0, 1}));
});
const auto tid_consume = test_utils::add_compute_task<class UKN(task_consume)>(
tm,
[&](handler& cgh) {
buf_1.get_access<mode::read>(cgh, fixed<1>({0, 1}));
},
range);
test_utils::build_and_flush(ctx, num_nodes, tid_consume);

CHECK(has_dependency(tm, tid_reduce, tid_initialize));
Expand Down Expand Up @@ -1300,7 +1303,8 @@ namespace detail {
auto buf_0 = mbf.create_buffer(cl::sycl::range<1>{1});

test_utils::build_and_flush(ctx, num_nodes,
test_utils::add_compute_task<class UKN(task_reduction)>(tm, [&](handler& cgh) { test_utils::add_reduction(cgh, rm, buf_0, false); }));
test_utils::add_compute_task<class UKN(task_reduction)>(
tm, [&](handler& cgh) { test_utils::add_reduction(cgh, rm, buf_0, false); }, {num_nodes, 1}));

test_utils::build_and_flush(ctx, num_nodes, test_utils::add_host_task(tm, on_master_node, [&](handler& cgh) {
buf_0.get_access<mode::read>(cgh, fixed<1>({0, 1}));
Expand Down Expand Up @@ -1459,5 +1463,47 @@ namespace detail {
maybe_print_graphs(ctx);
}

template <int Dims>
class simple_task;

template <int Dims>
class nd_range_task;

TEMPLATE_TEST_CASE_SIG("graph_generator does not create empty chunks", "[graph_generator]", ((int Dims), Dims), 1, 2, 3) {
const size_t num_nodes = 3;
test_utils::cdag_test_context ctx(num_nodes);
auto& tm = ctx.get_task_manager();

range<Dims> task_range = zero_range;
task_id tid = -1;

SECTION("for simple tasks") {
task_range = range_cast<Dims>(range<3>(2, 2, 2));
tid = test_utils::build_and_flush(ctx, num_nodes,
test_utils::add_compute_task<simple_task<Dims>>(
tm, [&](handler& cgh) {}, task_range));
}

SECTION("for nd-range tasks") {
task_range = range_cast<Dims>(range<3>(16, 2, 2));
const auto local_range = range_cast<Dims>(range<3>(8, 1, 1));
tid = test_utils::build_and_flush(ctx, num_nodes,
test_utils::add_nd_range_compute_task<nd_range_task<Dims>>(
tm, [&](handler& cgh) {}, nd_range<Dims>(task_range, local_range)));
}

auto& inspector = ctx.get_inspector();
auto& cdag = ctx.get_command_graph();

const auto cmds = inspector.get_commands(tid, std::nullopt, std::nullopt);
CHECK(cmds.size() == 2);
for(auto tid : cmds) {
auto* tcmd = dynamic_cast<task_command*>(cdag.get(tid));
auto split_range = range_cast<3>(task_range);
split_range[0] /= 2;
CHECK(tcmd->get_execution_range().range == split_range);
}
}

} // namespace detail
} // namespace celerity
19 changes: 19 additions & 0 deletions test/system/distr_tests.cc
Original file line number Diff line number Diff line change
Expand Up @@ -291,6 +291,25 @@ namespace detail {
}
}

TEST_CASE("nodes do not receive commands for empty chunks", "[command-graph]") {
distr_queue q;
auto n = runtime::get_instance().get_num_nodes();
REQUIRE(n > 1);

buffer<float, 2> buf{{1, 100}};

const auto chunk_check_rm = [buf_range = buf.get_range()](const chunk<2>& chnk) {
CHECK(chnk.range == buf_range);
return celerity::access::one_to_one{}(chnk);
};

q.submit([=](handler& cgh) {
accessor acc{buf, cgh, chunk_check_rm, write_only, no_init};
// The kernel has a size of 1 in dimension 0, so it will not be split into
// more than one chunk (assuming current naive split behavior).
cgh.parallel_for<class UKN(kernel)>(buf.get_range(), [=](item<2> it) { acc[it] = 0; });
});
}

} // namespace detail
} // namespace celerity

0 comments on commit 15fa929

Please sign in to comment.