Change split functions to work on box instead of chunk #323

Open · wants to merge 1 commit into base: master
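In short, the split functions and their callers now pass around box<3> (an axis-aligned region given by its min and max corners) instead of chunk<3> (offset, range and global size). A minimal before/after sketch, using only constructs that already appear in the diff below (tsk stands for the task being split; this is an illustration, not code from the PR):

// before: splits consumed and produced chunk<3>, which redundantly carries the global size
const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};
// after: splits consume and produce box<3>; only the min/max corners are stored
const box<3> full_box{subrange<3>(tsk.get_global_offset(), tsk.get_global_size())};
// where a chunk<3> is still required, it is rebuilt from a box at the call site, e.g.
// chunk<3>(b.get_min(), b.get_range(), tsk.get_global_size()) as in command_graph_generator.cc below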
5 changes: 3 additions & 2 deletions include/split.h
@@ -1,5 +1,6 @@
#pragma once

#include "grid.h"
#include "ranges.h"

#include <cstddef>
@@ -8,7 +9,7 @@

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<box<3>> split_1d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs);

⚠️ readability-inconsistent-declaration-parameter-name ⚠️
function celerity::detail::split_1d has a definition with different parameter names

std::vector<box<3>> split_2d(const box<3>& full_box, const range<3>& granularity, const size_t num_boxs);

⚠️ readability-inconsistent-declaration-parameter-name ⚠️
function celerity::detail::split_2d has a definition with different parameter names


} // namespace celerity::detail
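Both clang-tidy warnings stem from the header naming the parameters full_box / num_boxs while the definitions in src/split.cc keep full_chunk / num_chunks. One way to resolve them would be to use a single spelling in both places; matching the definitions would be the smallest change (shown only as an illustration, the naming choice is up to the author):

std::vector<box<3>> split_1d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);
std::vector<box<3>> split_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks);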
12 changes: 7 additions & 5 deletions src/command_graph_generator.cc
@@ -140,13 +140,15 @@ void command_graph_generator::report_overlapping_writes(const task& tsk, const b
}

std::vector<command_graph_generator::assigned_chunk> command_graph_generator::split_task_and_assign_chunks(const task& tsk) const {
const chunk<3> full_chunk{tsk.get_global_offset(), tsk.get_global_size(), tsk.get_global_size()};
const box<3> full_chunk{subrange<3>(tsk.get_global_offset(), tsk.get_global_size())};
const size_t num_chunks = m_num_nodes * m_test_chunk_multiplier;
const auto chunks = ([&] {
if(tsk.get_type() == task_type::collective || tsk.get_type() == task_type::fence) {
std::vector<chunk<3>> chunks;
std::vector<box<3>> chunks;
for(size_t nid = 0; nid < m_num_nodes; ++nid) {
chunks.push_back(chunk_cast<3>(chunk<1>{id<1>{tsk.get_type() == task_type::collective ? nid : 0}, ones, {m_num_nodes}}));
const id<1> min = tsk.get_type() == task_type::collective ? nid : 0;
const id<1> max = min + 1;
chunks.push_back(box_cast<3>(box<1>{min, max}));
}
return chunks;
}
@@ -157,7 +159,7 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
if(tsk.get_hint<experimental::hints::split_2d>() != nullptr) { return split_2d(full_chunk, tsk.get_granularity(), num_chunks); }
return split_1d(full_chunk, tsk.get_granularity(), num_chunks);
}
return std::vector<chunk<3>>{full_chunk};
return std::vector<box<3>>{full_chunk};
})();
assert(chunks.size() <= num_chunks); // We may have created less than requested
assert(!chunks.empty());
@@ -171,7 +173,7 @@ std::vector<command_graph_generator::assigned_chunk> command_graph_generator::sp
std::vector<assigned_chunk> assigned_chunks;
for(size_t i = 0; i < chunks.size(); ++i) {
const node_id nid = (i / chunks_per_node) % m_num_nodes;
assigned_chunks.push_back({nid, chunks[i]});
assigned_chunks.push_back({nid, chunk<3>(chunks[i].get_min(), chunks[i].get_range(), tsk.get_global_size())});
}
return assigned_chunks;
}
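The hunk above is the one place where the global size still matters: assigned_chunk keeps a chunk<3>, so each box returned by the split is converted back at this boundary. A small sketch of that conversion, using the same accessors as the diff (b stands for one of the boxes returned by split_1d / split_2d):

// re-attach the global size only where a chunk<3> is still required
const chunk<3> ck(b.get_min(), b.get_range(), tsk.get_global_size());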
7 changes: 3 additions & 4 deletions src/instruction_graph_generator.cc
@@ -1502,12 +1502,11 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
tsk.has_variable_split() && tsk.get_side_effect_map().empty() && tsk.get_collective_group_id() == non_collective_group_id;
const auto split = tsk.get_hint<experimental::hints::split_2d>() != nullptr ? split_2d : split_1d;

const auto command_sr = ecmd.get_execution_range();
const auto command_chunk = chunk<3>(command_sr.offset, command_sr.range, tsk.get_global_size());
const auto command_chunk = box<3>(ecmd.get_execution_range());

// As a heuristic to keep inter-device communication to a minimum, we split the execution range twice when oversubscription is active: Once to obtain
// contiguous chunks per device, and one more (below) to subdivide the ranges on each device (which can help with computation-communication overlap).
std::vector<chunk<3>> coarse_chunks;
std::vector<box<3>> coarse_chunks;
if(is_splittable_locally && tsk.get_execution_target() == execution_target::device) {
coarse_chunks = split(command_chunk, tsk.get_granularity(), m_system.devices.size());
} else {
@@ -1537,7 +1536,7 @@ std::vector<localized_chunk> generator_impl::split_task_execution_range(const ex
for(size_t coarse_idx = 0; coarse_idx < coarse_chunks.size(); ++coarse_idx) {
for(const auto& fine_chunk : split(coarse_chunks[coarse_idx], tsk.get_granularity(), oversubscribe_factor)) {
auto& localized_chunk = concurrent_chunks.emplace_back();
localized_chunk.execution_range = box(subrange(fine_chunk.offset, fine_chunk.range));
localized_chunk.execution_range = fine_chunk;
if(tsk.get_execution_target() == execution_target::device) {
assert(coarse_idx < m_system.devices.size());
localized_chunk.memory_id = m_system.devices[coarse_idx].native_memory;
67 changes: 38 additions & 29 deletions src/split.cc
@@ -17,26 +17,26 @@ namespace {
using namespace celerity;
using namespace celerity::detail;

[[maybe_unused]] void sanity_check_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split) {
[[maybe_unused]] void sanity_check_split(const box<3>& full_chunk, const std::vector<box<3>>& split) {
region<3> reconstructed_chunk;
for(auto& chnk : split) {
assert(region_intersection(reconstructed_chunk, box<3>(chnk)).empty());
reconstructed_chunk = region_union(box<3>(chnk), reconstructed_chunk);
assert(region_intersection(reconstructed_chunk, chnk).empty());
reconstructed_chunk = region_union(chnk, reconstructed_chunk);
}
assert(region_difference(reconstructed_chunk, box<3>(full_chunk)).empty());
assert(region_difference(reconstructed_chunk, full_chunk).empty());
}

template <int Dims>
std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks(
const chunk<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
const box<3>& full_chunk, const range<3>& granularity, const std::array<size_t, Dims>& actual_num_chunks) {
range<Dims> small_chunk_size{zeros};
range<Dims> large_chunk_size{zeros};
range<Dims> num_large_chunks{zeros};
for(int d = 0; d < Dims; ++d) {
const size_t ideal_chunk_size = full_chunk.range[d] / actual_num_chunks[d];
const size_t ideal_chunk_size = full_chunk.get_range()[d] / actual_num_chunks[d];
small_chunk_size[d] = (ideal_chunk_size / granularity[d]) * granularity[d];
large_chunk_size[d] = small_chunk_size[d] + granularity[d];
num_large_chunks[d] = (full_chunk.range[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
num_large_chunks[d] = (full_chunk.get_range()[d] - small_chunk_size[d] * actual_num_chunks[d]) / granularity[d];
}
return {small_chunk_size, large_chunk_size, num_large_chunks};
}
@@ -51,9 +51,9 @@ std::tuple<range<Dims>, range<Dims>, range<Dims>> compute_small_and_large_chunks
* @returns The number of chunks that can be created in dimension 0 and dimension 1, respectively. These are at most
* (f0, f1) or (f1, f0), however may be less if constrained by the split granularity.
*/
std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
std::array<size_t, 2> assign_split_factors_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t factor, const size_t num_chunks) {
assert(num_chunks % factor == 0);
const size_t max_chunks[2] = {full_chunk.range[0] / granularity[0], full_chunk.range[1] / granularity[1]};
const size_t max_chunks[2] = {full_chunk.get_range()[0] / granularity[0], full_chunk.get_range()[1] / granularity[1]};
const size_t f0 = factor;
const size_t f1 = num_chunks / factor;

Expand All @@ -71,12 +71,12 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const

// If domain is square(-ish), prefer splitting along slower dimension.
// (These bounds have been chosen arbitrarily!)
const double squareishness = std::sqrt(full_chunk.range.size()) / static_cast<double>(full_chunk.range[0]);
const double squareishness = std::sqrt(full_chunk.get_area()) / static_cast<double>(full_chunk.get_range()[0]);
if(squareishness > 0.95 && squareishness < 1.05) { return (f0 >= f1) ? split_0_1 : split_1_0; }

// For non-square domains, prefer split that produces shorter edges (compare sum of circumferences)
const auto circ0 = full_chunk.range[0] / split_0_1[0] + full_chunk.range[1] / split_0_1[1];
const auto circ1 = full_chunk.range[0] / split_1_0[0] + full_chunk.range[1] / split_1_0[1];
const auto circ0 = full_chunk.get_range()[0] / split_0_1[0] + full_chunk.get_range()[1] / split_0_1[1];
const auto circ1 = full_chunk.get_range()[0] / split_1_0[0] + full_chunk.get_range()[1] / split_1_0[1];
return circ0 < circ1 ? split_0_1 : split_1_0;

// TODO: Yet another heuristic we may want to consider is how even chunk sizes are,
@@ -87,28 +87,35 @@ std::array<size_t, 2> assign_split_factors_2d(const chunk<3>& full_chunk, const

namespace celerity::detail {

std::vector<chunk<3>> split_1d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
std::vector<box<3>> split_1d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
assert(full_chunk.get_range()[d] % granularity[d] == 0);
}
#endif

// Due to split granularity requirements or if num_workers > global_size[0],
// we may not be able to create the requested number of chunks.
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.range[0] / granularity[0])};
const std::array<size_t, 1> actual_num_chunks = {std::min(num_chunks, full_chunk.get_range()[0] / granularity[0])};
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<1>(full_chunk, granularity, actual_num_chunks);

std::vector<chunk<3>> result(actual_num_chunks[0], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
std::vector<box<3>> result;
result.reserve(actual_num_chunks[0]);
for(auto i = 0u; i < num_large_chunks[0]; ++i) {
result[i].range[0] = large_chunk_size[0];
result[i].offset[0] += i * large_chunk_size[0];
id<3> min = full_chunk.get_min();
id<3> max = full_chunk.get_max();
min[0] += i * large_chunk_size[0];
max[0] = min[0] + large_chunk_size[0];
result.emplace_back(min, max);
}
for(auto i = num_large_chunks[0]; i < actual_num_chunks[0]; ++i) {
result[i].range[0] = small_chunk_size[0];
result[i].offset[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
id<3> min = full_chunk.get_min();
id<3> max = full_chunk.get_max();
min[0] += num_large_chunks[0] * large_chunk_size[0] + (i - num_large_chunks[0]) * small_chunk_size[0];
max[0] = min[0] + small_chunk_size[0];
result.emplace_back(min, max);
}

#ifndef NDEBUG
@@ -119,12 +126,12 @@
}

// TODO: Make the split dimensions configurable for 3D chunks?
std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
std::vector<box<3>> split_2d(const box<3>& full_chunk, const range<3>& granularity, const size_t num_chunks) {
#ifndef NDEBUG
assert(num_chunks > 0);
for(int d = 0; d < 3; ++d) {
assert(granularity[d] > 0);
assert(full_chunk.range[d] % granularity[d] == 0);
assert(full_chunk.get_range()[d] % granularity[d] == 0);
}
#endif

@@ -147,21 +154,23 @@ std::vector<chunk<3>> split_2d(const chunk<3>& full_chunk, const range<3>& granu
const auto actual_num_chunks = best_chunk_counts;
const auto [small_chunk_size, large_chunk_size, num_large_chunks] = compute_small_and_large_chunks<2>(full_chunk, granularity, actual_num_chunks);

std::vector<chunk<3>> result(actual_num_chunks[0] * actual_num_chunks[1], {full_chunk.offset, full_chunk.range, full_chunk.global_size});
id<3> offset = full_chunk.offset;
std::vector<box<3>> result;
result.reserve(actual_num_chunks[0] * actual_num_chunks[1]);
id<3> offset = full_chunk.get_min();

for(size_t j = 0; j < actual_num_chunks[0]; ++j) {
range<2> chunk_size = {(j < num_large_chunks[0]) ? large_chunk_size[0] : small_chunk_size[0], 0};
for(size_t i = 0; i < actual_num_chunks[1]; ++i) {
chunk_size[1] = (i < num_large_chunks[1]) ? large_chunk_size[1] : small_chunk_size[1];
auto& chnk = result[j * actual_num_chunks[1] + i];
chnk.offset = offset;
chnk.range[0] = chunk_size[0];
chnk.range[1] = chunk_size[1];
const id<3> min = offset;
id<3> max = full_chunk.get_max();
max[0] = min[0] + chunk_size[0];
max[1] = min[1] + chunk_size[1];
result.emplace_back(min, max);
offset[1] += chunk_size[1];
}
offset[0] += chunk_size[0];
offset[1] = full_chunk.offset[1];
offset[1] = full_chunk.get_min()[1];
}

#ifndef NDEBUG
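For reference, a hedged usage sketch of the box-based interface (not part of the PR; it reuses only constructors and helpers that appear elsewhere in this diff, and the expected results match the tests in test/split_tests.cc below):

const auto full_1d = box<3>{subrange<3>({0, 0, 0}, {128, 1, 1})};
const auto parts_1d = split_1d(full_1d, ones, 4);
// -> four disjoint boxes, each 32 wide in dimension 0, that exactly cover full_1d
//    (the coverage property sanity_check_split() asserts in debug builds)

const auto full_2d = box<3>{subrange<3>({0, 0, 0}, {128, 128, 1})};
const auto parts_2d = split_2d(full_2d, ones, 4);
// -> a 2x2 split into four 64x64 boxes, as exercised by the 2D test cases below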
54 changes: 25 additions & 29 deletions test/split_tests.cc
@@ -1,5 +1,3 @@
#include <unordered_set>

#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators_range.hpp>
@@ -14,19 +12,18 @@ using namespace celerity::detail;
namespace {

template <int Dims>
chunk<3> make_full_chunk(range<Dims> range) {
return {id<3>{}, range_cast<3>(range), range_cast<3>(range)};
box<3> make_full_chunk(range<Dims> range) {
return {id<3>{}, range_cast<3>(range)};
}

void check_1d_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& split_chunks, const std::vector<size_t>& chunk_ranges) {
void check_1d_split(const box<3>& full_chunk, const std::vector<box<3>>& split_chunks, const std::vector<size_t>& chunk_ranges) {
REQUIRE(split_chunks.size() == chunk_ranges.size());
id<3> offset = full_chunk.offset;
id<3> offset = full_chunk.get_min();
for(size_t i = 0; i < split_chunks.size(); ++i) {
const auto& chnk = split_chunks[i];
REQUIRE_LOOP(chnk.offset == offset);
REQUIRE_LOOP(chnk.range[0] == chunk_ranges[i]);
REQUIRE_LOOP(chnk.global_size == full_chunk.global_size);
offset[0] += split_chunks[i].range[0];
REQUIRE_LOOP(chnk.get_min() == offset);
REQUIRE_LOOP(chnk.get_range()[0] == chunk_ranges[i]);
offset[0] += chnk.get_range()[0];
}
}

@@ -48,21 +45,20 @@ void check_1d_split(const chunk<3>& full_chunk, const std::vector<chunk<3>>& spl
* to the width of an individual chunk.
*/
void check_2d_split(
const chunk<3>& full_chunk, const std::vector<chunk<3>>& split_chunks, const std::vector<std::pair<size_t, std::vector<size_t>>>& chunk_ranges) {
const box<3>& full_chunk, const std::vector<box<3>>& split_chunks, const std::vector<std::pair<size_t, std::vector<size_t>>>& chunk_ranges) {
REQUIRE(split_chunks.size() == std::accumulate(chunk_ranges.begin(), chunk_ranges.end(), size_t(0), [](size_t c, auto& p) { return c + p.second.size(); }));
REQUIRE(std::all_of(chunk_ranges.begin(), chunk_ranges.end(), [&](auto& p) { return p.second.size() == chunk_ranges[0].second.size(); }));
id<3> offset = full_chunk.offset;
id<3> offset = full_chunk.get_min();
for(size_t j = 0; j < chunk_ranges.size(); ++j) {
const auto& [height, widths] = chunk_ranges[j];
for(size_t i = 0; i < widths.size(); ++i) {
const auto& chnk = split_chunks[j * chunk_ranges[0].second.size() + i];
REQUIRE_LOOP(chnk.offset == offset);
REQUIRE_LOOP(chnk.range[0] == height);
REQUIRE_LOOP(chnk.range[1] == widths[i]);
REQUIRE_LOOP(chnk.global_size == full_chunk.global_size);
REQUIRE_LOOP(chnk.get_min() == offset);
REQUIRE_LOOP(chnk.get_range()[0] == height);
REQUIRE_LOOP(chnk.get_range()[1] == widths[i]);
offset[1] += widths[i];
}
offset[1] = full_chunk.offset[1];
offset[1] = full_chunk.get_min()[1];
offset[0] += height;
}
}
@@ -94,13 +90,13 @@ TEST_CASE("split_1d creates fewer chunks than requested if mandated by granulari
}

TEST_CASE("split_1d preserves offset of original chunk", "[split]") {
const auto full_chunk = chunk<3>{{37, 42, 7}, {128, 1, 1}, {128, 1, 1}};
const auto full_chunk = box<3>{subrange<3>({37, 42, 7}, {128, 1, 1})};
const auto chunks = split_1d(full_chunk, ones, 4);

CHECK(chunks[0].offset == id<3>{37 + 0, 42, 7});
CHECK(chunks[1].offset == id<3>{37 + 32, 42, 7});
CHECK(chunks[2].offset == id<3>{37 + 64, 42, 7});
CHECK(chunks[3].offset == id<3>{37 + 96, 42, 7});
CHECK(chunks[0].get_min() == id<3>{37 + 0, 42, 7});
CHECK(chunks[1].get_min() == id<3>{37 + 32, 42, 7});
CHECK(chunks[2].get_min() == id<3>{37 + 64, 42, 7});
CHECK(chunks[3].get_min() == id<3>{37 + 96, 42, 7});

check_1d_split(full_chunk, chunks, {32, 32, 32, 32});
}
Expand All @@ -109,7 +105,7 @@ TEST_CASE("split_1d preserves ranges of original chunk in other dimensions", "[s
const auto full_chunk = make_full_chunk<3>({128, 42, 341});
const auto chunks = split_1d(full_chunk, ones, 4);
for(size_t i = 0; i < 4; ++i) {
REQUIRE_LOOP(chunks[0].range == range<3>{32, 42, 341});
REQUIRE_LOOP(chunks[0].get_range() == range<3>{32, 42, 341});
}
}

@@ -251,19 +247,19 @@ TEST_CASE("split_2d minimizes edge lengths for non-square domains") {
}

TEST_CASE("split_2d preserves offset of original chunk", "[split]") {
const auto full_chunk = chunk<3>{{37, 42, 7}, {64, 64, 1}, {128, 128, 1}};
const auto full_chunk = box<3>{subrange<3>({37, 42, 7}, {64, 64, 1})};
const auto chunks = split_2d(full_chunk, ones, 4);
CHECK(chunks[0].offset == id<3>{37, 42, 7});
CHECK(chunks[1].offset == id<3>{37, 42 + 32, 7});
CHECK(chunks[2].offset == id<3>{37 + 32, 42 + 0, 7});
CHECK(chunks[3].offset == id<3>{37 + 32, 42 + 32, 7});
CHECK(chunks[0].get_min() == id<3>{37, 42, 7});
CHECK(chunks[1].get_min() == id<3>{37, 42 + 32, 7});
CHECK(chunks[2].get_min() == id<3>{37 + 32, 42 + 0, 7});
CHECK(chunks[3].get_min() == id<3>{37 + 32, 42 + 32, 7});
}

TEST_CASE("split_2d preserves ranges of original chunk in other dimensions", "[split]") {
const auto full_chunk = make_full_chunk<3>({128, 128, 341});
const auto chunks = split_2d(full_chunk, ones, 4);
for(size_t i = 0; i < 4; ++i) {
REQUIRE_LOOP(chunks[i].range == range<3>{64, 64, 341});
REQUIRE_LOOP(chunks[i].get_range() == range<3>{64, 64, 341});
}
}
