Benchmark task and command graph generation
fknorr committed Mar 10, 2022
1 parent f48a669 commit 36454f2
Showing 3 changed files with 204 additions and 10 deletions.
2 changes: 1 addition & 1 deletion ci/run-unit-tests.sh
@@ -13,7 +13,7 @@ fi

# Running "make test" is slow because CTest will spawn one process per test case, adding significant start-up costs.
# We just call all test executables manually to get around this.
-find test -maxdepth 1 -executable -type f | while read test; do
+find test -maxdepth 1 -executable -type f -not -name benchmarks | while read test; do
echo -e "\n\n ---- Running ${test##*/} ----\n\n" >&2
bash /root/capture-backtrace.sh "$test"
done
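
Since the benchmark executable is now excluded from the unit-test loop above, it has to be invoked separately. A minimal sketch, assuming the binary ends up as test/benchmarks (as implied by the -not -name filter) and uses the stock Catch2 command line:

./test/benchmarks "[benchmark][task-graph]"   # task-graph generation benchmarks only
./test/benchmarks "[benchmark]"               # every benchmark in the executable
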
210 changes: 201 additions & 9 deletions test/benchmarks.cc
@@ -1,14 +1,19 @@
#include <catch2/benchmark/catch_benchmark.hpp>
#include <catch2/catch_template_test_macros.hpp>
#include <catch2/catch_test_macros.hpp>
#include <catch2/generators/catch_generators.hpp>

#include "intrusive_graph.h"
#include "task_manager.h"
#include "test_utils.h"

using namespace celerity;
using namespace celerity::detail;

struct bench_graph_node : intrusive_graph_node<bench_graph_node> {};

-template <int N>
-void intrusive_graph_benchmark() {
+// try to cover the dependency counts we'll see in practice
+TEMPLATE_TEST_CASE_SIG("benchmark intrusive graph dependency handling with N nodes", "[benchmark][intrusive_graph_node]", ((int N), N), 1, 10, 100) {
// note that bench_graph_nodes are created/destroyed *within* the BENCHMARK
// in the first two cases while the latter 2 cases only operate on already
existing nodes -- this is intentional; both cases are relevant in practice
@@ -51,13 +56,200 @@ void intrusive_graph_benchmark() {
};
}

-// try to cover the dependency counts we'll see in practice
-TEST_CASE("benchmark intrusive graph dependency handling, N=1", "[benchmark]") {
-intrusive_graph_benchmark<1>();
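// The three benchmark contexts below each expose a create_task() helper and a mock_buffer_factory
// (mbf); the generate_*_graph functions further down are templated on the context type, so every
// topology can be measured at the task-graph, command-graph and scheduler level.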

struct task_manager_benchmark_context {
task_manager tm{1, nullptr, nullptr};
test_utils::mock_buffer_factory mbf{&tm};

~task_manager_benchmark_context() { tm.generate_epoch_task(celerity::detail::epoch_action::shutdown); }

template <int KernelDims, typename CGF>
void create_task(range<KernelDims> global_range, CGF cgf) {
tm.submit_command_group([=](handler& cgh) {
cgf(cgh);
cgh.host_task(global_range, [](partition<KernelDims>) {});
});
}
};
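
// Extends the task-graph benchmark with command-graph generation: each submitted task is
// immediately built into commands by the graph_generator (via a naive_split_transformer configured
// for num_nodes) and flushed through a graph_serializer with a no-op flush callback.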

struct graph_generator_benchmark_context {
const size_t num_nodes;
command_graph cdag;
graph_serializer gsrlzr{cdag, [](node_id, command_pkg, const std::vector<command_id>&) {}};
reduction_manager rm;
task_manager tm{num_nodes, nullptr, &rm};
graph_generator ggen{num_nodes, tm, rm, cdag};
test_utils::mock_buffer_factory mbf{&tm, &ggen};

explicit graph_generator_benchmark_context(size_t num_nodes) : num_nodes{num_nodes} {
tm.register_task_callback([this](const task_id tid) {
naive_split_transformer transformer{this->num_nodes, this->num_nodes};
ggen.build_task(tid, {&transformer});
gsrlzr.flush(tid);
});
}

~graph_generator_benchmark_context() { tm.generate_epoch_task(celerity::detail::epoch_action::shutdown); }

template <int KernelDims, typename CGF>
void create_task(range<KernelDims> global_range, CGF cgf) {
// note: This ignores communication overhead with the scheduler thread
tm.submit_command_group([=](handler& cgh) {
cgf(cgh);
cgh.host_task(global_range, [](partition<KernelDims>) {});
});
}
};
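
// Like graph_generator_benchmark_context, but command-graph generation runs asynchronously on the
// scheduler's worker thread: submission merely notifies the scheduler, which is started in the
// constructor and shut down (FIFO, after all pending tasks) in the destructor.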

struct scheduler_benchmark_context {
// note: This will include thread creation / destruction overhead in the benchmark times.
const size_t num_nodes;
command_graph cdag;
graph_serializer gsrlzr{cdag, [](node_id, command_pkg, const std::vector<command_id>&) {}};
reduction_manager rm;
task_manager tm{num_nodes, nullptr, &rm};
graph_generator ggen{num_nodes, tm, rm, cdag};
scheduler schdlr{ggen, gsrlzr, num_nodes};
test_utils::mock_buffer_factory mbf{&tm, &ggen};

explicit scheduler_benchmark_context(size_t num_nodes) : num_nodes{num_nodes} { //
schdlr.startup();
}

~scheduler_benchmark_context() {
// scheduler operates in a FIFO manner, so awaiting shutdown will await processing of all pending tasks first
const auto tid = tm.generate_epoch_task(celerity::detail::epoch_action::shutdown);
schdlr.notify_task_created(tid);
schdlr.shutdown();
}

template <int KernelDims, typename CGF>
void create_task(range<KernelDims> global_range, CGF cgf) {
const auto tid = tm.submit_command_group([=](handler& cgh) {
cgf(cgh);
cgh.host_task(global_range, [](partition<KernelDims>) {});
});
schdlr.notify_task_created(tid);
}
};
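
// "soup" topology: 1000 tasks without any buffer accesses, so no data dependencies arise between them.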

template <typename BenchmarkContext>
[[gnu::noinline]] void generate_soup_graph(BenchmarkContext&& ctx) {
constexpr int num_tasks = 1000;
const range<1> buffer_range{2048};

for(int t = 0; t < num_tasks; ++t) {
ctx.create_task(buffer_range, [](handler& cgh) {});
}
}
TEST_CASE("benchmark intrusive graph dependency handling, N=10", "[benchmark]") {
intrusive_graph_benchmark<10>();
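
// "chain" topology: every task reads and writes the whole buffer via a one_to_one range mapper, so
// each task depends on its predecessor and the result is a single long dependency chain.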

template <typename BenchmarkContext>
[[gnu::noinline]] void generate_chain_graph(BenchmarkContext&& ctx) {
constexpr int num_tasks = 200;
const range<1> buffer_range{2048};

auto buffer = ctx.mbf.create_buffer(buffer_range);
for(int t = 0; t < num_tasks; ++t) {
ctx.create_task(buffer_range, [&](handler& cgh) { buffer.template get_access<access_mode::read_write>(cgh, celerity::access::one_to_one()); });
}
}
TEST_CASE("benchmark intrusive graph dependency handling, N=100", "[benchmark]") {
intrusive_graph_benchmark<100>();

enum class TreeTopology { Map, Reduce };
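
// With Map, the task count doubles each epoch and every task's output is read by two tasks of the
// next epoch (binary fan-out); with Reduce, each task reads the outputs of two predecessors and the
// task count halves each epoch (binary fan-in). The two buffers are swapped between epochs.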

template <TreeTopology Topology, typename BenchmarkContext>
[[gnu::noinline]] void generate_tree_graph(BenchmarkContext&& ctx) {
constexpr int num_tasks = 100;
const range<1> buffer_range{2048};

auto buffer = ctx.mbf.create_buffer(buffer_range);
auto buffer2 = ctx.mbf.create_buffer(buffer_range);

int numEpochs = std::log2(num_tasks);
int curEpochTasks = Topology == TreeTopology::Map ? 1 : 1 << numEpochs;
int sentinelEpoch = Topology == TreeTopology::Map ? numEpochs - 1 : 0;
int sentinelEpochMax = num_tasks - (curEpochTasks - 1); // how many tasks to generate at the last/first epoch to reach exactly num_tasks

for(int e = 0; e < numEpochs; ++e) {
int taskCount = curEpochTasks;
if(e == sentinelEpoch) taskCount = sentinelEpochMax;

// build tasks for this epoch
for(int t = 0; t < taskCount; ++t) {
ctx.create_task(range<1>{1}, [&](celerity::handler& cgh) {
// mappers constructed to build a binary (potentially inverted) tree
auto read_mapper = [=](const celerity::chunk<1>& chunk) {
return Topology == TreeTopology::Map ? celerity::subrange<1>(t / 2, 1) : celerity::subrange<1>(t * 2, 2);
};
auto write_mapper = [=](const celerity::chunk<1>& chunk) { return celerity::subrange<1>(t, 1); };
buffer.template get_access<access_mode::write>(cgh, write_mapper);
buffer2.template get_access<access_mode::read>(cgh, read_mapper);
});
}

// get ready for the next epoch
if(Topology == TreeTopology::Map) {
curEpochTasks *= 2;
} else {
curEpochTasks /= 2;
}
std::swap(buffer, buffer2);
}
}

// graphs identical to the wave_sim example
template <typename BenchmarkContext>
[[gnu::noinline]] void generate_wave_sim_graph(BenchmarkContext&& ctx) {
constexpr int N = 512;
constexpr float T = 20;
constexpr float dt = 0.25f;

const auto fill = [&](test_utils::mock_buffer<2> u) {
ctx.create_task(u.get_range(), [&](celerity::handler& cgh) { u.get_access<access_mode::discard_write>(cgh, celerity::access::one_to_one{}); });
};

const auto step = [&](test_utils::mock_buffer<2> up, test_utils::mock_buffer<2> u) {
ctx.create_task(up.get_range(), [&](celerity::handler& cgh) {
up.get_access<access_mode::read_write>(cgh, celerity::access::one_to_one{});
u.get_access<access_mode::read>(cgh, celerity::access::neighborhood{1, 1});
});
};

auto up = ctx.mbf.create_buffer(celerity::range<2>(N, N));
auto u = ctx.mbf.create_buffer(celerity::range<2>(N, N));

fill(u);
fill(up);
step(up, u);

auto t = 0.0;
size_t i = 0;
while(t < T) {
step(up, u);
std::swap(u, up);
t += dt;
}
}
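
// Each BENCHMARK below constructs its context inside the measured block, so the timings include
// context setup and teardown (the shutdown epoch generated in the destructor and, for the
// scheduler contexts, thread startup and shutdown).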

TEST_CASE("generating large task graphs", "[benchmark][task-graph]") {
BENCHMARK("soup topology") { generate_soup_graph(task_manager_benchmark_context{}); };
BENCHMARK("chain topology") { generate_chain_graph(task_manager_benchmark_context{}); };
BENCHMARK("map topology") { generate_tree_graph<TreeTopology::Map>(task_manager_benchmark_context{}); };
BENCHMARK("reduce topology") { generate_tree_graph<TreeTopology::Reduce>(task_manager_benchmark_context{}); };
BENCHMARK("wave_sim topology") { generate_wave_sim_graph(task_manager_benchmark_context{}); };
}

TEMPLATE_TEST_CASE_SIG("generating large command graphs for N nodes", "[benchmark][command-graph]", ((size_t NumNodes), NumNodes), 1, 2, 4) {
BENCHMARK("soup topology") { generate_soup_graph(graph_generator_benchmark_context{NumNodes}); };
BENCHMARK("chain topology") { generate_chain_graph(graph_generator_benchmark_context{NumNodes}); };
BENCHMARK("map topology") { generate_tree_graph<TreeTopology::Map>(graph_generator_benchmark_context{NumNodes}); };
BENCHMARK("reduce topology") { generate_tree_graph<TreeTopology::Reduce>(graph_generator_benchmark_context{NumNodes}); };
BENCHMARK("wave_sim topology") { generate_wave_sim_graph(graph_generator_benchmark_context{NumNodes}); };
}

TEMPLATE_TEST_CASE_SIG("processing large graphs with a scheduler thread for N nodes", "[benchmark][scheduler]", ((size_t NumNodes), NumNodes), 1, 2, 4) {
BENCHMARK("soup topology") { generate_soup_graph(scheduler_benchmark_context{NumNodes}); };
BENCHMARK("chain topology") { generate_chain_graph(scheduler_benchmark_context{NumNodes}); };
BENCHMARK("map topology") { generate_tree_graph<TreeTopology::Map>(scheduler_benchmark_context{NumNodes}); };
BENCHMARK("reduce topology") { generate_tree_graph<TreeTopology::Reduce>(scheduler_benchmark_context{NumNodes}); };
BENCHMARK("wave_sim topology") { generate_wave_sim_graph(scheduler_benchmark_context{NumNodes}); };
}
2 changes: 2 additions & 0 deletions test/test_utils.h
@@ -98,6 +98,8 @@ namespace test_utils {

detail::buffer_id get_id() const { return id; }

range<Dims> get_range() const { return size; }

private:
friend class mock_buffer_factory;

