Skip to content

Commit

Permalink
Merge pull request #791 from ginkgo-project/benchmarks-auto-repetitions
Browse files Browse the repository at this point in the history
Merge Benchmarks auto repetitions

This PR adds the option to automatically deduce the number of repetitions a benchmark should use. Especially for small working sizes this can lead to more consistent results. The number is chosen such that the benchmark runs at least `min_repetitions` times and either the total runtime surpasses `min_runtime` or the number of repetitions surpasses `max_repetitions`. Additionally, the timing overhead is reduced by increasing the number of iterations between each timing; these intervals grow by the factor `repetition_growth_factor`. All mentioned parameters can be adjusted through command-line flags. This behavior is NOT enabled by default; the flag `-repetitions auto` has to be used to enable it.

The PR also changes the internal repetition loop in the benchmark implementations, using a range-based for-loop similar to Google's benchmark library.

Related PR: #791
  • Loading branch information
MarcelKoch authored Jun 28, 2021
2 parents 3112263 + 33ff686 commit a37c101
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 73 deletions.
22 changes: 12 additions & 10 deletions benchmark/blas/blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,32 +437,34 @@ void apply_blas(const char *operation_name, std::shared_ptr<gko::Executor> exec,

auto op = operation_map[operation_name](exec, parse_dims(test_case));

auto timer = get_timer(exec, FLAGS_gpu_timer);
IterationControl ic(timer);

// warm run
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
for (auto _ : ic.warmup_run()) {
op->prepare();
exec->synchronize();
op->run();
exec->synchronize();
}

// timed run
auto timer = get_timer(exec, FLAGS_gpu_timer);
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
op->prepare();
exec->synchronize();
timer->tic();
op->prepare();
for (auto _ : ic.run()) {
op->run();
timer->toc();
}
auto runtime = timer->compute_average_time();
auto flops = static_cast<double>(op->get_flops());
auto mem = static_cast<double>(op->get_memory());
const auto runtime = ic.compute_average_time();
const auto flops = static_cast<double>(op->get_flops());
const auto mem = static_cast<double>(op->get_memory());
const auto repetitions = ic.get_num_repetitions();
add_or_set_member(blas_case[operation_name], "time", runtime,
allocator);
add_or_set_member(blas_case[operation_name], "flops", flops / runtime,
allocator);
add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime,
allocator);
add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
allocator);

// compute and write benchmark data
add_or_set_member(blas_case[operation_name], "completed", true,
Expand Down
15 changes: 8 additions & 7 deletions benchmark/conversions/conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,24 +72,25 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to,
gko::matrix_data<etype> data{gko::dim<2>{1, 1}, 1};
auto matrix_to =
share(formats::matrix_factory.at(format_to)(exec, data));

auto timer = get_timer(exec, FLAGS_gpu_timer);
IterationControl ic{timer};

// warm run
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
for (auto _ : ic.warmup_run()) {
exec->synchronize();
matrix_to->copy_from(matrix_from);
exec->synchronize();
matrix_to->clear();
}
auto timer = get_timer(exec, FLAGS_gpu_timer);
// timed run
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
exec->synchronize();
timer->tic();
for (auto _ : ic.run()) {
matrix_to->copy_from(matrix_from);
timer->toc();
matrix_to->clear();
}
add_or_set_member(conversion_case[conversion_name], "time",
timer->compute_average_time(), allocator);
add_or_set_member(conversion_case[conversion_name], "repetitions",
timer->get_num_repetitions(), allocator);

// compute and write benchmark data
add_or_set_member(conversion_case[conversion_name], "completed", true,
Expand Down
43 changes: 18 additions & 25 deletions benchmark/preconditioner/preconditioner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,45 +159,38 @@ void run_preconditioner(const char *precond_name,
allocator);
}

IterationControl ic_gen{get_timer(exec, FLAGS_gpu_timer)};
IterationControl ic_apply{get_timer(exec, FLAGS_gpu_timer)};

{
// fast run, gets total time
auto x_clone = clone(x);

auto precond = precond_factory.at(precond_name)(exec);

for (auto i = 0u; i < FLAGS_warmup; ++i) {

for (auto _ : ic_apply.warmup_run()) {
precond->generate(system_matrix)->apply(lend(b), lend(x_clone));
}
auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
auto apply_timer = get_timer(exec, FLAGS_gpu_timer);

exec->synchronize();
generate_timer->tic();
std::unique_ptr<gko::LinOp> precond_op;
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
for (auto _ : ic_gen.run()) {
precond_op = precond->generate(system_matrix);
}
generate_timer->toc();

// the timer is out of the loops to reduce calling synchronize
// overhead, so the timer does not know the number of repetitions.
auto generate_time =
generate_timer->get_total_time() / FLAGS_repetitions;
add_or_set_member(this_precond_data["generate"], "time",
generate_time, allocator);
ic_gen.compute_average_time(), allocator);
add_or_set_member(this_precond_data["generate"], "repetitions",
ic_gen.get_num_repetitions(), allocator);

exec->synchronize();
apply_timer->tic();
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
for (auto _ : ic_apply.run()) {
precond_op->apply(lend(b), lend(x_clone));
}
apply_timer->toc();

// the timer is out of the loops to reduce calling synchronize
// overhead, so the timer does not know the number of repetitions.
auto apply_time = apply_timer->get_total_time() / FLAGS_repetitions;
add_or_set_member(this_precond_data["apply"], "time", apply_time,
allocator);
add_or_set_member(this_precond_data["apply"], "time",
ic_apply.compute_average_time(), allocator);
add_or_set_member(this_precond_data["apply"], "repetitions",
ic_apply.get_num_repetitions(), allocator);
}

if (FLAGS_detailed) {
Expand All @@ -209,24 +202,24 @@ void run_preconditioner(const char *precond_name,
std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
exec->add_logger(gen_logger);
std::unique_ptr<gko::LinOp> precond_op;
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) {
precond_op = precond->generate(system_matrix);
}
exec->remove_logger(gko::lend(gen_logger));

gen_logger->write_data(this_precond_data["generate"]["components"],
allocator, FLAGS_repetitions);
allocator, ic_gen.get_num_repetitions());

auto apply_logger =
std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
exec->add_logger(apply_logger);
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
for (auto i = 0u; i < ic_apply.get_num_repetitions(); ++i) {
precond_op->apply(lend(b), lend(x_clone));
}
exec->remove_logger(gko::lend(apply_logger));

apply_logger->write_data(this_precond_data["apply"]["components"],
allocator, FLAGS_repetitions);
allocator, ic_apply.get_num_repetitions());
}

add_or_set_member(this_precond_data, "completed", true, allocator);
Expand Down
30 changes: 17 additions & 13 deletions benchmark/solver/solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,11 @@ void solve_system(const std::string &solver_name,
allocator);
}

IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};

// warm run
auto it_logger = std::make_shared<IterationLogger>(exec);
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
for (auto _ : ic.warmup_run()) {
auto x_clone = clone(x);
auto precond = precond_factory.at(precond_name)(exec);
auto solver = generate_solver(exec, give(precond), solver_name)
Expand Down Expand Up @@ -472,9 +474,10 @@ void solve_system(const std::string &solver_name,

// timed run
auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
auto apply_timer = get_timer(exec, FLAGS_gpu_timer);
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
auto x_clone = clone(x);
auto apply_timer = ic.get_timer();
auto x_clone = clone(x);
for (auto status : ic.run(false)) {
x_clone = clone(x);

exec->synchronize();
generate_timer->tic();
Expand All @@ -487,19 +490,19 @@ void solve_system(const std::string &solver_name,
apply_timer->tic();
solver->apply(lend(b), lend(x_clone));
apply_timer->toc();

if (b->get_size()[1] == 1 && i == FLAGS_repetitions - 1 &&
!FLAGS_overhead) {
auto residual = compute_residual_norm(lend(system_matrix),
lend(b), lend(x_clone));
add_or_set_member(solver_json, "residual_norm", residual,
allocator);
}
}
if (b->get_size()[1] == 1 && !FLAGS_overhead) {
auto residual = compute_residual_norm(lend(system_matrix), lend(b),
lend(x_clone));
add_or_set_member(solver_json, "residual_norm", residual,
allocator);
}
add_or_set_member(solver_json["generate"], "time",
generate_timer->compute_average_time(), allocator);
add_or_set_member(solver_json["apply"], "time",
apply_timer->compute_average_time(), allocator);
add_or_set_member(solver_json, "repetitions",
apply_timer->get_num_repetitions(), allocator);

// compute and write benchmark data
add_or_set_member(solver_json, "completed", true, allocator);
Expand All @@ -515,7 +518,8 @@ void solve_system(const std::string &solver_name,
int main(int argc, char *argv[])
{
// Set the default repetitions = 1.
FLAGS_repetitions = 1;
FLAGS_repetitions = "1";
FLAGS_min_repetitions = 1;
std::string header =
"A benchmark for measuring performance of Ginkgo's solvers.\n";
std::string format =
Expand Down
24 changes: 11 additions & 13 deletions benchmark/spmv/spmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
add_or_set_member(spmv_case[format_name], "max_relative_norm2",
max_relative_norm2, allocator);
}

IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};
// warm run
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
for (auto _ : ic.warmup_run()) {
auto x_clone = clone(x);
exec->synchronize();
system_matrix->apply(lend(b), lend(x_clone));
Expand Down Expand Up @@ -123,12 +125,10 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
// variable is used.
gko::_tuned_value = val;
auto tuning_timer = get_timer(exec, FLAGS_gpu_timer);
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
auto x_clone = clone(x);
exec->synchronize();
tuning_timer->tic();
IterationControl ic_tuning{tuning_timer};
auto x_clone = clone(x);
for (auto _ : ic_tuning.run()) {
system_matrix->apply(lend(b), lend(x_clone));
tuning_timer->toc();
}
tuning_case["time"].PushBack(tuning_timer->compute_average_time(),
allocator);
Expand All @@ -140,16 +140,14 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
#endif // GINKGO_BENCHMARK_ENABLE_TUNING

// timed run
auto timer = get_timer(exec, FLAGS_gpu_timer);
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
auto x_clone = clone(x);
exec->synchronize();
timer->tic();
auto x_clone = clone(x);
for (auto _ : ic.run()) {
system_matrix->apply(lend(b), lend(x_clone));
timer->toc();
}
add_or_set_member(spmv_case[format_name], "time",
timer->compute_average_time(), allocator);
ic.compute_average_time(), allocator);
add_or_set_member(spmv_case[format_name], "repetitions",
ic.get_num_repetitions(), allocator);

// compute and write benchmark data
add_or_set_member(spmv_case[format_name], "completed", true, allocator);
Expand Down
Loading

0 comments on commit a37c101

Please sign in to comment.