Skip to content

Commit

Permalink
Merge pull request #791 from ginkgo-project/benchmarks-auto-repetitions
Browse files Browse the repository at this point in the history
Merge Benchmarks auto repetitions

This PR adds the option to automatically deduce the number of repetitions a benchmark should use. Especially for small working sizes this can lead to more consistent results. The number is chosen such that the benchmark runs at least `min_repetitions` times and either the total runtime surpasses `min_runtime` or the number of repetitions surpasses `max_repetitions`. Additionally, the timing overhead is reduced by increasing the number of iterations between each timing; these intervals grow by the factor `repetition_growth_factor`. All mentioned parameters can be adjusted through command-line flags. This behavior is NOT enabled by default; the flag `-repetitions auto` has to be used to enable it.

The PR also changes the internal repetition loop in the benchmark implementations, using a range-based for-loop similar to Google's benchmark library.

Related PR: #791
  • Loading branch information
MarcelKoch authored Jun 28, 2021
2 parents 3112263 + 33ff686 commit a37c101
Show file tree
Hide file tree
Showing 7 changed files with 356 additions and 73 deletions.
22 changes: 12 additions & 10 deletions benchmark/blas/blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -437,32 +437,34 @@ void apply_blas(const char *operation_name, std::shared_ptr<gko::Executor> exec,

auto op = operation_map[operation_name](exec, parse_dims(test_case));

auto timer = get_timer(exec, FLAGS_gpu_timer);
IterationControl ic(timer);

// warm run
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
for (auto _ : ic.warmup_run()) {
op->prepare();
exec->synchronize();
op->run();
exec->synchronize();
}

// timed run
auto timer = get_timer(exec, FLAGS_gpu_timer);
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
op->prepare();
exec->synchronize();
timer->tic();
op->prepare();
for (auto _ : ic.run()) {
op->run();
timer->toc();
}
auto runtime = timer->compute_average_time();
auto flops = static_cast<double>(op->get_flops());
auto mem = static_cast<double>(op->get_memory());
const auto runtime = ic.compute_average_time();
const auto flops = static_cast<double>(op->get_flops());
const auto mem = static_cast<double>(op->get_memory());
const auto repetitions = ic.get_num_repetitions();
add_or_set_member(blas_case[operation_name], "time", runtime,
allocator);
add_or_set_member(blas_case[operation_name], "flops", flops / runtime,
allocator);
add_or_set_member(blas_case[operation_name], "bandwidth", mem / runtime,
allocator);
add_or_set_member(blas_case[operation_name], "repetitions", repetitions,
allocator);

// compute and write benchmark data
add_or_set_member(blas_case[operation_name], "completed", true,
Expand Down
15 changes: 8 additions & 7 deletions benchmark/conversions/conversions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,24 +72,25 @@ void convert_matrix(const gko::LinOp *matrix_from, const char *format_to,
gko::matrix_data<etype> data{gko::dim<2>{1, 1}, 1};
auto matrix_to =
share(formats::matrix_factory.at(format_to)(exec, data));

auto timer = get_timer(exec, FLAGS_gpu_timer);
IterationControl ic{timer};

// warm run
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
for (auto _ : ic.warmup_run()) {
exec->synchronize();
matrix_to->copy_from(matrix_from);
exec->synchronize();
matrix_to->clear();
}
auto timer = get_timer(exec, FLAGS_gpu_timer);
// timed run
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
exec->synchronize();
timer->tic();
for (auto _ : ic.run()) {
matrix_to->copy_from(matrix_from);
timer->toc();
matrix_to->clear();
}
add_or_set_member(conversion_case[conversion_name], "time",
timer->compute_average_time(), allocator);
add_or_set_member(conversion_case[conversion_name], "repetitions",
timer->get_num_repetitions(), allocator);

// compute and write benchmark data
add_or_set_member(conversion_case[conversion_name], "completed", true,
Expand Down
43 changes: 18 additions & 25 deletions benchmark/preconditioner/preconditioner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,45 +159,38 @@ void run_preconditioner(const char *precond_name,
allocator);
}

IterationControl ic_gen{get_timer(exec, FLAGS_gpu_timer)};
IterationControl ic_apply{get_timer(exec, FLAGS_gpu_timer)};

{
// fast run, gets total time
auto x_clone = clone(x);

auto precond = precond_factory.at(precond_name)(exec);

for (auto i = 0u; i < FLAGS_warmup; ++i) {

for (auto _ : ic_apply.warmup_run()) {
precond->generate(system_matrix)->apply(lend(b), lend(x_clone));
}
auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
auto apply_timer = get_timer(exec, FLAGS_gpu_timer);

exec->synchronize();
generate_timer->tic();
std::unique_ptr<gko::LinOp> precond_op;
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
for (auto _ : ic_gen.run()) {
precond_op = precond->generate(system_matrix);
}
generate_timer->toc();

// the timer is out of the loops to reduce calling synchronize
// overhead, so the timer does not know the number of repetitions.
auto generate_time =
generate_timer->get_total_time() / FLAGS_repetitions;
add_or_set_member(this_precond_data["generate"], "time",
generate_time, allocator);
ic_gen.compute_average_time(), allocator);
add_or_set_member(this_precond_data["generate"], "repetitions",
ic_gen.get_num_repetitions(), allocator);

exec->synchronize();
apply_timer->tic();
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
for (auto _ : ic_apply.run()) {
precond_op->apply(lend(b), lend(x_clone));
}
apply_timer->toc();

// the timer is out of the loops to reduce calling synchronize
// overhead, so the timer does not know the number of repetitions.
auto apply_time = apply_timer->get_total_time() / FLAGS_repetitions;
add_or_set_member(this_precond_data["apply"], "time", apply_time,
allocator);
add_or_set_member(this_precond_data["apply"], "time",
ic_apply.compute_average_time(), allocator);
add_or_set_member(this_precond_data["apply"], "repetitions",
ic_apply.get_num_repetitions(), allocator);
}

if (FLAGS_detailed) {
Expand All @@ -209,24 +202,24 @@ void run_preconditioner(const char *precond_name,
std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
exec->add_logger(gen_logger);
std::unique_ptr<gko::LinOp> precond_op;
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
for (auto i = 0u; i < ic_gen.get_num_repetitions(); ++i) {
precond_op = precond->generate(system_matrix);
}
exec->remove_logger(gko::lend(gen_logger));

gen_logger->write_data(this_precond_data["generate"]["components"],
allocator, FLAGS_repetitions);
allocator, ic_gen.get_num_repetitions());

auto apply_logger =
std::make_shared<OperationLogger>(exec, FLAGS_nested_names);
exec->add_logger(apply_logger);
for (auto i = 0u; i < FLAGS_repetitions; ++i) {
for (auto i = 0u; i < ic_apply.get_num_repetitions(); ++i) {
precond_op->apply(lend(b), lend(x_clone));
}
exec->remove_logger(gko::lend(apply_logger));

apply_logger->write_data(this_precond_data["apply"]["components"],
allocator, FLAGS_repetitions);
allocator, ic_apply.get_num_repetitions());
}

add_or_set_member(this_precond_data, "completed", true, allocator);
Expand Down
30 changes: 17 additions & 13 deletions benchmark/solver/solver.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -399,9 +399,11 @@ void solve_system(const std::string &solver_name,
allocator);
}

IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};

// warm run
auto it_logger = std::make_shared<IterationLogger>(exec);
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
for (auto _ : ic.warmup_run()) {
auto x_clone = clone(x);
auto precond = precond_factory.at(precond_name)(exec);
auto solver = generate_solver(exec, give(precond), solver_name)
Expand Down Expand Up @@ -472,9 +474,10 @@ void solve_system(const std::string &solver_name,

// timed run
auto generate_timer = get_timer(exec, FLAGS_gpu_timer);
auto apply_timer = get_timer(exec, FLAGS_gpu_timer);
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
auto x_clone = clone(x);
auto apply_timer = ic.get_timer();
auto x_clone = clone(x);
for (auto status : ic.run(false)) {
x_clone = clone(x);

exec->synchronize();
generate_timer->tic();
Expand All @@ -487,19 +490,19 @@ void solve_system(const std::string &solver_name,
apply_timer->tic();
solver->apply(lend(b), lend(x_clone));
apply_timer->toc();

if (b->get_size()[1] == 1 && i == FLAGS_repetitions - 1 &&
!FLAGS_overhead) {
auto residual = compute_residual_norm(lend(system_matrix),
lend(b), lend(x_clone));
add_or_set_member(solver_json, "residual_norm", residual,
allocator);
}
}
if (b->get_size()[1] == 1 && !FLAGS_overhead) {
auto residual = compute_residual_norm(lend(system_matrix), lend(b),
lend(x_clone));
add_or_set_member(solver_json, "residual_norm", residual,
allocator);
}
add_or_set_member(solver_json["generate"], "time",
generate_timer->compute_average_time(), allocator);
add_or_set_member(solver_json["apply"], "time",
apply_timer->compute_average_time(), allocator);
add_or_set_member(solver_json, "repetitions",
apply_timer->get_num_repetitions(), allocator);

// compute and write benchmark data
add_or_set_member(solver_json, "completed", true, allocator);
Expand All @@ -515,7 +518,8 @@ void solve_system(const std::string &solver_name,
int main(int argc, char *argv[])
{
// Set the default repetitions = 1.
FLAGS_repetitions = 1;
FLAGS_repetitions = "1";
FLAGS_min_repetitions = 1;
std::string header =
"A benchmark for measuring performance of Ginkgo's solvers.\n";
std::string format =
Expand Down
24 changes: 11 additions & 13 deletions benchmark/spmv/spmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,8 +91,10 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
add_or_set_member(spmv_case[format_name], "max_relative_norm2",
max_relative_norm2, allocator);
}

IterationControl ic{get_timer(exec, FLAGS_gpu_timer)};
// warm run
for (unsigned int i = 0; i < FLAGS_warmup; i++) {
for (auto _ : ic.warmup_run()) {
auto x_clone = clone(x);
exec->synchronize();
system_matrix->apply(lend(b), lend(x_clone));
Expand Down Expand Up @@ -123,12 +125,10 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
// variable is used.
gko::_tuned_value = val;
auto tuning_timer = get_timer(exec, FLAGS_gpu_timer);
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
auto x_clone = clone(x);
exec->synchronize();
tuning_timer->tic();
IterationControl ic_tuning{tuning_timer};
auto x_clone = clone(x);
for (auto _ : ic_tuning.run()) {
system_matrix->apply(lend(b), lend(x_clone));
tuning_timer->toc();
}
tuning_case["time"].PushBack(tuning_timer->compute_average_time(),
allocator);
Expand All @@ -140,16 +140,14 @@ void apply_spmv(const char *format_name, std::shared_ptr<gko::Executor> exec,
#endif // GINKGO_BENCHMARK_ENABLE_TUNING

// timed run
auto timer = get_timer(exec, FLAGS_gpu_timer);
for (unsigned int i = 0; i < FLAGS_repetitions; i++) {
auto x_clone = clone(x);
exec->synchronize();
timer->tic();
auto x_clone = clone(x);
for (auto _ : ic.run()) {
system_matrix->apply(lend(b), lend(x_clone));
timer->toc();
}
add_or_set_member(spmv_case[format_name], "time",
timer->compute_average_time(), allocator);
ic.compute_average_time(), allocator);
add_or_set_member(spmv_case[format_name], "repetitions",
ic.get_num_repetitions(), allocator);

// compute and write benchmark data
add_or_set_member(spmv_case[format_name], "completed", true, allocator);
Expand Down
Loading

0 comments on commit a37c101

Please sign in to comment.