Skip to content

Commit

Permalink
Reorganize par_ilut performance test
Browse files Browse the repository at this point in the history
I saw a big (~2s) discrepancy between the times google benchmark was
measuring for me with UseRealTime and the times I was getting with
a manual Kokkos timer, so I went back to my previous approach of
manual timings and integrated them with google benchmark via UseManualTime.

To reduce some code duplication I added a generic time_call function
to time a lambda call.
  • Loading branch information
jgfouca committed May 7, 2023
1 parent bf06fef commit b60e681
Show file tree
Hide file tree
Showing 2 changed files with 86 additions and 47 deletions.
4 changes: 2 additions & 2 deletions perf_test/sparse/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ if (KokkosKernels_ENABLE_BENCHMARK)
if (Ginkgo_DIR)
find_package(Ginkgo REQUIRED)

target_compile_definitions(sparse_par_ilut PRIVATE "USE_GINKGO")
target_link_libraries(sparse_par_ilut PRIVATE Ginkgo::ginkgo)
target_compile_definitions(KokkosKernels_sparse_par_ilut PRIVATE "USE_GINKGO")
target_link_libraries(KokkosKernels_sparse_par_ilut PRIVATE Ginkgo::ginkgo)
endif()
endif()
129 changes: 84 additions & 45 deletions perf_test/sparse/KokkosSparse_par_ilut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,35 @@ using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
size_type, lno_t, scalar_t, exe_space, mem_space, mem_space>;
using float_t = typename Kokkos::ArithTraits<scalar_t>::mag_type;

///////////////////////////////////////////////////////////////////////////////
// Drive a benchmark `state` loop around a self-timing callable.
//
//   lam   - callable taking no arguments; each call runs one repetition of
//           the work and returns the elapsed time for it (converted to
//           double and reported as seconds).
//   state - benchmark state; must be iterable with range-for and provide
//           SetIterationTime(double) and iterations().
//   name  - label prefixed to every line written to std::cout.
//
// Every returned time is forwarded to state.SetIterationTime() and folded
// into running min/max/sum statistics, which are printed once the loop ends.
// The timing itself is entirely the callable's responsibility, so no timer
// is created here.
//
// NOTE(review): Google Benchmark only uses SetIterationTime() values for
// benchmarks registered with UseManualTime(); confirm the registration
// helper enables it.
template <typename L, typename State>
void time_call(L& lam, State& state, const std::string& name)
///////////////////////////////////////////////////////////////////////////////
{
  double min_time = std::numeric_limits<double>::infinity();
  double max_time = 0.0;
  double sum_time = 0.0;

  for (auto _ : state) {
    // Run timable thing; the callable reports its own elapsed time
    const double time = lam();

    // Record time
    sum_time += time;
    if (time > max_time) max_time = time;
    if (time < min_time) min_time = time;
    state.SetIterationTime(time);

    // Report run so user knows something is happening (std::endl flushes so
    // progress is visible immediately during long runs)
    std::cout << name << " Finished a run in: " << time << " seconds" << std::endl;
  }

  std::cout << name << " LOOP_AVG_TIME: " << sum_time / state.iterations() << std::endl;
  std::cout << name << " LOOP_MAX_TIME: " << max_time << std::endl;
  std::cout << name << " LOOP_MIN_TIME: " << min_time << std::endl;
}

///////////////////////////////////////////////////////////////////////////////
void run_par_ilut_test(benchmark::State& state, KernelHandle& kh,
const sp_matrix_type& A, int& num_iters)
Expand All @@ -94,15 +123,13 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh,
EntriesType U_entries("U_entries", 0);
ValuesType U_values("U_values", 0);

size_type nnzL = 0;
size_type nnzU = 0;
for (auto _ : state) {
// Run par_ilut
state.ResumeTiming();
auto plambda = [&]() {
Kokkos::Timer timer;
timer.reset();
par_ilut_symbolic(&kh, A_row_map, A_entries, L_row_map, U_row_map);

nnzL = par_ilut_handle->get_nnzL();
nnzU = par_ilut_handle->get_nnzU();
size_type nnzL = par_ilut_handle->get_nnzL();
size_type nnzU = par_ilut_handle->get_nnzU();

Kokkos::resize(L_entries, nnzL);
Kokkos::resize(U_entries, nnzU);
Expand All @@ -116,8 +143,7 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh,
par_ilut_numeric(&kh, A_row_map, A_entries, A_values, L_row_map, L_entries,
L_values, U_row_map, U_entries, U_values);
Kokkos::fence();

state.PauseTiming();
const double time = timer.seconds();

// Check worked
num_iters = par_ilut_handle->get_num_iters();
Expand All @@ -128,8 +154,11 @@ void run_par_ilut_test(benchmark::State& state, KernelHandle& kh,
Kokkos::deep_copy(L_row_map, 0);
Kokkos::deep_copy(U_row_map, 0);

std::cout << "Finished par_ilut run" << std::endl;
}
// Return time
return time;
};

time_call(plambda, state, "PAR_ILUT");
}

#ifdef USE_GINKGO
Expand All @@ -156,6 +185,8 @@ void run_par_ilut_test_ginkgo(benchmark::State& state, KernelHandle& kh,
const sp_matrix_type& A, const int& num_iters)
///////////////////////////////////////////////////////////////////////////////
{
const int rows = state.range(0);

auto par_ilut_handle = kh.get_par_ilut_handle();

// Pull out views from CRS
Expand Down Expand Up @@ -184,17 +215,22 @@ void run_par_ilut_test_ginkgo(benchmark::State& state, KernelHandle& kh,

std::shared_ptr<const mtx> a_mtx = std::move(a_mtx_uniq);

for (auto _ : state) {
auto plambda = [&]() {
Kokkos::Timer timer;
timer.reset();

auto fact = gko::factorization::ParIlut<scalar_t, lno_t>::build()
.with_fill_in_limit(par_ilut_handle->get_fill_in_limit())
.with_approximate_select(false)
.with_iterations(num_iters)
.on(exec)
->generate(a_mtx);

// Report run so user knows something is happening
std::cout << "GINKGO Finished a run " << std::endl;
}
// Return time
return timer.seconds();
};

time_call(plambda, state, "GINKGO");
}
#endif

Expand Down Expand Up @@ -229,16 +265,16 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh,
EntriesType U_entries("U_entries", handle_nnz);
ValuesType U_values("U_values", handle_nnz);

for (auto _ : state) {
state.PauseTiming();

if (measure_symbolic) {
state.ResumeTiming();
}
auto plambda = [&]() {
Kokkos::Timer timer;
double time;
timer.reset();
spiluk_symbolic(&kh, fill_lev, A_row_map, A_entries, L_row_map, L_entries,
U_row_map, U_entries);
Kokkos::fence();
state.PauseTiming();
if (measure_symbolic) {
time = timer.seconds();
}

const size_type nnzL = spiluk_handle->get_nnzL();
const size_type nnzU = spiluk_handle->get_nnzU();
Expand All @@ -249,12 +285,12 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh,
Kokkos::resize(U_values, nnzU);

if (!measure_symbolic) {
state.ResumeTiming();
timer.reset();
spiluk_numeric(&kh, fill_lev, A_row_map, A_entries, A_values, L_row_map,
L_entries, L_values, U_row_map, U_entries, U_values);
Kokkos::fence();
time = timer.seconds();
}
spiluk_numeric(&kh, fill_lev, A_row_map, A_entries, A_values, L_row_map,
L_entries, L_values, U_row_map, U_entries, U_values);
Kokkos::fence();
state.PauseTiming();

// Reset inputs
Kokkos::deep_copy(L_row_map, 0);
Expand All @@ -268,14 +304,17 @@ void run_spiluk_test(benchmark::State& state, KernelHandle& kh,

spiluk_handle->reset_handle(rows, handle_nnz, handle_nnz);

std::cout << "Finished spiluk run" << std::endl;
}
return time;
};

std::string name = std::string("SPILUK_") + (measure_symbolic ? "SYM" : "NUM");
time_call(plambda, state, name);
}

///////////////////////////////////////////////////////////////////////////////
int test_par_ilut_perf(const std::string& matrix_file, int rows,
const int nnz_per_row, const int bandwidth,
const int team_size, const int loop, const int test)
int nnz_per_row, const int bandwidth,
int team_size, const int loop, const int test)
///////////////////////////////////////////////////////////////////////////////
{
KernelHandle kh;
Expand All @@ -294,6 +333,14 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows,
A = KokkosSparse::Impl::read_kokkos_crst_matrix<sp_matrix_type>(
matrix_file.c_str());
rows = A.numRows();
nnz_per_row = A.nnz() / rows;
}

// Now that we have A, we can set team_size
if (team_size == -1) {
team_size = KokkosKernels::Impl::kk_is_gpu_exec_space<exe_space>()
? nnz_per_row
: 1;
}

KokkosSparse::sort_crs_matrix(A);
Expand Down Expand Up @@ -330,8 +377,7 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows,
run_par_ilut_test(state, kh, A, num_iters);
};
KokkosKernelsBenchmark::register_benchmark((name + "_par_ilut").c_str(),
plambda, arg_names, args, loop)
->UseRealTime();
plambda, arg_names, args, loop);
}

#ifdef USE_GINKGO
Expand All @@ -340,8 +386,7 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows,
run_par_ilut_test_ginkgo(state, kh, A, num_iters);
};
KokkosKernelsBenchmark::register_benchmark((name + "_gingko").c_str(),
glambda, arg_names, args, loop)
->UseRealTime();
glambda, arg_names, args, loop);
}
#endif

Expand All @@ -353,12 +398,10 @@ int test_par_ilut_perf(const std::string& matrix_file, int rows,
run_spiluk_test(state, kh, A, team_size, false);
};
KokkosKernelsBenchmark::register_benchmark(
(name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop)
->UseRealTime();
(name + "_spiluk_symbolic").c_str(), s1lambda, arg_names, args, loop);

KokkosKernelsBenchmark::register_benchmark(
(name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop)
->UseRealTime();
(name + "_spiluk_numeric").c_str(), s2lambda, arg_names, args, loop);
}

// Need to run before vars used by lambdas go out of scope
Expand Down Expand Up @@ -459,18 +502,14 @@ int main(int argc, char** argv)
}
}

// Set dependent defaults
// Set dependent defaults. Default team_size cannot be set
// until we know more about A
if (nnz_per_row == -1) {
nnz_per_row = std::min(rows / 100, 50);
}
if (bandwidth == -1) {
bandwidth = std::max(2 * (int)std::sqrt(rows), 2 * nnz_per_row);
}
if (team_size == -1) {
team_size = KokkosKernels::Impl::kk_is_gpu_exec_space<exe_space>()
? nnz_per_row
: 1;
}

Kokkos::initialize(argc, argv);
{
Expand Down

0 comments on commit b60e681

Please sign in to comment.