Skip to content

Commit

Permalink
Spmv perftest improvements (#2146)
Browse files Browse the repository at this point in the history
* Spmv perf test improvements

- Add option to flush caches by filling a dummy buffer between
iterations
- Add option to call the non-reuse interface instead of the handle/reuse
interface
- Fix modes T and H in the non-square case (make x and y the correct lengths)

* Fix mode help text
  • Loading branch information
brian-kelley authored Mar 15, 2024
1 parent acd7141 commit 0c49c21
Showing 1 changed file with 178 additions and 101 deletions.
279 changes: 178 additions & 101 deletions perf_test/sparse/KokkosSparse_kk_spmv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,96 +28,159 @@
#include <KokkosSparse_CrsMatrix.hpp>
#include <KokkosKernels_IOUtils.hpp>
#include <KokkosSparse_IOUtils.hpp>
#include <KokkosSparse_Utils.hpp> // for graph_max_degree
#include <KokkosSparse_spmv.hpp>
#include "KokkosKernels_default_types.hpp"

typedef default_scalar Scalar;
typedef default_lno_t Ordinal;
typedef default_size_type Offset;

template <typename Layout>
void run_spmv(Ordinal numRows, Ordinal numCols, const char* filename, int loop,
int num_vecs, char mode, Scalar beta) {
using matrix_type =
KokkosSparse::CrsMatrix<Scalar, Ordinal, Kokkos::DefaultExecutionSpace,
void, Offset>;
using mv_type = Kokkos::View<Scalar**, Layout>;
using h_mv_type = typename mv_type::HostMirror;

srand(17312837);
matrix_type A;
if (filename)
A = KokkosSparse::Impl::read_kokkos_crst_matrix<matrix_type>(filename);
else {
Offset nnz = 10 * numRows;
// note: the help text says the bandwidth is fixed at 0.01 * numRows
A = KokkosSparse::Impl::kk_generate_sparse_matrix<matrix_type>(
numRows, numCols, nnz, 0, 0.01 * numRows);
}
numRows = A.numRows();
numCols = A.numCols();

std::cout << "A is " << numRows << "x" << numCols << ", with " << A.nnz()
<< " nonzeros\n";
std::cout << "SpMV mode " << mode << ", " << num_vecs
<< " vectors, beta = " << beta << ", multivectors are ";
std::cout << (std::is_same_v<Layout, Kokkos::LayoutLeft> ? "LayoutLeft"
: "LayoutRight");
std::cout << '\n';

mv_type x("X", numCols, num_vecs);
mv_type y("Y", numRows, num_vecs);

h_mv_type h_x = Kokkos::create_mirror_view(x);
h_mv_type h_y = Kokkos::create_mirror_view(y);
h_mv_type h_y_compare = Kokkos::create_mirror(y);

for (int v = 0; v < num_vecs; v++) {
for (int i = 0; i < numCols; i++) {
h_x(i, v) = (Scalar)(1.0 * (rand() % 40) - 20.);
}
}
using Scalar = default_scalar;
using Ordinal = default_lno_t;
using Offset = default_size_type;
using KAT = Kokkos::ArithTraits<Scalar>;

struct SPMVBenchmarking {
// note: CLI currently only allows square matrices to be randomly generated
// and nz/row is fixed at 10
Ordinal num_rows = 110503;
Ordinal num_cols = 110503;
char mode = 'N';
int loop = 100;
int num_vecs = 1;
Scalar beta = KAT::zero();
std::string filename = "";
bool flush_cache = false;
bool non_reuse = false;

Kokkos::deep_copy(x, h_x);

// Benchmark
auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0);
auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0);

// Create handles for both rank-1 and rank-2 cases,
// even though only 1 will get used below (depending on num_vecs)

KokkosSparse::SPMVHandle<Kokkos::DefaultExecutionSpace, matrix_type,
decltype(x0), decltype(y0)>
handle_rank1;
KokkosSparse::SPMVHandle<Kokkos::DefaultExecutionSpace, matrix_type, mv_type,
mv_type>
handle_rank2;
// Do 5 warm up calls (not timed). This will also initialize the handle.
for (int i = 0; i < 5; i++) {
if (num_vecs == 1) {
// run the rank-1 version
KokkosSparse::spmv(&handle_rank1, &mode, 1.0, A, x0, beta, y0);
// Using the parameters above, run and time spmv where x and y use the given
// memory layout.
template <typename Layout>
void run() {
using matrix_type =
KokkosSparse::CrsMatrix<Scalar, Ordinal, Kokkos::DefaultExecutionSpace,
void, Offset>;
using mv_type = Kokkos::View<Scalar**, Layout>;
using h_mv_type = typename mv_type::HostMirror;

srand(17312837);
matrix_type A;
if (filename != "") {
std::cout << "Reading A from file \"" << filename << "\"...\n";
A = KokkosSparse::Impl::read_kokkos_crst_matrix<matrix_type>(
filename.c_str());
num_rows = A.numRows();
num_cols = A.numCols();
} else {
// rank-2
KokkosSparse::spmv(&handle_rank2, &mode, 1.0, A, x, beta, y);
std::cout << "Randomly generating A...\n";
Offset nnz = 10 * num_rows;
// note: the help text says the bandwidth is fixed at 0.01 * numRows
A = KokkosSparse::Impl::kk_generate_sparse_matrix<matrix_type>(
num_rows, num_cols, nnz, 0, 0.01 * num_rows);
}
Kokkos::DefaultExecutionSpace().fence();
}
Kokkos::Timer timer;
for (int i = 0; i < loop; i++) {
if (num_vecs == 1) {
// run the rank-1 version
KokkosSparse::spmv(&handle_rank1, &mode, 1.0, A, x0, beta, y0);
} else {
// rank-2
KokkosSparse::spmv(&handle_rank2, &mode, 1.0, A, x, beta, y);

std::cout << "A is " << A.numRows() << "x" << A.numCols() << ", with "
<< A.nnz() << " nonzeros\n";
std::cout << "Mean nnz/row: " << (double)A.nnz() / A.numRows() << '\n';
std::cout << "Max nnz/row: "
<< KokkosSparse::Impl::graph_max_degree<
Kokkos::DefaultExecutionSpace, Ordinal>(A.graph.row_map)
<< '\n';
std::cout << "SpMV mode " << mode << ", " << num_vecs
<< " vectors, beta = " << beta << ", multivectors are ";
std::cout << (std::is_same_v<Layout, Kokkos::LayoutLeft> ? "LayoutLeft"
: "LayoutRight");
std::cout << '\n';

bool transpose_like = (mode == 'T') || (mode == 'H');

Ordinal xlen = transpose_like ? A.numRows() : A.numCols();
Ordinal ylen = transpose_like ? A.numCols() : A.numRows();

mv_type x("X", xlen, num_vecs);
mv_type y("Y", ylen, num_vecs);

h_mv_type h_x = Kokkos::create_mirror_view(x);
h_mv_type h_y = Kokkos::create_mirror_view(y);
h_mv_type h_y_compare = Kokkos::create_mirror(y);

for (int v = 0; v < num_vecs; v++) {
for (Ordinal i = 0; i < xlen; i++) {
h_x(i, v) = (Scalar)(1.0 * (rand() % 40) - 20.);
}
}

Kokkos::deep_copy(x, h_x);

// Benchmark
auto x0 = Kokkos::subview(x, Kokkos::ALL(), 0);
auto y0 = Kokkos::subview(y, Kokkos::ALL(), 0);

// Create handles for both rank-1 and rank-2 cases,
// even though only 1 will get used below (depending on num_vecs)

KokkosSparse::SPMVHandle<Kokkos::DefaultExecutionSpace, matrix_type,
decltype(x0), decltype(y0)>
handle_rank1;
KokkosSparse::SPMVHandle<Kokkos::DefaultExecutionSpace, matrix_type,
mv_type, mv_type>
handle_rank2;
// Assuming that 1GB is enough to fully clear the L3 cache of a CPU, or the
// L2 of a GPU. (Some AMD EPYC chips have 768 MB L3)
Kokkos::View<char*, Kokkos::DefaultExecutionSpace> cacheFlushData;
if (flush_cache) {
Kokkos::resize(cacheFlushData, 1024 * 1024 * 1024);
}

Kokkos::DefaultExecutionSpace space;

// Do 5 warm up calls (not timed). This will also initialize the handle.
for (int i = 0; i < 5; i++) {
if (num_vecs == 1) {
// run the rank-1 version
if (non_reuse)
KokkosSparse::spmv(space, &mode, 1.0, A, x0, beta, y0);
else
KokkosSparse::spmv(space, &handle_rank1, &mode, 1.0, A, x0, beta, y0);
} else {
// rank-2
if (non_reuse)
KokkosSparse::spmv(space, &mode, 1.0, A, x, beta, y);
else
KokkosSparse::spmv(space, &handle_rank2, &mode, 1.0, A, x, beta, y);
}
space.fence();
}

double totalTime = 0;
Kokkos::Timer timer;
for (int i = 0; i < loop; i++) {
if (flush_cache) {
// Copy some non-zero data to the view multiple times to flush the
// cache. (nonzero in case the system has an optimized path for zero
// pages)
for (int rep = 0; rep < 4; rep++)
Kokkos::deep_copy(space, cacheFlushData, char(rep + 1));
}
space.fence();
timer.reset();
if (num_vecs == 1) {
// run the rank-1 version
if (non_reuse)
KokkosSparse::spmv(space, &mode, 1.0, A, x0, beta, y0);
else
KokkosSparse::spmv(space, &handle_rank1, &mode, 1.0, A, x0, beta, y0);
} else {
// rank-2
if (non_reuse)
KokkosSparse::spmv(space, &mode, 1.0, A, x, beta, y);
else
KokkosSparse::spmv(space, &handle_rank2, &mode, 1.0, A, x, beta, y);
}
space.fence();
totalTime += timer.seconds();
}
Kokkos::DefaultExecutionSpace().fence();
double avg_time = totalTime / loop;
std::cout << avg_time << " s\n";
}
double avg_time = timer.seconds() / loop;
std::cout << avg_time << " s\n";
}
};

void print_help() {
printf(" -s [nrows] : matrix dimension (square)\n");
Expand All @@ -128,30 +191,33 @@ void print_help() {
" --layout left|right : memory layout of x/y. Default depends on "
"build's default execution space\n");
printf(
" -m N|T : matrix apply mode: N (normal, default), T "
"(transpose)\n");
" -m N|T|H|C : matrix apply mode:\n"
" N - normal, default\n"
" T - transpose\n"
" H - conjugate transpose\n"
" C - conjugate\n");
printf(
" -f [file],-fb [file] : Read in Matrix Market (.mtx), or binary "
"(.bin) matrix file.\n");
printf(
" -l [LOOP] : How many spmv to run to aggregate average "
"time. \n");
printf(" -b beta : beta, as in y := Ax + (beta)y\n");
printf(
" --flush : Flush the cache between each spmv call "
"(slow!)\n");
printf(
" --non-reuse : Use non-reuse interface (without "
"SPMVHandle)\n");
}

int main(int argc, char** argv) {
long long int size = 110503; // a prime number
char* filename = NULL;

char mode = 'N';
SPMVBenchmarking sb;
char layout;
if (std::is_same<default_layout, Kokkos::LayoutLeft>::value)
layout = 'L';
else
layout = 'R';
int loop = 100;
int num_vecs = 1;
Scalar beta = 0.0;

if (argc == 1) {
print_help();
Expand All @@ -160,27 +226,31 @@ int main(int argc, char** argv) {

for (int i = 0; i < argc; i++) {
if ((strcmp(argv[i], "-s") == 0)) {
size = atoi(argv[++i]);
// only square matrices supported now
sb.num_rows = atoi(argv[++i]);
sb.num_cols = sb.num_rows;
continue;
}
if ((strcmp(argv[i], "-f") == 0 || strcmp(argv[i], "-fb") == 0)) {
filename = argv[++i];
sb.filename = argv[++i];
continue;
}
if ((strcmp(argv[i], "-l") == 0)) {
loop = atoi(argv[++i]);
sb.loop = atoi(argv[++i]);
continue;
}
if ((strcmp(argv[i], "-m") == 0)) {
mode = toupper(argv[++i][0]);
sb.mode = toupper(argv[++i][0]);
if (sb.mode != 'N' && sb.mode != 'T' && sb.mode != 'C' && sb.mode != 'H')
throw std::invalid_argument("Mode must be one of N, T, C or H.");
continue;
}
if ((strcmp(argv[i], "--nv") == 0)) {
num_vecs = atoi(argv[++i]);
sb.num_vecs = atoi(argv[++i]);
continue;
}
if ((strcmp(argv[i], "-b") == 0)) {
beta = atof(argv[++i]);
sb.beta = atof(argv[++i]);
continue;
}
if ((strcmp(argv[i], "--layout") == 0)) {
Expand All @@ -191,6 +261,15 @@ int main(int argc, char** argv) {
layout = 'R';
else
throw std::runtime_error("Invalid layout");
continue;
}
if ((strcmp(argv[i], "--flush") == 0)) {
sb.flush_cache = true;
continue;
}
if ((strcmp(argv[i], "--non-reuse") == 0)) {
sb.non_reuse = true;
continue;
}
if ((strcmp(argv[i], "--help") == 0) || (strcmp(argv[i], "-h") == 0)) {
print_help();
Expand All @@ -201,11 +280,9 @@ int main(int argc, char** argv) {
Kokkos::initialize(argc, argv);

if (layout == 'L')
run_spmv<Kokkos::LayoutLeft>(size, size, filename, loop, num_vecs, mode,
beta);
sb.template run<Kokkos::LayoutLeft>();
else
run_spmv<Kokkos::LayoutRight>(size, size, filename, loop, num_vecs, mode,
beta);
sb.template run<Kokkos::LayoutRight>();

Kokkos::finalize();
}

0 comments on commit 0c49c21

Please sign in to comment.