Skip to content


MDF: improving performance and adding performance test
Browse files Browse the repository at this point in the history
The performance test can generate random matrices and random
diagonal matrices, or read a matrix from a file. It reports timings
for handle creation, the symbolic phase, and the numeric phase of the
MDF algorithm.

The method names are changed slightly to make MDF more consistent
with the rest of the library. The unit test is improved by checking
the resulting L and U factors against an analytical solution.

The performance work mostly changes the way the discarded fill is
computed at each factorization step: only rows that were impacted by
the last factorized row are selected.
  • Loading branch information
lucbv committed Feb 8, 2023
1 parent 566570a commit 9095beb
Show file tree
Hide file tree
Showing 6 changed files with 688 additions and 52 deletions.
5 changes: 5 additions & 0 deletions perf_test/sparse/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -110,3 +110,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE(
SOURCES KokkosSparse_spiluk.cpp

SOURCES KokkosSparse_mdf.cpp
320 changes: 320 additions & 0 deletions perf_test/sparse/KokkosSparse_mdf.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,320 @@
// ************************************************************************
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

#include <iostream>
#include "KokkosKernels_config.h"
#include "KokkosKernels_Handle.hpp"
#include "KokkosSparse_IOUtils.hpp"
#include "KokkosSparse_Utils_cusparse.hpp"
#include "KokkosSparse_mdf.hpp"
#include "KokkosKernels_TestUtils.hpp"

/// Command-line configuration for the MDF performance test.
///
/// The backend fields hold the number parsed from the command line. For
/// `--threads` and `--openmp` that is the parsed value itself; for `--cuda`,
/// `--hip` and `--sycl` it is the parsed device index plus one, so that zero
/// always means "backend not requested".
struct Params {
  int use_cuda    = 0;  // CUDA device index + 1, or 0 when not requested
  int use_hip     = 0;  // HIP device index + 1, or 0 when not requested
  int use_sycl    = 0;  // SYCL device index + 1, or 0 when not requested
  int use_openmp  = 0;  // value given to --openmp, or 0 when not requested
  int use_threads = 0;  // value given to --threads, or 0 when not requested
  std::string amtx;     // path of the input matrix; empty => generate A
  int m         = 10000;  // number of rows to generate
  int n         = 10000;  // number of columns to generate
  int nnzPerRow = 30;     // number of entries per row to generate
  bool diag    = false;  // generate a diagonal A (only used when amtx is empty)
  bool verbose = false;  // enable verbose output
  int repeat   = 1;      // how many times the symbolic/numeric phases are run
};

/// Functor that fills the CRS graph of a diagonal matrix, one row per call.
///
/// @tparam row_map_t  Row-offset container. Must expose
///                    `non_const_value_type` and be writable through
///                    `operator()` from a const context.
/// @tparam entries_t  Column-index container, writable through `operator()`
///                    from a const context.
template <class row_map_t, class entries_t>
struct diag_generator_functor {
  using size_type = typename row_map_t::non_const_value_type;

  row_map_t row_map;
  entries_t entries;

  diag_generator_functor(row_map_t row_map_, entries_t entries_)
      : row_map(row_map_), entries(entries_) {}

  /// Writes the single diagonal entry of row `rowIdx`: the column index is
  /// `rowIdx` and the row's end offset is `rowIdx + 1`.
  ///
  /// `row_map(0)` is never written here; the caller is expected to supply a
  /// row map whose first offset is already zero.
  // NOTE(review): no device annotation (e.g. KOKKOS_INLINE_FUNCTION) is
  // present on this operator; confirm whether one is needed for the device
  // backends this functor is dispatched to.
  void operator()(const size_type rowIdx) const {
    row_map(rowIdx + 1) = rowIdx + 1;
    entries(rowIdx)     = rowIdx;
  }
};

// Runs the MDF performance experiment for one matrix type.
//
// Obtains the matrix A (read from params.amtx when that path is non-empty,
// otherwise generated: diagonal when params.diag is set, random otherwise),
// then times the creation of the MDF handle and params.repeat rounds of
// mdf_symbolic / mdf_numeric, and prints the timings to std::cout.
//
// Template parameter crsMat_t supplies the size, ordinal, scalar and device
// types as well as the graph and values view types used below.
//
// NOTE(review): this listing looks incomplete. Several calls are left
// unclosed, closing braces are absent, and the verbose sections print only
// labels. The comments below describe the statements that are visible;
// restore the missing lines before relying on this function.
template <typename crsMat_t>
void run_experiment(const Params& params) {
using size_type = typename crsMat_t::size_type;
using lno_t = typename crsMat_t::ordinal_type;
using scalar_t = typename crsMat_t::value_type;
using device_t = typename crsMat_t::device_type;
using exec_space = typename device_t::execution_space;

using graph_t = typename crsMat_t::StaticCrsGraphType;
using rowmap_t = typename graph_t::row_map_type::non_const_type;
using entries_t = typename graph_t::entries_type::non_const_type;
using values_t = typename crsMat_t::values_type::non_const_type;

std::cout << "************************************* \n";
std::cout << "************************************* \n";
crsMat_t A;
lno_t m = params.m;
lno_t n = params.n;
// A non-empty path selects "load from file"; m and n are then taken from the
// loaded matrix instead of the command line.
if (params.amtx.length()) {
std::cout << "Loading A from " << params.amtx << '\n';
// NOTE(review): the argument list of this call is missing (presumably the
// path held in params.amtx -- confirm).
A = KokkosSparse::Impl::read_kokkos_crst_matrix<crsMat_t>(
m = A.numRows();
n = A.numCols();
} else {
if (params.diag) {
std::cout << "Randomly generating diag matrix\n";
// One entry per row: m + 1 row offsets, m column indices, m values.
rowmap_t rowmapA("A row map", m + 1);
entries_t entriesA("A entries", m);
values_t valuesA("A values", m);

// Generate the graph of A
diag_generator_functor diag_generator(rowmapA, entriesA);
// NOTE(review): the functor argument and closing parenthesis of this
// parallel_for are missing (presumably diag_generator -- confirm).
Kokkos::parallel_for(Kokkos::RangePolicy<size_type, exec_space>(0, m),

// Generate the values of A
// Fixed seed (13718); the upper bound passed to fill_random is 10.
Kokkos::Random_XorShift64_Pool<exec_space> rand_pool(13718);
Kokkos::fill_random(valuesA, rand_pool,
10 * Kokkos::ArithTraits<scalar_t>::one());

// Actually put A together
graph_t graph(entriesA, rowmapA);
A = crsMat_t("A matrix", m, valuesA, graph);
} else {
std::cout << "Randomly generating matrix\n";
// Requested number of nonzeros: m rows times nnzPerRow entries per row.
size_type nnzUnused = m * params.nnzPerRow;
A = KokkosSparse::Impl::kk_generate_sparse_matrix<crsMat_t>(
m, n, nnzUnused, 0, (n + 3) / 3);

// NOTE(review): only the labels are printed here; the statements that print
// the row map, entries and values themselves are not visible.
if (params.verbose) {
std::cout << "Matrix A" << std::endl;
std::cout << " row_map A:" << std::endl;
std::cout << " entries A:" << std::endl;
std::cout << " values A:" << std::endl;
std::cout << std::endl;

// Accumulated wall-clock time per phase.
// NOTE(review): no timer.reset() is visible before any measurement below. If
// Kokkos::Timer::seconds() reports time since construction or last reset, each
// reading as written would include all earlier phases -- confirm whether
// reset (and fence) calls were lost from this listing.
Kokkos::Timer timer;
double handleTime = 0;
double symbolicTime = 0;
double numericTime = 0;

KokkosSparse::Experimental::MDF_handle<crsMat_t> handle(A);
handleTime += timer.seconds();

// The handle is created once; symbolic and numeric phases are repeated
// params.repeat times and their times summed.
for (int sumRep = 0; sumRep < params.repeat; sumRep++) {
KokkosSparse::Experimental::mdf_symbolic(A, handle);
symbolicTime += timer.seconds();

KokkosSparse::Experimental::mdf_numeric(A, handle);
numericTime += timer.seconds();

// Symbolic and numeric times are reported as means over params.repeat; the
// handle time is reported as measured.
std::cout << "Mean total time: "
<< handleTime + (symbolicTime / params.repeat) +
(numericTime / params.repeat)
<< std::endl
<< "Handle time: " << handleTime << std::endl
<< "Mean symbolic time: " << (symbolicTime / params.repeat)
<< std::endl
<< "Mean numeric time: " << (numericTime / params.repeat)
<< std::endl;

// NOTE(review): the permutation is fetched and a label printed, but the
// statement that prints the permutation itself is not visible.
if (params.verbose) {
entries_t permutation = handle.get_permutation();

std::cout << "MDF permutation:" << std::endl;
} // run_experiment

/// Prints the command-line usage summary of the MDF performance test to
/// std::cerr: the backend selection flags, the general options, and the
/// settings that control the randomly generated matrix A.
void print_options() {
  std::cerr << "Options\n" << std::endl;

  // Backend flags; each takes a numeric argument.
  std::cerr
      << "\t[Required] BACKEND: '--threads [numThreads]' | '--openmp "
         "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'"
         " | '--sycl [syclDeviceIndex]'"
      << std::endl;

  std::cerr << "\t[Optional] --amtx <path> :: input matrix" << std::endl;
  std::cerr << "\t[Optional] --repeat :: how many times to repeat overall "
            << std::endl;
  std::cerr << "\t[Optional] --verbose :: enable verbose output"
            << std::endl;
  std::cerr << "\nSettings for randomly generated A matrix" << std::endl;
  std::cerr << "\t[Optional] --m :: number of rows to generate"
            << std::endl;
  std::cerr << "\t[Optional] --n :: number of cols to generate"
            << std::endl;
  std::cerr
      << "\t[Optional] --nnz :: number of entries per row to generate"
      << std::endl;
  std::cerr << "\t[Optional] --diag :: generate a diagonal matrix"
            << std::endl;
}  // print_options

/// Parses the command line into `params`.
///
/// Option names are matched with Test::string_compare_no_case. Options that
/// take a value consume the next argument. `--cuda`, `--hip` and `--sycl`
/// store the parsed device index plus one, so that a stored zero keeps
/// meaning "backend not requested".
///
/// @param params  Receives the parsed settings; fields whose option is absent
///                are left untouched.
/// @param argc    Argument count as passed to main.
/// @param argv    Argument vector as passed to main; argv[0] is skipped.
/// @return 0 on success; 1 when an option is not recognized or when a
///         value-taking option is the last argument and so has no value.
int parse_inputs(Params& params, int argc, char** argv) {
  // Integer-valued options: flag name, destination field, and the offset
  // added to the parsed number before it is stored.
  struct IntOption {
    const char* name;
    int* target;
    int offset;
  };
  const IntOption int_options[] = {
      {"--threads", &params.use_threads, 0},
      {"--openmp", &params.use_openmp, 0},
      {"--cuda", &params.use_cuda, 1},
      {"--hip", &params.use_hip, 1},
      {"--sycl", &params.use_sycl, 1},
      {"--m", &params.m, 0},
      {"--n", &params.n, 0},
      {"--nnz", &params.nnzPerRow, 0},
      {"--repeat", &params.repeat, 0},
  };

  for (int i = 1; i < argc; ++i) {
    // Boolean switches take no value.
    if (0 == Test::string_compare_no_case(argv[i], "--diag")) {
      params.diag = true;
      continue;
    }
    if (0 == Test::string_compare_no_case(argv[i], "--verbose")) {
      params.verbose = true;
      continue;
    }

    // Everything else that is recognized takes exactly one value.
    int* int_target = nullptr;
    int int_offset  = 0;
    for (const IntOption& option : int_options) {
      if (0 == Test::string_compare_no_case(argv[i], option.name)) {
        int_target = option.target;
        int_offset = option.offset;
        break;
      }
    }
    const bool is_amtx =
        int_target == nullptr &&
        0 == Test::string_compare_no_case(argv[i], "--amtx");

    if (int_target == nullptr && !is_amtx) {
      std::cerr << "Unrecognized command line argument #" << i << ": "
                << argv[i] << std::endl;
      return 1;
    }

    // Guard against reading argv[argc] (a null pointer) when the flag is the
    // last argument on the command line.
    if (i + 1 >= argc) {
      std::cerr << "Missing value for command line argument #" << i << ": "
                << argv[i] << std::endl;
      return 1;
    }

    ++i;
    if (is_amtx) {
      params.amtx = argv[i];
    } else {
      *int_target = atoi(argv[i]) + int_offset;
    }
  }
  return 0;
}  // parse_inputs

// Entry point of the MDF performance test: parses the command line, derives
// the thread count and device id from the parsed parameters, and selects a
// CrsMatrix type per requested backend.
//
// NOTE(review): this listing looks incomplete. Closing braces are absent, the
// num_threads initializer is cut, only one `#if` (with no matching
// `#else`/`#endif`) is visible, and no call that runs the experiment or that
// initializes/finalizes Kokkos is visible. As written, every backend branch
// only prints an error and returns 1. Restore the missing lines before relying
// on this function.
int main(int argc, char** argv) {
Params params;

// A non-zero result from parse_inputs aborts the program with exit code 1.
if (parse_inputs(params, argc, argv)) {
return 1;
// NOTE(review): the beginning of this initializer expression is missing; only
// its last operand, params.use_threads, and the closing parenthesis remain.
const int num_threads =
params.use_threads); // Assumption is that use_openmp variable
// is provided as number of threads

// If cuda, hip or sycl is used, set device_id
// The parameters store the device index plus one, hence the "- 1".
int device_id = 0;
if (params.use_cuda > 0) {
device_id = params.use_cuda - 1;
if (params.use_hip > 0) {
device_id = params.use_hip - 1;
if (params.use_sycl > 0) {
device_id = params.use_sycl - 1;


// A backend counts as requested when its parameter is non-zero.
// NOTE(review): useSerial does not take useThreads into account, so the serial
// path is also selected when only --threads is given -- confirm intent.
bool useOMP = params.use_openmp != 0;
bool useThreads = params.use_threads != 0;
bool useCUDA = params.use_cuda != 0;
bool useHIP = params.use_hip != 0;
bool useSYCL = params.use_sycl != 0;
bool useSerial = !useOMP && !useCUDA && !useHIP && !useSYCL;

// Each branch below names a CrsMatrix of double values with int ordinals and
// int offsets on the corresponding execution space.
if (useOMP) {
using crsMat_t =
KokkosSparse::CrsMatrix<double, int, Kokkos::OpenMP, void, int>;
std::cout << "ERROR: OpenMP requested, but not available.\n";
return 1;
// NOTE(review): this is the Threads branch, but the message below says
// "OpenMP" -- looks like a copy-paste slip.
if (useThreads) {
using crsMat_t =
KokkosSparse::CrsMatrix<double, int, Kokkos::Threads, void, int>;
std::cout << "ERROR: OpenMP requested, but not available.\n";
return 1;
if (useCUDA) {
using crsMat_t =
KokkosSparse::CrsMatrix<double, int, Kokkos::Cuda, void, int>;
std::cout << "ERROR: CUDA requested, but not available.\n";
return 1;
if (useHIP) {
#if defined(KOKKOS_ENABLE_HIP)
using crsMat_t =
KokkosSparse::CrsMatrix<double, int, Kokkos::HIP, void, int>;
std::cout << "ERROR: HIP requested, but not available.\n";
return 1;
if (useSYCL) {
using crsMat_t =
KokkosSparse::CrsMatrix<double, int, Kokkos::Experimental::SYCL, void,
std::cout << "ERROR: SYCL requested, but not available.\n";
return 1;
if (useSerial) {
using crsMat_t =
KokkosSparse::CrsMatrix<double, int, Kokkos::Serial, void, int>;
std::cout << "ERROR: Serial device requested, but not available.\n";
return 1;
return 0;
} // main

0 comments on commit 9095beb

Please sign in to comment.