diff --git a/perf_test/KokkosKernels_perf_test_utilities.hpp b/perf_test/KokkosKernels_perf_test_utilities.hpp index cc7f70ccec..fdbee134eb 100644 --- a/perf_test/KokkosKernels_perf_test_utilities.hpp +++ b/perf_test/KokkosKernels_perf_test_utilities.hpp @@ -67,6 +67,26 @@ void process_arg_int(char const* str_val, int& val) { } } +void process_arg_double(char const* str_val, double& val) { + errno = 0; + char* ptr_end; + val = std::strtod(str_val, &ptr_end); + + if (str_val == ptr_end) { + std::stringstream ss; + ss << "Error: cannot convert command line argument '" << str_val + << "' to a double.\n"; + throw std::invalid_argument(ss.str()); + } + + if (errno == ERANGE) { + std::stringstream ss; + ss << "Error: converted value for command line argument '" << str_val + << "' falls out of range.\n"; + throw std::invalid_argument(ss.str()); + } +} + bool check_arg_int(int const i, int const argc, char** argv, char const* name, int& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { @@ -83,6 +103,22 @@ bool check_arg_int(int const i, int const argc, char** argv, char const* name, return true; } +bool check_arg_double(int const i, int const argc, char** argv, + char const* name, double& val) { + if (0 != Test::string_compare_no_case(argv[i], name)) { + return false; + } + + if (i < argc - 1) { + process_arg_double(argv[i + 1], val); + } else { + std::stringstream msg; + msg << name << " input argument needs to be followed by a real number"; + throw std::invalid_argument(msg.str()); + } + return true; +} + bool check_arg_bool(int const i, int const /*argc*/, char** argv, char const* name, bool& val) { if (0 != Test::string_compare_no_case(argv[i], name)) { diff --git a/perf_test/graph/KokkosGraph_color.cpp b/perf_test/graph/KokkosGraph_color.cpp index cc969e52a1..57f241d7b1 100644 --- a/perf_test/graph/KokkosGraph_color.cpp +++ b/perf_test/graph/KokkosGraph_color.cpp @@ -379,7 +379,7 @@ void run_experiment(crsGraph_t crsGraph, int num_cols, Parameters params) { } } - if (params.coloring_output_file != NULL) { + if (params.coloring_output_file != "") { std::ofstream os(params.coloring_output_file, std::ofstream::out); KokkosKernels::Impl::print_1Dview(os, colors, true, "\n"); } @@ -420,7 +420,7 @@ void run_multi_mem_experiment(Parameters params) { // typedef typename slow_graph_t::entries_type::const_type // const_slow_cols_view_t; - char *a_mat_file = params.a_mtx_bin_file; + const char *a_mat_file = params.a_mtx_bin_file.c_str(); // char *b_mat_file = params.b_mtx_bin_file; // char *c_mat_file = params.c_mtx_bin_file; @@ -581,7 +581,7 @@ int main(int argc, char **argv) { if (parse_inputs(params, argc, argv)) { return 1; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a matrix file" << std::endl; return 0; } diff --git a/perf_test/graph/KokkosGraph_run_triangle.hpp b/perf_test/graph/KokkosGraph_run_triangle.hpp index 30d1ec77f6..2bdea59bea 100644 --- a/perf_test/graph/KokkosGraph_run_triangle.hpp +++ b/perf_test/graph/KokkosGraph_run_triangle.hpp @@ -64,7 +64,7 @@ bool is_same_graph(crsGraph_t output_mat1, crsGraph_t output_mat2) { if (!is_identical) return false; if (!is_identical) { - std::cout << "Incorret values" << std::endl; + std::cout << "Incorrect values" << std::endl; } return true; } diff --git a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp b/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp deleted file mode 100644 index 269baf3fdc..0000000000 --- a/perf_test/sparse/KokkosSparse_multimem_spgemm.hpp +++ /dev/null @@ -1,216 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosSparse_CrsMatrix.hpp" -#include "KokkosSparse_run_spgemm.hpp" -#include "KokkosSparse_IOUtils.hpp" - -namespace KokkosKernels { - -namespace Experiment { - -template -void run_multi_mem_spgemm(Parameters params) { - typedef exec_space myExecSpace; - typedef Kokkos::Device myFastDevice; - typedef Kokkos::Device mySlowExecSpace; - - typedef typename KokkosSparse::CrsMatrix - fast_crstmat_t; - typedef typename KokkosSparse::CrsMatrix - slow_crstmat_t; - - char *a_mat_file = params.a_mtx_bin_file; - char *b_mat_file = params.b_mtx_bin_file; - char *c_mat_file = params.c_mtx_bin_file; - - slow_crstmat_t a_slow_crsmat, b_slow_crsmat, c_slow_crsmat; - fast_crstmat_t a_fast_crsmat, b_fast_crsmat, c_fast_crsmat; - - // read a and b matrices and store them on slow or fast memory. - - if (params.a_mem_space == 1) { - a_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); - } else { - a_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(a_mat_file); - } - - if ((b_mat_file == NULL || strcmp(b_mat_file, a_mat_file) == 0) && - params.b_mem_space == params.a_mem_space) { - std::cout << "Using A matrix for B as well" << std::endl; - b_fast_crsmat = a_fast_crsmat; - b_slow_crsmat = a_slow_crsmat; - } else if (params.b_mem_space == 1) { - if (b_mat_file == NULL) b_mat_file = a_mat_file; - b_fast_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); - } else { - if (b_mat_file == NULL) b_mat_file = a_mat_file; - b_slow_crsmat = - KokkosSparse::Impl::read_kokkos_crst_matrix(b_mat_file); - } - - if (params.a_mem_space == 1) { - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } - - } else { - // C is in slow memory. - if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, fast_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_fast_crsmat, - params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } - - } else { - // C is in slow memory. - if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, fast_crstmat_t, slow_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_fast_crsmat, b_slow_crsmat, - params); - } - } - } - } else { - // A is in slow memory - if (params.b_mem_space == 1) { - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } - - } else { - // C is in slow memory. - if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, fast_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_fast_crsmat, - params); - } - } - } else { - // B is in slow memory - if (params.c_mem_space == 1) { - if (params.work_mem_space == 1) { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, fast_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } else { - c_fast_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, fast_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } - - } else { - // C is in slow memory. - if (params.work_mem_space == 1) { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, slow_crstmat_t, - hbm_mem_space, hbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } else { - c_slow_crsmat = KokkosKernels::Experiment::run_experiment< - myExecSpace, slow_crstmat_t, slow_crstmat_t, slow_crstmat_t, - sbm_mem_space, sbm_mem_space>(a_slow_crsmat, b_slow_crsmat, - params); - } - } - } - } - - if (c_mat_file != NULL) { - if (params.c_mem_space == 1) { - KokkosSparse::sort_crs_matrix(c_fast_crsmat); - - KokkosSparse::Impl::write_graph_bin( - (lno_t)(c_fast_crsmat.numRows()), - (size_type)(c_fast_crsmat.graph.entries.extent(0)), - c_fast_crsmat.graph.row_map.data(), - c_fast_crsmat.graph.entries.data(), c_fast_crsmat.values.data(), - c_mat_file); - } else { - KokkosSparse::sort_crs_matrix(c_slow_crsmat); - - KokkosSparse::Impl::write_graph_bin( - (lno_t)c_slow_crsmat.numRows(), - (size_type)c_slow_crsmat.graph.entries.extent(0), - c_slow_crsmat.graph.row_map.data(), - c_slow_crsmat.graph.entries.data(), c_slow_crsmat.values.data(), - c_mat_file); - } - } -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/sparse/KokkosSparse_run_spgemm.hpp b/perf_test/sparse/KokkosSparse_run_spgemm.hpp deleted file mode 100644 index 67d61d1f75..0000000000 --- a/perf_test/sparse/KokkosSparse_run_spgemm.hpp +++ /dev/null @@ -1,301 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#include "KokkosSparse_spgemm.hpp" -#include "KokkosKernels_TestParameters.hpp" -#include "KokkosSparse_SortCrs.hpp" - -#define TRANPOSEFIRST false -#define TRANPOSESECOND false - -namespace KokkosKernels { - -namespace Experiment { -template -bool is_same_matrix(crsMat_t output_mat1, crsMat_t output_mat2) { - typedef typename crsMat_t::StaticCrsGraphType graph_t; - typedef typename graph_t::row_map_type::non_const_type lno_view_t; - typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - - size_t nrows1 = output_mat1.graph.row_map.extent(0); - size_t nentries1 = output_mat1.graph.entries.extent(0); - size_t nvals1 = output_mat1.values.extent(0); - - size_t nrows2 = output_mat2.graph.row_map.extent(0); - size_t nentries2 = output_mat2.graph.entries.extent(0); - size_t nvals2 = output_mat2.values.extent(0); - - KokkosSparse::sort_crs_matrix(output_mat1); - - if (nrows1 != nrows2) { - std::cerr << "row count is different" << std::endl; - return false; - } - if (nentries1 != nentries2) { - std::cerr << "nentries2 is different" << std::endl; - return false; - } - if (nvals1 != nvals2) { - std::cerr << "nvals1 is different" << std::endl; - return false; - } - - KokkosSparse::sort_crs_matrix(output_mat2); - - bool is_identical = true; - is_identical = KokkosKernels::Impl::kk_is_identical_view< - typename graph_t::row_map_type, typename graph_t::row_map_type, - typename lno_view_t::value_type, typename device::execution_space>( - output_mat1.graph.row_map, output_mat2.graph.row_map, 0); - if (!is_identical) { - std::cerr << "rowmaps differ" << std::endl; - return false; - } - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, - typename device::execution_space>(output_mat1.graph.entries, - output_mat2.graph.entries, 0); - if (!is_identical) { - for (size_t i = 0; i < nrows1; ++i) { - size_t rb = output_mat1.graph.row_map(i); - size_t re = output_mat1.graph.row_map(i + 1); - bool incorrect = false; - for (size_t j = rb; j < re; ++j) { - if (output_mat1.graph.entries(j) != output_mat2.graph.entries(j)) { - incorrect = true; - break; - } - } - if (incorrect) { - for (size_t j = rb; j < re; ++j) { - std::cerr << "row:" << i << " j:" << j - << " h_ent1(j):" << output_mat1.graph.entries(j) - << " h_ent2(j):" << output_mat2.graph.entries(j) - << " rb:" << rb << " re:" << re << std::endl; - } - } - } - std::cerr << "entries differ" << std::endl; - return false; - } - - is_identical = KokkosKernels::Impl::kk_is_identical_view< - scalar_view_t, scalar_view_t, typename scalar_view_t::value_type, - typename device::execution_space>(output_mat1.values, output_mat2.values, - 0.000001); - if (!is_identical) { - std::cerr << "Incorret values" << std::endl; - } - return true; -} - -template -crsMat_t3 run_experiment(crsMat_t crsMat, crsMat_t2 crsMat2, - Parameters params) { - using namespace KokkosSparse; - using namespace KokkosSparse::Experimental; - using device_t = Kokkos::Device; - int algorithm = params.algorithm; - int repeat = params.repeat; - int chunk_size = params.chunk_size; - - int shmemsize = params.shmemsize; - int team_size = params.team_size; - int use_dynamic_scheduling = params.use_dynamic_scheduling; - int verbose = params.verbose; - int calculate_read_write_cost = params.calculate_read_write_cost; - // char spgemm_step = params.spgemm_step; - int vector_size = params.vector_size; - int check_output = params.check_output; - int mkl_keep_output = params.mkl_keep_output; - // spgemm_step++; - typedef typename crsMat_t3::values_type::non_const_type scalar_view_t; - typedef typename crsMat_t3::row_map_type::non_const_type lno_view_t; - typedef typename crsMat_t3::index_type::non_const_type lno_nnz_view_t; - typedef typename lno_nnz_view_t::value_type lno_t; - typedef typename lno_view_t::value_type size_type; - typedef typename scalar_view_t::value_type scalar_t; - - lno_view_t row_mapC; - lno_nnz_view_t entriesC; - scalar_view_t valuesC; - - typedef KokkosKernels::Experimental::KokkosKernelsHandle< - size_type, lno_t, scalar_t, ExecSpace, TempMemSpace, PersistentMemSpace> - KernelHandle; - - typedef typename lno_nnz_view_t::value_type idx; - typedef typename lno_view_t::value_type size_type; - - KernelHandle kh; - kh.set_team_work_size(chunk_size); - kh.set_shmem_size(shmemsize); - kh.set_suggested_team_size(team_size); - kh.set_suggested_vector_size(vector_size); - - if (use_dynamic_scheduling) { - kh.set_dynamic_scheduling(true); - } - if (verbose) { - kh.set_verbose(true); - } - - const idx m = crsMat.numRows(); - const idx n = crsMat2.numRows(); - const idx k = crsMat2.numCols(); - - if (verbose) std::cout << "m:" << m << " n:" << n << " k:" << k << std::endl; - if (n < crsMat.numCols()) { - std::cerr << "left.numCols():" << crsMat.numCols() - << " right.numRows():" << crsMat2.numRows() << std::endl; - exit(1); - } - - // The reference product (for verifying correctness) - // Don't allocate them if they won't be used, but they must be declared here. - lno_view_t row_mapC_ref; - lno_nnz_view_t entriesC_ref; - scalar_view_t valuesC_ref; - // Reference output has same type as actual output - crsMat_t3 Ccrsmat_ref; - - if (check_output) { - if (verbose) std::cout << "Running a reference algorithm" << std::endl; - row_mapC_ref = lno_view_t("non_const_lnow_row", m + 1); - KernelHandle sequential_kh; - sequential_kh.set_team_work_size(chunk_size); - sequential_kh.set_shmem_size(shmemsize); - sequential_kh.set_suggested_team_size(team_size); - sequential_kh.create_spgemm_handle(KokkosSparse::SPGEMM_SERIAL); - - if (use_dynamic_scheduling) { - sequential_kh.set_dynamic_scheduling(true); - } - - spgemm_symbolic(&sequential_kh, m, n, k, crsMat.graph.row_map, - crsMat.graph.entries, TRANPOSEFIRST, crsMat2.graph.row_map, - crsMat2.graph.entries, TRANPOSESECOND, row_mapC_ref); - - ExecSpace().fence(); - - size_type c_nnz_size = sequential_kh.get_spgemm_handle()->get_c_nnz(); - entriesC_ref = lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); - valuesC_ref = scalar_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); - - spgemm_numeric(&sequential_kh, m, n, k, crsMat.graph.row_map, - crsMat.graph.entries, crsMat.values, TRANPOSEFIRST, - - crsMat2.graph.row_map, crsMat2.graph.entries, crsMat2.values, - TRANPOSESECOND, row_mapC_ref, entriesC_ref, valuesC_ref); - ExecSpace().fence(); - - Ccrsmat_ref = crsMat_t3("CorrectC", m, k, valuesC_ref.extent(0), - valuesC_ref, row_mapC_ref, entriesC_ref); - } - - for (int i = 0; i < repeat; ++i) { - kh.create_spgemm_handle(KokkosSparse::SPGEMMAlgorithm(algorithm)); - - kh.get_spgemm_handle()->mkl_keep_output = mkl_keep_output; - kh.get_spgemm_handle()->set_mkl_sort_option(params.mkl_sort_option); - - // if mkl2 input needs to be converted to 1base. - kh.get_spgemm_handle()->mkl_convert_to_1base = true; - - // 250000 default. if cache-mode is used on KNL can increase to 1M. - kh.get_spgemm_handle()->MaxColDenseAcc = params.MaxColDenseAcc; - - if (i == 0) { - kh.get_spgemm_handle()->set_read_write_cost_calc( - calculate_read_write_cost); - } - // do the compression whether in 2 step, or 1 step. - kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); - // whether to scale the hash more. default is 1, so no scale. - kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); - // max occupancy in 1-level LP hashes. LL hashes can be 100% - kh.get_spgemm_handle()->set_first_level_hash_cut_off( - params.first_level_hash_cut_off); - // min reduction on FLOPs to run compression - kh.get_spgemm_handle()->set_compression_cut_off(params.compression_cut_off); - - row_mapC = lno_view_t("non_const_lnow_row", m + 1); - entriesC = lno_nnz_view_t("entriesC (empty)", 0); - valuesC = scalar_view_t("valuesC (empty)", 0); - - Kokkos::Timer timer1; - spgemm_symbolic(&kh, m, n, k, crsMat.graph.row_map, crsMat.graph.entries, - TRANPOSEFIRST, crsMat2.graph.row_map, crsMat2.graph.entries, - TRANPOSESECOND, row_mapC); - - ExecSpace().fence(); - double symbolic_time = timer1.seconds(); - - Kokkos::Timer timer3; - size_type c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); - if (verbose) std::cout << "C SIZE:" << c_nnz_size << std::endl; - if (c_nnz_size) { - entriesC = lno_nnz_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), - c_nnz_size); - valuesC = scalar_view_t( - Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), - c_nnz_size); - } - spgemm_numeric(&kh, m, n, k, crsMat.graph.row_map, crsMat.graph.entries, - crsMat.values, TRANPOSEFIRST, - - crsMat2.graph.row_map, crsMat2.graph.entries, crsMat2.values, - TRANPOSESECOND, row_mapC, entriesC, valuesC); - ExecSpace().fence(); - double numeric_time = timer3.seconds(); - - std::cout << "mm_time:" << symbolic_time + numeric_time - << " symbolic_time:" << symbolic_time - << " numeric_time:" << numeric_time << std::endl; - } - if (verbose) { - std::cout << "row_mapC:" << row_mapC.extent(0) << std::endl; - std::cout << "entriesC:" << entriesC.extent(0) << std::endl; - std::cout << "valuesC:" << valuesC.extent(0) << std::endl; - KokkosKernels::Impl::print_1Dview(valuesC); - KokkosKernels::Impl::print_1Dview(entriesC); - KokkosKernels::Impl::print_1Dview(row_mapC); - } - crsMat_t3 Ccrsmat_result("CrsMatrixC", m, k, valuesC.extent(0), valuesC, - row_mapC, entriesC); - if (check_output) { - bool is_identical = - is_same_matrix(Ccrsmat_result, Ccrsmat_ref); - if (!is_identical) { - std::cerr << "Result differs. If values are differing, might be floating " - "point order error." - << std::endl; - exit(1); - } - } - return Ccrsmat_result; -} - -} // namespace Experiment -} // namespace KokkosKernels diff --git a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp index a2004e007b..db4141368a 100644 --- a/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp +++ b/perf_test/sparse/KokkosSparse_run_spgemm_jacobi.hpp @@ -299,9 +299,9 @@ void run_spgemm_jacobi(Parameters params) { void, size_type> slow_crstmat_t; - char *a_mat_file = params.a_mtx_bin_file; - char *b_mat_file = params.b_mtx_bin_file; - char *c_mat_file = params.c_mtx_bin_file; + const char *a_mat_file = params.a_mtx_bin_file.c_str(); + const char *b_mat_file = params.b_mtx_bin_file.c_str(); + const char *c_mat_file = params.c_mtx_bin_file.c_str(); slow_crstmat_t a_slow_crsmat, b_slow_crsmat, c_slow_crsmat; fast_crstmat_t a_fast_crsmat, b_fast_crsmat, c_fast_crsmat; diff --git a/perf_test/sparse/KokkosSparse_spgemm.cpp b/perf_test/sparse/KokkosSparse_spgemm.cpp index d46e9f6f11..cee68ef11a 100644 --- a/perf_test/sparse/KokkosSparse_spgemm.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm.cpp @@ -16,22 +16,119 @@ #include #include "KokkosKernels_config.h" #include "KokkosKernels_default_types.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosSparse_multimem_spgemm.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_CrsMatrix.hpp" +#include "KokkosSparse_spgemm.hpp" +#include "KokkosSparse_SortCrs.hpp" +#include "KokkosBlas1_nrminf.hpp" +#include "KokkosBlas1_axpby.hpp" +#include "KokkosKernels_TestParameters.hpp" #include "KokkosKernels_TestUtils.hpp" +#include "KokkosKernels_perf_test_utilities.hpp" + +#define TRANSPOSEFIRST false +#define TRANSPOSESECOND false + +template +bool is_same_matrix(crsMat_t output_mat_actual, crsMat_t output_mat_reference) { + typedef typename crsMat_t::StaticCrsGraphType graph_t; + typedef typename graph_t::row_map_type::non_const_type lno_view_t; + typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + + size_t nrows1 = output_mat_actual.graph.row_map.extent(0); + size_t ncols1 = output_mat_actual.graph.row_map.extent(0); + size_t nentries1 = output_mat_actual.graph.entries.extent(0); + size_t nvals1 = output_mat_actual.values.extent(0); + + size_t nrows2 = output_mat_reference.graph.row_map.extent(0); + size_t ncols2 = output_mat_reference.graph.row_map.extent(0); + size_t nentries2 = output_mat_reference.graph.entries.extent(0); + size_t nvals2 = output_mat_reference.values.extent(0); + + if (nrows1 != nrows2 || ncols1 != ncols2) { + std::cerr << "Wrong dimensions: is " << nrows1 << 'x' << ncols1 + << " but should be " << nrows2 << 'x' << ncols2 << '\n'; + return false; + } + if (nentries1 != nentries2) { + std::cerr << "Wrong number of entries: " << nentries1 + << ", but should have " << nentries2 << '\n'; + return false; + } + if (nvals1 != nvals2) { + std::cerr << "Wrong number of values: " << nvals1 << ", but should have " + << nvals2 << '\n'; + return false; + } + + bool is_identical = true; + is_identical = KokkosKernels::Impl::kk_is_identical_view< + typename graph_t::row_map_type, typename graph_t::row_map_type, + typename lno_view_t::value_type, typename device::execution_space>( + output_mat_actual.graph.row_map, output_mat_reference.graph.row_map, 0); + if (!is_identical) { + std::cerr << "Wrong rowmap:\n"; + KokkosKernels::Impl::print_1Dview(std::cerr, + output_mat_actual.graph.row_map); + std::cerr << "but should be:\n"; + KokkosKernels::Impl::print_1Dview(std::cerr, + output_mat_reference.graph.row_map); + return false; + } + + is_identical = KokkosKernels::Impl::kk_is_identical_view< + lno_nnz_view_t, lno_nnz_view_t, typename lno_nnz_view_t::value_type, + typename device::execution_space>(output_mat_actual.graph.entries, + output_mat_reference.graph.entries, 0); + if (!is_identical) { + for (size_t i = 0; i < nrows1; ++i) { + size_t rb = output_mat_actual.graph.row_map(i); + size_t re = output_mat_actual.graph.row_map(i + 1); + bool incorrect = false; + for (size_t j = rb; j < re; ++j) { + if (output_mat_actual.graph.entries(j) != + output_mat_reference.graph.entries(j)) { + incorrect = true; + break; + } + } + if (incorrect) { + for (size_t j = rb; j < re; ++j) { + std::cerr << "row:" << i << " j:" << j + << " h_ent1(j):" << output_mat_actual.graph.entries(j) + << " h_ent2(j):" << output_mat_reference.graph.entries(j) + << " rb:" << rb << " re:" << re << std::endl; + } + } + } + std::cerr << "Wrong entries, see above." << std::endl; + return false; + } + + scalar_view_t valueDiff( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "spgemm values diff"), + output_mat_actual.values.extent(0)); + Kokkos::deep_copy(valueDiff, output_mat_actual.values); + KokkosBlas::axpy(-1.0, output_mat_reference.values, valueDiff); + auto maxDiff = KokkosBlas::nrminf(valueDiff); + + std::cout + << "Absolute maximum difference between actual and reference C values: " + << maxDiff << '\n'; + + return true; +} void print_options() { std::cerr << "Options\n" << std::endl; + std::cerr << perf_test::list_common_options(); + std::cerr << "\t[Required] INPUT MATRIX: '--amtx [left_hand_side.mtx]' -- for C=AxA" << std::endl; - std::cerr << "\t[Optional] BACKEND: '--threads [numThreads]' | '--openmp " - "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip " - "[hipDeviceIndex]' --> if none are specified, Serial is used " - "(if enabled)" - << std::endl; std::cerr << "\t[Optional] '--algorithm " "[DEFAULT=KKDEFAULT=KKSPGEMM|KKMEM|KKDENSE]' --> to choose algorithm. " @@ -47,158 +144,113 @@ void print_options() { "250k, which is max k value to choose dense accumulators. This " "can be increased with more memory bandwidth." << std::endl; - std::cerr - << "\tThe memory space used for each matrix: '--memspaces [0|1|....15]' " - "--> Bits representing the use of HBM for Work, C, B, and A " - "respectively. For example 12 = 1100, will store work arrays and C on " - "HBM. A and B will be stored DDR. To use this enable multilevel " - "memory in Kokkos, check generate_makefile.sh" - << std::endl; - std::cerr << "\tLoop scheduling: '--dynamic': Use this for dynamic " - "scheduling of the loops. (Better performance most of the time)" + std::cerr << "\t[Optional] '--dynamic': Use this for dynamic " + "loop scheduling. (Better performance most of the time)" + << std::endl; + std::cerr << "\t[Optional] '--verbose': detailed output about SpGEMM and the " + "output matrix" + << std::endl; + std::cerr << "\t[Optional] '--checkoutput': verify result against serial " + "reference implementation" << std::endl; - std::cerr << "\tVerbose Output: '--verbose'" << std::endl; -} - -static char* getNextArg(int& i, int argc, char** argv) { - i++; - if (i >= argc) { - std::cerr << "Error: expected additional command-line argument!\n"; - exit(1); - } - return argv[i]; } int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, char** argv) { + std::string algoStr; + bool printHelp; for (int i = 1; i < argc; ++i) { - if (0 == Test::string_compare_no_case(argv[i], "--threads")) { - params.use_threads = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { - params.use_openmp = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { - params.use_cuda = atoi(getNextArg(i, argc, argv)) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { - params.use_hip = atoi(getNextArg(i, argc, argv)) + 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { - params.repeat = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--hashscale")) { - params.minhashscale = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--chunksize")) { - params.chunk_size = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--teamsize")) { - params.team_size = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--vectorsize")) { - params.vector_size = atoi(getNextArg(i, argc, argv)); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--compression2step")) { - params.compression2step = true; - } else if (0 == Test::string_compare_no_case(argv[i], "--shmem")) { - params.shmemsize = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--memspaces")) { - int memspaces = atoi(getNextArg(i, argc, argv)); - int memspaceinfo = memspaces; - std::cout << "memspaceinfo:" << memspaceinfo << std::endl; - if (memspaceinfo & 1) { - params.a_mem_space = 1; - std::cout << "Using HBM for A" << std::endl; - } else { - params.a_mem_space = 0; - std::cout << "Using DDR4 for A" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.b_mem_space = 1; - std::cout << "Using HBM for B" << std::endl; - } else { - params.b_mem_space = 0; - std::cout << "Using DDR4 for B" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.c_mem_space = 1; - std::cout << "Using HBM for C" << std::endl; - } else { - params.c_mem_space = 0; - std::cout << "Using DDR4 for C" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - if (memspaceinfo & 1) { - params.work_mem_space = 1; - std::cout << "Using HBM for work memory space" << std::endl; - } else { - params.work_mem_space = 0; - std::cout << "Using DDR4 for work memory space" << std::endl; - } - memspaceinfo = memspaceinfo >> 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--CRWC")) { - params.calculate_read_write_cost = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--CIF")) { - params.coloring_input_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--COF")) { - params.coloring_output_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--CCO")) { + if (perf_test::check_arg_int(i, argc, argv, "--repeat", params.repeat)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--hashscale", + params.minhashscale)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--chunksize", + params.chunk_size)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--teamsize", + params.team_size)) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--vectorsize", + params.vector_size)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--compression2step", + params.compression2step)) { + } else if (perf_test::check_arg_int(i, argc, argv, "--shmem", + params.shmemsize)) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--CRWC", + params.calculate_read_write_cost)) { + } else if (perf_test::check_arg_str(i, argc, argv, "--CIF", + params.coloring_input_file)) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--COF", + params.coloring_output_file)) { + ++i; + } else if (perf_test::check_arg_double(i, argc, argv, "--CCO", + params.compression_cut_off)) { // if 0.85 set, if compression does not reduce flops by at least 15% // symbolic will run on original matrix. otherwise, it will compress the // graph and run symbolic on compressed one. - params.compression_cut_off = atof(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--FLHCO")) { + ++i; + } else if (perf_test::check_arg_double(i, argc, argv, "--FLHCO", + params.first_level_hash_cut_off)) { // if linear probing is used as hash, what is the max occupancy percantage // we allow in the hash. - params.first_level_hash_cut_off = atof(getNextArg(i, argc, argv)); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--flop")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--flop", + params.calculate_read_write_cost)) { // print flop statistics. only for the first repeat. - params.calculate_read_write_cost = 1; - } - - else if (0 == Test::string_compare_no_case(argv[i], "--mklsort")) { + // note: if either --CRWC or --flop is passed, this parameter is set to + // true + } else if (perf_test::check_arg_int(i, argc, argv, "--mklsort", + params.mkl_sort_option)) { // when mkl2 is run, the sort option to use. // 7:not to sort the output // 8:to sort the output - params.mkl_sort_option = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--mklkeepout")) { + ++i; + } else if (perf_test::check_arg_int(i, argc, argv, "--mklkeepout", + params.mkl_keep_output)) { // mkl output is not kept. - params.mkl_keep_output = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--checkoutput")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--checkoutput", + params.check_output)) { // check correctness - params.check_output = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { + } else if (perf_test::check_arg_str(i, argc, argv, "--amtx", + params.a_mtx_bin_file)) { // A at C=AxB - params.a_mtx_bin_file = getNextArg(i, argc, argv); - } - - else if (0 == Test::string_compare_no_case(argv[i], "--bmtx")) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--bmtx", + params.b_mtx_bin_file)) { // B at C=AxB. // if not provided, C = AxA will be performed. - params.b_mtx_bin_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--cmtx")) { + ++i; + } else if (perf_test::check_arg_str(i, argc, argv, "--cmtx", + params.c_mtx_bin_file)) { // if provided, C will be written to given file. // has to have ".bin", or ".crs" extension. - params.c_mtx_bin_file = getNextArg(i, argc, argv); - } else if (0 == Test::string_compare_no_case(argv[i], "--dynamic")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--dynamic", + params.use_dynamic_scheduling)) { // dynamic scheduling will be used for loops. // currently it is default already. // so has to use the dynamic schedulin. - params.use_dynamic_scheduling = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--DENSEACCMAX")) { + } else if (perf_test::check_arg_int(i, argc, argv, "--DENSEACCMAX", + params.MaxColDenseAcc)) { // on CPUs and KNLs if DEFAULT algorithm or KKSPGEMM is chosen, // it uses dense accumulators for smaller matrices based on the size of // column (k) in B. Max column size is 250,000 for k to use dense // accumulators. this parameter overwrites this. with cache mode, or CPUs // with smaller thread count, where memory bandwidth is not an issue, this // cut-off can be increased to be more than 250,000 - params.MaxColDenseAcc = atoi(getNextArg(i, argc, argv)); - } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "--verbose", + params.verbose)) { // print the timing and information about the inner steps. // if you are timing TPL libraries, for correct timing use verbose option, // because there are pre- post processing in these TPL kernel wraps. - params.verbose = 1; - } else if (0 == Test::string_compare_no_case(argv[i], "--algorithm")) { - char* algoStr = getNextArg(i, argc, argv); - + } else if (perf_test::check_arg_str(i, argc, argv, "--algorithm", + algoStr)) { if (0 == Test::string_compare_no_case(algoStr, "DEFAULT")) { params.algorithm = KokkosSparse::SPGEMM_KK; } else if (0 == Test::string_compare_no_case(algoStr, "KKDEFAULT")) { @@ -218,11 +270,14 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, } else { - std::cerr << "Unrecognized command line argument #" << i << ": " - << argv[i] << std::endl; + std::cerr << "Unrecognized value for --algorithm (argument #" << i + << "): " << argv[i] << std::endl; print_options(); return 1; } + ++i; + } else if (perf_test::check_arg_bool(i, argc, argv, "-h", printHelp)) { + } else if (perf_test::check_arg_bool(i, argc, argv, "--help", printHelp)) { } else { std::cerr << "Unrecognized command line argument #" << i << ": " << argv[i] << std::endl; @@ -230,96 +285,239 @@ int parse_inputs(KokkosKernels::Experiment::Parameters& params, int argc, return 1; } } + if (printHelp) { + print_options(); + return 1; + } return 0; } -int main(int argc, char** argv) { +template +void run_spgemm(int argc, char** argv, perf_test::CommonInputParams) { + using namespace KokkosSparse; + using namespace KokkosSparse::Experimental; + + using MemSpace = typename ExecSpace::memory_space; using size_type = default_size_type; using lno_t = default_lno_t; using scalar_t = default_scalar; + using device_t = Kokkos::Device; + using crsMat_t = typename KokkosSparse::CrsMatrix; + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle< + size_type, lno_t, scalar_t, ExecSpace, MemSpace, MemSpace>; KokkosKernels::Experiment::Parameters params; if (parse_inputs(params, argc, argv)) { - return 1; + return; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a and b matrix files" << std::endl; print_options(); - return 0; + return; + } + + crsMat_t A, B, C; + + // read a and b matrices + + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.a_mtx_bin_file.c_str()); + + if ((params.b_mtx_bin_file == "" || + params.a_mtx_bin_file == params.b_mtx_bin_file)) { + std::cout << "B is not provided or is the same as A. Multiplying AxA." + << std::endl; + B = A; + } else { + B = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.b_mtx_bin_file.c_str()); + } + + int algorithm = params.algorithm; + int repeat = params.repeat; + int chunk_size = params.chunk_size; + + int shmemsize = params.shmemsize; + int team_size = params.team_size; + int use_dynamic_scheduling = params.use_dynamic_scheduling; + int verbose = params.verbose; + int calculate_read_write_cost = params.calculate_read_write_cost; + // char spgemm_step = params.spgemm_step; + int vector_size = params.vector_size; + int check_output = params.check_output; + int mkl_keep_output = params.mkl_keep_output; + // spgemm_step++; + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef typename crsMat_t::row_map_type::non_const_type lno_view_t; + typedef typename crsMat_t::index_type::non_const_type lno_nnz_view_t; + + lno_view_t row_mapC; + lno_nnz_view_t entriesC; + scalar_view_t valuesC; + + KernelHandle kh; + kh.set_team_work_size(chunk_size); + kh.set_shmem_size(shmemsize); + kh.set_suggested_team_size(team_size); + kh.set_suggested_vector_size(vector_size); + + if (use_dynamic_scheduling) { + kh.set_dynamic_scheduling(true); } - if (params.b_mtx_bin_file == NULL) { - std::cout << "B is not provided. Multiplying AxA." << std::endl; + if (verbose) { + kh.set_verbose(true); } - const int num_threads = std::max(params.use_openmp, params.use_threads); - const int device_id = - params.use_cuda ? params.use_cuda - 1 : params.use_hip - 1; - - Kokkos::initialize(Kokkos::InitializationSettings() - .set_num_threads(num_threads) - .set_device_id(device_id)); - Kokkos::print_configuration(std::cout); - -#if defined(KOKKOS_ENABLE_OPENMP) - - if (params.use_openmp) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_HBWSPACE - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::OpenMP, - Kokkos::Experimental::HBWSpace, Kokkos::HostSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::OpenMP, - Kokkos::OpenMP::memory_space, Kokkos::OpenMP::memory_space>(params); -#endif + const lno_t m = A.numRows(); + const lno_t n = B.numRows(); + const lno_t k = B.numCols(); + + if (verbose) std::cout << "m:" << m << " n:" << n << " k:" << k << std::endl; + if (n < A.numCols()) { + std::cerr << "left.numCols():" << A.numCols() + << " right.numRows():" << B.numRows() << std::endl; + exit(1); } -#endif - -#if defined(KOKKOS_ENABLE_CUDA) - if (params.use_cuda) { -#ifdef KOKKOSKERNELS_INST_MEMSPACE_CUDAHOSTPINNEDSPACE - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::CudaHostPinnedSpace>(params); -#else - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Cuda, Kokkos::Cuda::memory_space, - Kokkos::Cuda::memory_space>(params); - -#endif + + // The reference product (for verifying correctness) + // Don't allocate them if they won't be used, but they must be declared here. + lno_view_t row_mapC_ref; + lno_nnz_view_t entriesC_ref; + scalar_view_t valuesC_ref; + // Reference output has same type as actual output + crsMat_t C_ref; + + if (check_output) { + if (verbose) std::cout << "Running a reference algorithm" << std::endl; + row_mapC_ref = lno_view_t("non_const_lnow_row", m + 1); + KernelHandle sequential_kh; + sequential_kh.set_team_work_size(chunk_size); + sequential_kh.set_shmem_size(shmemsize); + sequential_kh.set_suggested_team_size(team_size); + sequential_kh.create_spgemm_handle(KokkosSparse::SPGEMM_SERIAL); + + if (use_dynamic_scheduling) { + sequential_kh.set_dynamic_scheduling(true); + } + + spgemm_symbolic(&sequential_kh, m, n, k, A.graph.row_map, A.graph.entries, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, + TRANSPOSESECOND, row_mapC_ref); + + ExecSpace().fence(); + + size_type c_nnz_size = sequential_kh.get_spgemm_handle()->get_c_nnz(); + entriesC_ref = lno_nnz_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); + valuesC_ref = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), c_nnz_size); + + spgemm_numeric(&sequential_kh, m, n, k, A.graph.row_map, A.graph.entries, + A.values, TRANSPOSEFIRST, + + B.graph.row_map, B.graph.entries, B.values, TRANSPOSESECOND, + row_mapC_ref, entriesC_ref, valuesC_ref); + ExecSpace().fence(); + + C_ref = crsMat_t("CorrectC", m, k, valuesC_ref.extent(0), valuesC_ref, + row_mapC_ref, entriesC_ref); } -#endif -#if defined(KOKKOS_ENABLE_HIP) - if (params.use_hip) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Experimental::HIP, - Kokkos::Experimental::HIPSpace, Kokkos::Experimental::HIPSpace>(params); + for (int i = 0; i < repeat; ++i) { + kh.create_spgemm_handle(KokkosSparse::SPGEMMAlgorithm(algorithm)); + + kh.get_spgemm_handle()->mkl_keep_output = mkl_keep_output; + kh.get_spgemm_handle()->set_mkl_sort_option(params.mkl_sort_option); + + // if mkl2 input needs to be converted to 1base. + kh.get_spgemm_handle()->mkl_convert_to_1base = true; + + // 250000 default. if cache-mode is used on KNL can increase to 1M. + kh.get_spgemm_handle()->MaxColDenseAcc = params.MaxColDenseAcc; + + if (i == 0) { + kh.get_spgemm_handle()->set_read_write_cost_calc( + calculate_read_write_cost); + } + // do the compression whether in 2 step, or 1 step. + kh.get_spgemm_handle()->set_compression_steps(!params.compression2step); + // whether to scale the hash more. default is 1, so no scale. + kh.get_spgemm_handle()->set_min_hash_size_scale(params.minhashscale); + // max occupancy in 1-level LP hashes. LL hashes can be 100% + kh.get_spgemm_handle()->set_first_level_hash_cut_off( + params.first_level_hash_cut_off); + // min reduction on FLOPs to run compression + kh.get_spgemm_handle()->set_compression_cut_off(params.compression_cut_off); + + row_mapC = lno_view_t("non_const_lnow_row", m + 1); + entriesC = lno_nnz_view_t("entriesC (empty)", 0); + valuesC = scalar_view_t("valuesC (empty)", 0); + + Kokkos::Timer timer1; + spgemm_symbolic(&kh, m, n, k, A.graph.row_map, A.graph.entries, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, + TRANSPOSESECOND, row_mapC); + + ExecSpace().fence(); + double symbolic_time = timer1.seconds(); + + Kokkos::Timer timer3; + size_type c_nnz_size = kh.get_spgemm_handle()->get_c_nnz(); + if (verbose) std::cout << "C SIZE:" << c_nnz_size << std::endl; + if (c_nnz_size) { + entriesC = lno_nnz_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "entriesC"), + c_nnz_size); + valuesC = scalar_view_t( + Kokkos::view_alloc(Kokkos::WithoutInitializing, "valuesC"), + c_nnz_size); + } + spgemm_numeric(&kh, m, n, k, A.graph.row_map, A.graph.entries, A.values, + TRANSPOSEFIRST, B.graph.row_map, B.graph.entries, B.values, + TRANSPOSESECOND, row_mapC, entriesC, valuesC); + + ExecSpace().fence(); + double numeric_time = timer3.seconds(); + + std::cout << "mm_time:" << symbolic_time + numeric_time + << " symbolic_time:" << symbolic_time + << " numeric_time:" << numeric_time << std::endl; } -#endif - -#if defined(KOKKOS_ENABLE_THREADS) - // If only serial is enabled (or no other device was specified), run with - // serial - if (params.use_threads) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Threads, Kokkos::HostSpace, - Kokkos::HostSpace>(params); + if (verbose) { + std::cout << "row_mapC:" << row_mapC.extent(0) << std::endl; + std::cout << "entriesC:" << entriesC.extent(0) << std::endl; + std::cout << "valuesC:" << valuesC.extent(0) << std::endl; + KokkosKernels::Impl::print_1Dview(valuesC); + KokkosKernels::Impl::print_1Dview(entriesC); + KokkosKernels::Impl::print_1Dview(row_mapC); } -#endif - -#if defined(KOKKOS_ENABLE_SERIAL) - // If only serial is enabled (or no other device was specified), run with - // serial - if (!params.use_openmp && !params.use_cuda && !params.use_threads) { - KokkosKernels::Experiment::run_multi_mem_spgemm< - size_type, lno_t, scalar_t, Kokkos::Serial, Kokkos::HostSpace, - Kokkos::HostSpace>(params); + crsMat_t C_result("CrsMatrixC", m, k, valuesC.extent(0), valuesC, row_mapC, + entriesC); + if (check_output) { + bool is_identical = is_same_matrix(C_result, C_ref); + if (!is_identical) { + std::cerr << "SpGEMM result differs with reference implementation.\n"; + exit(1); + } else { + std::cerr << "SpGEMM result matches reference implementation.\n"; + } } -#endif - Kokkos::finalize(); + if (params.c_mtx_bin_file != "") { + KokkosSparse::sort_crs_matrix(C_result); - return 0; + KokkosSparse::Impl::write_graph_bin( + (lno_t)(C_result.numRows()), (size_type)(C_result.nnz()), + C_result.graph.row_map.data(), C_result.graph.entries.data(), + C_result.values.data(), params.c_mtx_bin_file.c_str()); + } } + +#define KOKKOSKERNELS_PERF_TEST_NAME run_spgemm +#include "KokkosKernels_perf_test_instantiation.hpp" +int main(int argc, char** argv) { + return main_instantiation(argc, argv); +} // main diff --git a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp index bcb71e951a..ff30fdf565 100644 --- a/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp +++ b/perf_test/sparse/KokkosSparse_spgemm_jacobi.cpp @@ -219,12 +219,12 @@ int main(int argc, char** argv) { if (parse_inputs(params, argc, argv)) { return 1; } - if (params.a_mtx_bin_file == NULL) { + if (params.a_mtx_bin_file == "") { std::cerr << "Provide a and b matrix files" << std::endl; print_options(); return 0; } - if (params.b_mtx_bin_file == NULL) { + if (params.b_mtx_bin_file == "") { std::cout << "B is not provided. Multiplying AxA." << std::endl; } diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index 713c201a8f..e3312c0a41 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -29,26 +29,28 @@ struct Parameters { int multi_color_scale; int shmemsize; int team_size; - int use_dynamic_scheduling; - int verbose; + bool use_dynamic_scheduling; + bool verbose; int spgemm_step; int vector_size; - int check_output; + bool check_output; int mkl_sort_option; int mkl_keep_output; - int calculate_read_write_cost; - char *coloring_input_file; - char *coloring_output_file; + bool calculate_read_write_cost; + std::string coloring_input_file; + std::string coloring_output_file; int minhashscale; int use_threads; int use_openmp; int use_cuda; int use_hip; + int use_sycl; + int use_openmptarget; int use_serial; int a_mem_space, b_mem_space, c_mem_space, work_mem_space; - char *a_mtx_bin_file, *b_mtx_bin_file, *c_mtx_bin_file; + std::string a_mtx_bin_file, b_mtx_bin_file, c_mtx_bin_file; bool compression2step; int left_lower_triangle, right_lower_triangle; int left_sort, right_sort; @@ -62,7 +64,7 @@ struct Parameters { int cache_flush; double first_level_hash_cut_off; double compression_cut_off; - size_t MaxColDenseAcc; + int MaxColDenseAcc; // 0 - no flush // 1 - soft flush // 2 - hard flush with rand. @@ -74,24 +76,26 @@ struct Parameters { multi_color_scale = 1; shmemsize = 16128; team_size = -1; - use_dynamic_scheduling = 0; - verbose = 0; + use_dynamic_scheduling = false; + verbose = false; spgemm_step = '0'; vector_size = -1; - check_output = 0; + check_output = false; mkl_sort_option = 7; mkl_keep_output = 1; - calculate_read_write_cost = 0; - coloring_input_file = NULL; - coloring_output_file = NULL; + calculate_read_write_cost = false; + coloring_input_file = ""; + coloring_output_file = ""; minhashscale = 1; use_threads = 0; use_openmp = 0; use_cuda = 0; use_hip = 0; + use_sycl = 0; + use_openmptarget = 0; use_serial = 0; a_mem_space = b_mem_space = c_mem_space = work_mem_space = 1; - a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = NULL; + a_mtx_bin_file = b_mtx_bin_file = c_mtx_bin_file = ""; compression2step = true; left_lower_triangle = 0; diff --git a/test_common/KokkosKernels_TestUtils.hpp b/test_common/KokkosKernels_TestUtils.hpp index fe68d68d07..130187ef35 100644 --- a/test_common/KokkosKernels_TestUtils.hpp +++ b/test_common/KokkosKernels_TestUtils.hpp @@ -520,6 +520,10 @@ int string_compare_no_case(const char* str1, const char* str2) { return strcmp(str1_s.c_str(), str2_s.c_str()); } +int string_compare_no_case(const std::string& str1, const std::string& str2) { + return string_compare_no_case(str1.c_str(), str2.c_str()); +} + /// /brief Cs (Compressed Sparse) matrix class for testing purposes. /// This class is for testing purposes only and will generate a random /// Crs / Ccs matrix when instantiated. The class is intentionally written