From d715f4ee0b6d088f19b0fb3975c0ca81c7eb77fb Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Mon, 8 Feb 2021 09:39:46 -0700 Subject: [PATCH 1/9] spadd tpl driver for performance testing (cusparse/mkl) --- perf_test/sparse/KokkosSparse_run_spadd.hpp | 160 -------- perf_test/sparse/KokkosSparse_spadd.cpp | 404 +++++++++++++++++-- src/common/KokkosKernels_IOUtils.hpp | 1 - test_common/KokkosKernels_TestParameters.hpp | 5 - 4 files changed, 370 insertions(+), 200 deletions(-) delete mode 100644 perf_test/sparse/KokkosSparse_run_spadd.hpp diff --git a/perf_test/sparse/KokkosSparse_run_spadd.hpp b/perf_test/sparse/KokkosSparse_run_spadd.hpp deleted file mode 100644 index e418639332..0000000000 --- a/perf_test/sparse/KokkosSparse_run_spadd.hpp +++ /dev/null @@ -1,160 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Siva Rajamanickam (srajama@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include "KokkosKernels_Handle.hpp" -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosSparse_spadd.hpp" -#include "KokkosKernels_TestParameters.hpp" - -namespace KokkosKernels { -namespace Experiment { - -template -void run_experiment(Parameters params) -{ - using namespace KokkosSparse; - using namespace KokkosSparse::Experimental; - - using size_type = typename crsMat_t::size_type; - using lno_t = typename crsMat_t::ordinal_type; - using scalar_t = typename crsMat_t::value_type; - using device_t = typename crsMat_t::device_type; - using exec_space = typename device_t::execution_space; - using mem_space = typename device_t::memory_space; - - using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle - ; - - std::cout << "************************************* \n"; - std::cout << "************************************* \n"; - std::cout << "Loading A from " << params.a_mtx_bin_file << '\n'; - crsMat_t A = Impl::read_kokkos_crst_matrix(params.a_mtx_bin_file); - std::cout << "Loading B from " << params.b_mtx_bin_file << '\n'; - crsMat_t B = Impl::read_kokkos_crst_matrix(params.b_mtx_bin_file); - //Make sure dimensions are compatible - if(A.numRows() != B.numRows()) - { - std::cout << "ERROR: A and B have different numbers of rows\n"; - exit(1); - } - if(A.numCols() != B.numCols()) - { - std::cout << "ERROR: A and B have different numbers of columns\n"; - exit(1); - } - lno_t m = A.numRows(); - lno_t n = A.numCols(); - std::cout << "Read in A and B: " << m << "x" << n << '\n'; - - typedef typename crsMat_t::values_type::non_const_type scalar_view_t; - typedef typename crsMat_t::StaticCrsGraphType::row_map_type::non_const_type lno_view_t; - typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; - typedef typename crsMat_t::StaticCrsGraphType::row_map_type const_lno_view_t; - typedef typename crsMat_t::StaticCrsGraphType::entries_type const_lno_nnz_view_t; - - lno_view_t row_mapC; - lno_nnz_view_t entriesC; - scalar_view_t valuesC; - - KernelHandle kh; - - if(params.assume_sorted) - std::cout << "Assuming input matrices are sorted.\n"; - else - std::cout << "Assuming input matrices are not sorted.\n"; - kh.create_spadd_handle(params.assume_sorted); - auto addHandle = kh.get_spadd_handle(); - - row_mapC = lno_view_t("non_const_lnow_row", m + 1); - - Kokkos::Impl::Timer timer1; - - spadd_symbolic - (&kh, - A.graph.row_map, A.graph.entries, - B.graph.row_map, B.graph.entries, - row_mapC); - - exec_space().fence(); - double symbolic_time = timer1.seconds(); - - size_type c_nnz = addHandle->get_c_nnz(); - std::cout << "Result matrix will have " << c_nnz << " entries.\n"; - - entriesC = lno_nnz_view_t("entriesC (empty)", c_nnz); - valuesC = scalar_view_t("valuesC (empty)", c_nnz); - - Kokkos::Impl::Timer timer3; - - spadd_numeric(&kh, - A.graph.row_map, A.graph.entries, A.values, 1.0, //A, alpha - B.graph.row_map, B.graph.entries, B.values, 1.0, //B, beta - row_mapC, entriesC, valuesC); //C - - exec_space().fence(); - double numeric_time = timer3.seconds(); - - std::cout - << "total_time:" << symbolic_time + numeric_time - << " symbolic_time:" << symbolic_time - << " numeric_time:" << numeric_time << std::endl; - - if (params.verbose) - { - std::cout << "row_mapC:" << row_mapC.extent(0) << std::endl; - std::cout << "entriesC:" << entriesC.extent(0) << std::endl; - std::cout << "valuesC:" << valuesC.extent(0) << std::endl; - KokkosKernels::Impl::print_1Dview(valuesC); - KokkosKernels::Impl::print_1Dview(entriesC); - KokkosKernels::Impl::print_1Dview(row_mapC); - } - if(params.c_mtx_bin_file) - { - std::cout << "Writing C (" << m << "x" << n << ") to " << params.c_mtx_bin_file << "\n"; - crsMat_t C("C", m, n, c_nnz, valuesC, row_mapC, entriesC); - Impl::write_kokkos_crst_matrix(C, params.c_mtx_bin_file); - } -} - -}} // namespace KokkosKernels::Experiment diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 959e9d973c..ab003f5b9b 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -44,33 +44,346 @@ #include #include "KokkosKernels_config.h" +#include "KokkosKernels_Handle.hpp" +#include "KokkosKernels_IOUtils.hpp" +#include "KokkosSparse_spadd.hpp" + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE +#include +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL +#include +#include +#endif + #if defined(KOKKOSKERNELS_INST_DOUBLE) && \ defined(KOKKOSKERNELS_INST_OFFSET_INT) && \ defined(KOKKOSKERNELS_INST_ORDINAL_INT) -#include "KokkosKernels_IOUtils.hpp" -#include "KokkosKernels_TestParameters.hpp" -#include "KokkosSparse_run_spadd.hpp" -#include "KokkosSparse_CrsMatrix.hpp" -#define SIZE_TYPE int -#define INDEX_TYPE int -#define SCALAR_TYPE double -//double +struct Params +{ + int use_cuda = 0; + int use_openmp = 0; + int use_threads = 0; + int use_mkl = 0; + int use_cusparse = 0; + bool sorted = true; + std::string amtx; + std::string bmtx; + std::string cmtx; + int m = 10000; + int n = 10000; + int nnzPerRow = 30; + bool bDiag = false; //Whether B should be diagonal only (requires A square) + bool verbose = false; + int repeat = 1; + int numericRepeat = 1; //how many times to call numeric per overall run +}; + +template +void run_experiment(const Params& params) +{ + using namespace KokkosSparse; + using namespace KokkosSparse::Experimental; + + using size_type = typename crsMat_t::size_type; + using lno_t = typename crsMat_t::ordinal_type; + using scalar_t = typename crsMat_t::value_type; + using device_t = typename crsMat_t::device_type; + using exec_space = typename device_t::execution_space; + using mem_space = typename device_t::memory_space; + + using KernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle + ; + + using graph_t = typename crsMat_t::StaticCrsGraphType; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + + std::cout << "************************************* \n"; + std::cout << "************************************* \n"; + crsMat_t A; + crsMat_t B; + lno_t m = params.m; + lno_t n = params.n; + if(params.amtx.length()) + { + std::cout << "Loading A from " << params.amtx << '\n'; + A = KokkosKernels::Impl::read_kokkos_crst_matrix(params.amtx.c_str()); + m = A.numRows(); + n = A.numCols(); + } + else + { + std::cout << "Randomly generating A\n"; + size_type nnzUnused = m * params.nnzPerRow; + A = KokkosKernels::Impl::kk_generate_sparse_matrix(m, n, nnzUnused, 0, (n + 3) / 3); + } + if(params.bmtx.length()) + { + std::cout << "Loading B from " << params.bmtx << '\n'; + B = KokkosKernels::Impl::read_kokkos_crst_matrix(params.bmtx.c_str()); + if(B.numRows() != m || B.numCols() != n) + throw std::runtime_error("A, B dimensions don't match"); + } + else if(params.bDiag) + { + std::cout << "Generating B as diagonal matrix.\n"; + int diagLength = std::min(m, n); + rowmap_t rowmap(Kokkos::ViewAllocateWithoutInitializing("rowmap_view"), m + 1); + entries_t entries(Kokkos::ViewAllocateWithoutInitializing("colsmap_view"), diagLength); + values_t values(Kokkos::ViewAllocateWithoutInitializing("values_view"), diagLength); + auto rowmapHost = Kokkos::create_mirror_view(rowmap); + auto entriesHost = Kokkos::create_mirror_view(entries); + auto valuesHost = Kokkos::create_mirror_view(values); + for(int i = 0; i < diagLength; i++) + { + rowmapHost(i) = i; + entriesHost(i) = i; + valuesHost(i) = 1.0; + } + for(int i = diagLength; i <= m; i++) + { + rowmapHost(i) = diagLength; + } + Kokkos::deep_copy(rowmap, rowmapHost); + Kokkos::deep_copy(entries, entriesHost); + Kokkos::deep_copy(values, valuesHost); + B = crsMat_t("B", m, n, diagLength, values, rowmap, entries); + } + else + { + std::cout << "Randomly generating B\n"; + size_type nnzUnused = m * params.nnzPerRow; + B = KokkosKernels::Impl::kk_generate_sparse_matrix(m, n, nnzUnused, 0, (n + 3) / 3); + } + //Make sure dimensions are compatible + if(A.numRows() != B.numRows()) + { + std::cout << "ERROR: A and B have different numbers of rows\n"; + exit(1); + } + if(A.numCols() != B.numCols()) + { + std::cout << "ERROR: A and B have different numbers of columns\n"; + exit(1); + } + std::cout << "Have A and B: " << m << "x" << n << ", " << A.nnz() << " and " << B.nnz() << " entries.\n"; + + typedef typename crsMat_t::values_type::non_const_type scalar_view_t; + typedef typename crsMat_t::StaticCrsGraphType::row_map_type::non_const_type lno_view_t; + typedef typename crsMat_t::StaticCrsGraphType::entries_type::non_const_type lno_nnz_view_t; + typedef typename crsMat_t::StaticCrsGraphType::row_map_type const_lno_view_t; + typedef typename crsMat_t::StaticCrsGraphType::entries_type const_lno_nnz_view_t; + + lno_view_t row_mapC; + //entriesC, valuesC and cusparseBuffer are allocated inside + //the loop, as part of symbolic + lno_nnz_view_t entriesC; + scalar_view_t valuesC; + + KernelHandle kh; + + if(params.sorted) + { + std::cout << "Assuming input matrices are sorted (explicitly sorting just in case)\n"; + KokkosKernels::Impl::sort_crs_matrix(A); + KokkosKernels::Impl::sort_crs_matrix(B); + } + else + std::cout << "Assuming input matrices are not sorted.\n"; + kh.create_spadd_handle(params.sorted); + auto addHandle = kh.get_spadd_handle(); + + row_mapC = lno_view_t("non_const_lnow_row", m + 1); + + Kokkos::Timer timer; + double symbolicTime = 0; + double numericTime = 0; + + //Do an untimed warm up symbolic, and preallocate space for C entries/values + spadd_symbolic + (&kh, + A.graph.row_map, A.graph.entries, + B.graph.row_map, B.graph.entries, + row_mapC); + + bool use_kk = !params.use_cusparse && !params.use_mkl; + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + cusparseHandle_t cusparseHandle; + cusparseMatDescr_t A_cusparse; + cusparseMatDescr_t B_cusparse; + cusparseMatDescr_t C_cusparse; + if(params.use_cusparse) + { + cusparseCreate(&cusparseHandle); + cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST); + cusparseCreateMatDescr(&A_cusparse); + cusparseCreateMatDescr(&B_cusparse); + cusparseCreateMatDescr(&C_cusparse); + cusparseSetMatType(A_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(B_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatType(C_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatDiagType(A_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT); + cusparseSetMatDiagType(B_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT); + cusparseSetMatDiagType(C_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT); + cusparseSetMatIndexBase(A_cusparse, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatIndexBase(B_cusparse, CUSPARSE_INDEX_BASE_ZERO); + cusparseSetMatIndexBase(C_cusparse, CUSPARSE_INDEX_BASE_ZERO); + } +#endif +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + sparse_matrix_t Amkl, Bmkl, Cmkl; + if(params.use_mkl) + { + if(SPARSE_STATUS_SUCCESS != mkl_sparse_d_create_csr(&Amkl, SPARSE_INDEX_BASE_ZERO, m, n, + (int*) A.graph.row_map.data(), (int*) A.graph.row_map.data() + 1, A.graph.entries.data(), A.values.data())) + throw std::runtime_error("Failed to create A MKL handle"); + if(SPARSE_STATUS_SUCCESS != mkl_sparse_d_create_csr(&Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, + (int*) B.graph.row_map.data(), (int*) B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())) + throw std::runtime_error("Failed to create B MKL handle"); + } +#endif + + char* cusparseBuffer; + int c_nnz; + const double alphabeta = 1.0; + + for(int sumRep = 0; sumRep < params.repeat; sumRep++) + { + timer.reset(); + if(use_kk) + { + spadd_symbolic + (&kh, + A.graph.row_map, A.graph.entries, + B.graph.row_map, B.graph.entries, + row_mapC); + c_nnz = addHandle->get_c_nnz(); + } + else if(params.use_cusparse) + { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + //Symbolic phase: compute buffer size, then compute nnz + size_t bufferSize; + cusparseDcsrgeam2_bufferSizeExt(cusparseHandle, + A.numRows(), A.numCols(), + &alphabeta, A_cusparse, A.nnz(), + A.values.data(), A.graph.row_map.data(), A.graph.entries.data(), + &alphabeta, B_cusparse, B.nnz(), + B.values.data(), B.graph.row_map.data(), B.graph.entries.data(), + C_cusparse, NULL, row_mapC.data(), NULL, &bufferSize); + //Allocate work buffer + cudaMalloc((void**) &cusparseBuffer, bufferSize); + cusparseXcsrgeam2Nnz(cusparseHandle, m, n, + A_cusparse, A.nnz(), A.graph.row_map.data(), A.graph.entries.data(), + B_cusparse, B.nnz(), B.graph.row_map.data(), B.graph.entries.data(), + C_cusparse, row_mapC.data(), &c_nnz, + cusparseBuffer); +#endif + } + if(!params.use_mkl) + { + entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC (empty)"), c_nnz); + valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC (empty)"), c_nnz); + } + + //note: symbolic has a fence at the end + symbolicTime += timer.seconds(); + timer.reset(); + //Just time all numeric repetitions together + for(int numericRep = 0; numericRep < params.numericRepeat; numericRep++) + { + if(params.use_cusparse) + { +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + cusparseDcsrgeam2(cusparseHandle, m, n, + &alphabeta, A_cusparse, A.nnz(), + A.values.data(), A.graph.row_map.data(), A.graph.entries.data(), + &alphabeta, B_cusparse, B.nnz(), + B.values.data(), B.graph.row_map.data(), B.graph.entries.data(), + C_cusparse, valuesC.data(), row_mapC.data(), entriesC.data(), + cusparseBuffer); +#endif + } + else if(params.use_mkl) + { +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if(SPARSE_STATUS_SUCCESS != mkl_sparse_d_add(SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)) + { + throw std::runtime_error("MKL spadd failed"); + } + mkl_sparse_destroy(Cmkl); +#endif + } + else + { + spadd_numeric(&kh, + A.graph.row_map, A.graph.entries, A.values, 1.0, //A, alpha + B.graph.row_map, B.graph.entries, B.values, 1.0, //B, beta + row_mapC, entriesC, valuesC); //C + } + } + numericTime += timer.seconds(); +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if(params.use_cusparse) + cudaFree(cusparseBuffer); +#endif + } + +#ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE + if(params.use_cusparse) + cusparseDestroy(cusparseHandle); +#endif + +#ifdef KOKKOSKERNELS_ENABLE_TPL_MKL + if(params.use_mkl) + { + mkl_sparse_destroy(Amkl); + mkl_sparse_destroy(Bmkl); + } +#endif + + int symbolicCalls = params.repeat; + int numericCalls = params.repeat * params.numericRepeat; + + std::cout + << "Mean total time: " << (symbolicTime / symbolicCalls) + (numericTime / numericCalls) << '\n' + << "Mean symbolic time: " << (symbolicTime / symbolicCalls) << '\n' + << "Mean numeric time: " << (numericTime / numericCalls) << '\n'; + + if (params.verbose) + { + std::cout << "row_mapC:" << row_mapC.extent(0) << std::endl; + std::cout << "entriesC:" << entriesC.extent(0) << std::endl; + std::cout << "valuesC:" << valuesC.extent(0) << std::endl; + KokkosKernels::Impl::print_1Dview(valuesC); + KokkosKernels::Impl::print_1Dview(entriesC); + KokkosKernels::Impl::print_1Dview(row_mapC); + } + if(params.cmtx.length()) + { + std::cout << "Writing C (" << m << "x" << n << ") to " << params.cmtx << "\n"; + crsMat_t C("C", m, n, c_nnz, valuesC, row_mapC, entriesC); + KokkosKernels::Impl::write_kokkos_crst_matrix(C, params.cmtx.c_str()); + } +} void print_options(){ std::cerr << "Options\n" << std::endl; std::cerr << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp [numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" << std::endl; - std::cerr << "\t[Required] --amtx :: 1st input matrix" << std::endl; - std::cerr << "\t[Required] --bmtx :: 2nd input matrix" << std::endl; + std::cerr << "\t[Optional] --amtx :: 1st input matrix" << std::endl; + std::cerr << "\t[Optional] --bmtx :: 2nd input matrix" << std::endl; std::cerr << "\t[Optional] --cmtx :: output matrix for C = A+B" << std::endl; std::cerr << "\t[Optional] Verbose Output: '--verbose'" << std::endl; } - -int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char **argv){ - params.assume_sorted = false; +int parse_inputs (Params& params, int argc, char **argv){ for ( int i = 1 ; i < argc ; ++i ) { if ( 0 == strcasecmp( argv[i] , "--threads" ) ) { params.use_threads = atoi( argv[++i] ); @@ -81,26 +394,56 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char else if ( 0 == strcasecmp( argv[i] , "--cuda" ) ) { params.use_cuda = atoi( argv[++i] ) + 1; } + else if ( 0 == strcasecmp( argv[i] , "--mkl" ) ) { + params.use_mkl = 1; + } + else if ( 0 == strcasecmp( argv[i] , "--cusparse" ) ) { + params.use_cusparse = 1; + } else if ( 0 == strcasecmp( argv[i] , "--sorted" ) ) { - params.assume_sorted = true; + params.sorted = true; } else if ( 0 == strcasecmp( argv[i] , "--unsorted" ) ) { - params.assume_sorted = false; + params.sorted = false; } else if ( 0 == strcasecmp( argv[i] , "--amtx" ) ) { //A at C=AxB - params.a_mtx_bin_file = argv[++i]; + params.amtx = argv[++i]; } - else if ( 0 == strcasecmp( argv[i] , "--bmtx" ) ) { //B at C=AxB. //if not provided, C = AxA will be performed. - params.b_mtx_bin_file = argv[++i]; + params.bmtx = argv[++i]; } else if ( 0 == strcasecmp( argv[i] , "--cmtx" ) ) { //if provided, C will be written to given file. //has to have ".bin", or ".crs" extension. - params.c_mtx_bin_file = argv[++i]; + params.cmtx = argv[++i]; + } + else if( 0 == strcasecmp( argv[i], "--m" )) + { + params.m = atoi(argv[++i]); + } + else if( 0 == strcasecmp( argv[i], "--n" )) + { + params.n = atoi(argv[++i]); + } + else if( 0 == strcasecmp( argv[i], "--nnz" )) + { + params.nnzPerRow = atoi(argv[++i]); + } + else if( 0 == strcasecmp( argv[i], "--bdiag" )) + { + params.bDiag = true; + } + else if ( 0 == strcasecmp( argv[i] , "--repeat" ) ) { + //if provided, C will be written to given file. + //has to have ".bin", or ".crs" extension. + params.repeat = atoi( argv[++i] ); + } + else if ( 0 == strcasecmp( argv[i] , "--numeric-repeat" ) ) { + //Reuse the symbolic step this many times. + params.numericRepeat = atoi( argv[++i] ); } else if ( 0 == strcasecmp( argv[i] , "--verbose" ) ) { params.verbose = true; @@ -116,22 +459,16 @@ int parse_inputs (KokkosKernels::Experiment::Parameters ¶ms, int argc, char int main (int argc, char ** argv){ - KokkosKernels::Experiment::Parameters params; + Params params; if (parse_inputs (params, argc, argv) ){ return 1; } - if (!params.a_mtx_bin_file || !params.b_mtx_bin_file) { - std::cerr << "Provide a and b matrix files" << std::endl; - print_options(); - return 0; - } - const int num_threads = params.use_openmp; // Assumption is that use_openmp variable is provided as number of threads const int device_id = params.use_cuda - 1; Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); - Kokkos::print_configuration(std::cout); + //Kokkos::print_configuration(std::cout); bool useOMP = params.use_openmp != 0; bool useCUDA = params.use_cuda != 0; @@ -140,8 +477,8 @@ int main (int argc, char ** argv){ if(useOMP) { #if defined( KOKKOS_ENABLE_OPENMP ) - using crsMat_t = KokkosSparse::CrsMatrix; - KokkosKernels::Experiment::run_experiment(params); + using crsMat_t = KokkosSparse::CrsMatrix; + run_experiment(params); #else std::cout << "ERROR: OpenMP requested, but not available.\n"; return 1; @@ -150,8 +487,8 @@ int main (int argc, char ** argv){ if(useCUDA) { #if defined( KOKKOS_ENABLE_CUDA ) - using crsMat_t = KokkosSparse::CrsMatrix; - KokkosKernels::Experiment::run_experiment(params); + using crsMat_t = KokkosSparse::CrsMatrix; + run_experiment(params); #else std::cout << "ERROR: CUDA requested, but not available.\n"; return 1; @@ -160,8 +497,8 @@ int main (int argc, char ** argv){ if(useSerial) { #if defined( KOKKOS_ENABLE_SERIAL ) - using crsMat_t = KokkosSparse::CrsMatrix; - KokkosKernels::Experiment::run_experiment(params); + using crsMat_t = KokkosSparse::CrsMatrix; + run_experiment(params); #else std::cout << "ERROR: Serial device requested, but not available.\n"; return 1; @@ -171,7 +508,6 @@ int main (int argc, char ** argv){ return 0; } - #else int main() { #if !defined(KOKKOSKERNELS_INST_DOUBLE) diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp index b74834db5f..6f2b15c729 100644 --- a/src/common/KokkosKernels_IOUtils.hpp +++ b/src/common/KokkosKernels_IOUtils.hpp @@ -382,7 +382,6 @@ crsMat_t kk_generate_sparse_matrix( Kokkos::deep_copy (rowmap_view , hr); Kokkos::deep_copy (columns_view , hc); Kokkos::deep_copy (values_view , hv); - Kokkos::fence(); } graph_t static_graph (columns_view, rowmap_view); diff --git a/test_common/KokkosKernels_TestParameters.hpp b/test_common/KokkosKernels_TestParameters.hpp index c069c618e6..1723827eaf 100644 --- a/test_common/KokkosKernels_TestParameters.hpp +++ b/test_common/KokkosKernels_TestParameters.hpp @@ -96,9 +96,6 @@ struct Parameters{ // 0 - no flush // 1 - soft flush // 2 - hard flush with rand. - bool assume_sorted; - // For sparse matrix addition, whether to assume - // input matrix entries are sorted within rows. Parameters(){ algorithm = 0; @@ -140,8 +137,6 @@ struct Parameters{ first_level_hash_cut_off = 0.50; compression_cut_off = 0.85; MaxColDenseAcc = 250000; - - assume_sorted = false; } }; } From cb2004a9ed07927541a6c08077e81df66d22f2f5 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Sat, 13 Feb 2021 10:23:57 -0700 Subject: [PATCH 2/9] WIP: speeding up SpAdd --- src/sparse/KokkosSparse_spadd.hpp | 166 +++++++++++++++++++++-- src/sparse/KokkosSparse_spadd_handle.hpp | 2 +- 2 files changed, 157 insertions(+), 11 deletions(-) diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 1efae2c1a7..59ad6b9e28 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -72,7 +72,7 @@ and Bpos each contain the final index within C row where the A/B entry belongs // get C rowmap for sorted input template + typename CRowPtrsT, typename ExecSpace> struct SortedCountEntries { SortedCountEntries(ordinal_type nrows_, const typename ARowPtrsT::const_type& Arowptrs_, @@ -86,6 +86,9 @@ struct SortedCountEntries { Bcolinds(Bcolinds_), Crowcounts(Crowcounts_) {} + using TeamPol = Kokkos::TeamPolicy; + using TeamMem = typename TeamPol::member_type; + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const { const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); @@ -112,12 +115,119 @@ struct SortedCountEntries { } Crowcounts(i) = numEntries; } + + KOKKOS_INLINE_FUNCTION void operator()(const TeamMem t) const + { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + ordinal_type i = t.league_rank() * t.team_size() + t.team_rank(); + if(i >= nrows) + return; + ordinal_type* allScratch = (ordinal_type*) t.team_shmem().get_shmem(totalShared); + ordinal_type* scratch = allScratch + t.team_rank() * sharedPerThread; + ordinal_type Arowstart = Arowptrs(i); + ordinal_type Arowlen = Arowptrs(i + 1) - Arowstart; + ordinal_type Browstart = Browptrs(i); + ordinal_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type n = Arowlen + Browlen; + if(n > sharedPerThread) + { + //fall back to slow serial method + Kokkos::single(Kokkos::PerThread(t), + [=]() + { + this->operator()(i); + }); + return; + } + if(n == 0) + { + Kokkos::single(Kokkos::PerThread(t), + [=]() + { + Crowcounts(i) = 0; + }); + return; + } + //Figure out the number of bitonic steps: ceil(log2(n)) + ordinal_type npot = 1; + ordinal_type levels = 0; + while(npot < n) + { + levels++; + npot <<= 1; + } + //Copy A and B entries to scratch + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, Arowlen), + [&](ordinal_type j) + { + scratch[j] = Acolinds(Arowstart + j); + }); + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, Browlen), + [&](ordinal_type j) + { + scratch[npot - 1 - j] = Bcolinds(Browstart + j); + }); + //Fill space between A and B with ORDINAL_MAX, + //to maintain a valid bitonic sequence of power-of-two length + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, npot - n), + [&](ordinal_type j) + { + scratch[Arowlen + j] = Kokkos::ArithTraits::max(); + }); + // npot = 2^levels + for(ordinal_type level = 0; level < levels; level++) + { + // npot/2 pairs of items are compared in parallel + Kokkos::parallel_for(Kokkos::ThreadVectorRange(t, npot >> 1), + [&](const ordinal_type j) + { + ordinal_type boxSize = npot >> level; + //Which box contains this thread? + //box = (j / boxSize), and boxSize = 2^(levels-level), so + //box = j * 2^(level-levels) = j >> (levels - level) + ordinal_type boxID = (j * 2) >> (levels - level); + //boxStart = boxID * boxSize = boxID * 2^(levels-level) + //= boxID << (levels-level) + ordinal_type boxStart = boxID << (levels - level); + ordinal_type boxOffset = j - boxID * boxSize / 2; + ordinal_type elem1 = boxStart + boxOffset; + ordinal_type elem2 = elem1 + (boxSize >> 1); + if(scratch[elem2] < scratch[elem1]) + { + ordinal_type temp = scratch[elem1]; + scratch[elem1] = scratch[elem2]; + scratch[elem2] = temp; + } + }); + } + //Finally, count the number of distinct entries (this is #rising edges + 1) + ordinal_type risingEdges; + Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(t, n - 1), + [&](const ordinal_type j, ordinal_type& lcount) + { + if(scratch[j] != scratch[j + 1]) + lcount++; + }, risingEdges); + Kokkos::single(Kokkos::PerThread(t), + [=]() + { + Crowcounts(i) = risingEdges + 1; + }); + } + + size_t team_shmem_size(int teamSize) const + { + return sharedPerThread * sizeof(ordinal_type) * teamSize; + } + ordinal_type nrows; const typename ARowPtrsT::const_type Arowptrs; const AColIndsT Acolinds; const typename BRowPtrsT::const_type Browptrs; const BColIndsT Bcolinds; CRowPtrsT Crowcounts; + int sharedPerThread; //Shared for each thread, measured in sizeof(ordinal_type) + int totalShared; //Shared for whole team, measured in bytes }; // get upper bound for C entries per row (assumes worst case, that entries in A @@ -333,13 +443,50 @@ void spadd_symbolic( using NoInitialize = Kokkos::ViewAllocateWithoutInitializing; if (addHandle->is_input_sorted()) { // call entry count functor to get entry counts per row - SortedCountEntries - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - range_type(0, nrows), countEntries); + size_type c_est_nnz = 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; + if(KokkosKernels::Impl::kk_is_gpu_exec_space() && c_est_nnz <= 512) + { + //Convert c_est_nnz to a power of 2 + size_type pot_est_nnz = 1; + while(pot_est_nnz < c_est_nnz) + pot_est_nnz *= 2; + using TeamPol = Kokkos::TeamPolicy; + //Estimate max number of uncompressed entries in each row of C + int vector_length = 1; + int vector_length_max = TeamPol::vector_length_max(); + while(vector_length * 2 <= vector_length_max && + (size_type) vector_length * 2 <= pot_est_nnz) + { + vector_length *= 2; + } + SortedCountEntries + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + countEntries.sharedPerThread = pot_est_nnz; + //compute largest possible team size + TeamPol testPolicy(1, 1, vector_length); + testPolicy.set_scratch_size(0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); + int team_size = testPolicy.team_size_recommended(countEntries, Kokkos::ParallelForTag()); + //construct real policy + int league_size = (nrows + team_size - 1) / team_size; + TeamPol policy(league_size, team_size, vector_length); + policy.set_scratch_size(0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); + countEntries.totalShared = countEntries.sharedPerThread * team_size * sizeof(ordinal_type); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + policy, countEntries); + } + else + { + SortedCountEntries + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + range_type(0, nrows), countEntries); + } KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( nrows + 1, c_rowmap); @@ -575,8 +722,7 @@ void spadd_numeric(KernelHandle* kernel_handle, const alno_row_view_t_ a_rowmap, typedef typename KernelHandle::size_type size_type; typedef typename KernelHandle::nnz_lno_t ordinal_type; typedef typename KernelHandle::nnz_scalar_t scalar_type; - typedef - typename KernelHandle::SPADDHandleType::execution_space execution_space; + typedef typename KernelHandle::SPADDHandleType::execution_space execution_space; // Check that A/B/C data types match KernelHandle types, and that C data types // are nonconst (doesn't matter if A/B types are const) static_assert(SAME_TYPE(ascalar_t_, scalar_type), diff --git a/src/sparse/KokkosSparse_spadd_handle.hpp b/src/sparse/KokkosSparse_spadd_handle.hpp index dcc5fcd9ee..6bf72013e4 100644 --- a/src/sparse/KokkosSparse_spadd_handle.hpp +++ b/src/sparse/KokkosSparse_spadd_handle.hpp @@ -83,7 +83,7 @@ class SPADDHandle { * \param result_nnz_size: size of the output matrix. */ - void set_a_b_pos(nnz_lno_view_t a_pos_in, const nnz_lno_view_t b_pos_in) + void set_a_b_pos(const nnz_lno_view_t& a_pos_in, const nnz_lno_view_t& b_pos_in) { a_pos = a_pos_in; b_pos = b_pos_in; From 0ef1c29ebc0ae1f6b8639a2aac6e28712dc5f444 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 6 Apr 2021 13:03:48 -0600 Subject: [PATCH 3/9] Make read_kokkos_crst_matrix get columns when known MatrixMarket files include the number of columns, so use that to construct the CrsMatrix. Previously, would compute #cols as max entry plus 1, but this may not be correct if there are empty columns. --- perf_test/sparse/KokkosSparse_spadd.cpp | 13 ++--- src/common/KokkosKernels_IOUtils.hpp | 69 ++++++++++++++++--------- 2 files changed, 47 insertions(+), 35 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index ab003f5b9b..323d1af240 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -125,8 +125,6 @@ void run_experiment(const Params& params) { std::cout << "Loading B from " << params.bmtx << '\n'; B = KokkosKernels::Impl::read_kokkos_crst_matrix(params.bmtx.c_str()); - if(B.numRows() != m || B.numCols() != n) - throw std::runtime_error("A, B dimensions don't match"); } else if(params.bDiag) { @@ -160,17 +158,12 @@ void run_experiment(const Params& params) B = KokkosKernels::Impl::kk_generate_sparse_matrix(m, n, nnzUnused, 0, (n + 3) / 3); } //Make sure dimensions are compatible - if(A.numRows() != B.numRows()) + if(A.numRows() != B.numRows() || A.numCols() != B.numCols()) { - std::cout << "ERROR: A and B have different numbers of rows\n"; + std::cout << "ERROR: A is " << A.numRows() << 'x' << A.numCols() << ", but B is " << B.numRows() << 'x' << B.numCols() << '\n'; exit(1); } - if(A.numCols() != B.numCols()) - { - std::cout << "ERROR: A and B have different numbers of columns\n"; - exit(1); - } - std::cout << "Have A and B: " << m << "x" << n << ", " << A.nnz() << " and " << B.nnz() << " entries.\n"; + std::cout << "A and B are " << m << "x" << n << ". A, B have " << A.nnz() << " and " << B.nnz() << " entries.\n"; typedef typename crsMat_t::values_type::non_const_type scalar_view_t; typedef typename crsMat_t::StaticCrsGraphType::row_map_type::non_const_type lno_view_t; diff --git a/src/common/KokkosKernels_IOUtils.hpp b/src/common/KokkosKernels_IOUtils.hpp index 6f2b15c729..dae65716ee 100644 --- a/src/common/KokkosKernels_IOUtils.hpp +++ b/src/common/KokkosKernels_IOUtils.hpp @@ -1024,7 +1024,7 @@ void write_kokkos_crst_matrix(crs_matrix_t a_crsmat,const char *filename){ template int read_mtx ( const char *fileName, - lno_t *nv, size_type *ne, + lno_t *nrows, lno_t* ncols, size_type *ne, size_type **xadj, lno_t **adj, scalar_t **ew, bool symmetrize = false, bool remove_diagonal = true, bool transpose = false) @@ -1215,7 +1215,8 @@ int read_mtx ( nc = tmp; } //idx *nv, idx *ne, idx **xadj, idx **adj, wt **wt - *nv = nr; + *nrows = nr; + *ncols = nc; *ne = nE; //*xadj = new idx[nr + 1]; md_malloc(xadj, nr+1); @@ -1243,6 +1244,20 @@ int read_mtx ( return 0; } +//Version of read_mtx which does not capture the number of columns. +//This is the old interface; it's kept for backwards compatibility. +template +int read_mtx ( + const char *fileName, + lno_t *nv, size_type *ne, + size_type **xadj, lno_t **adj, scalar_t **ew, + bool symmetrize = false, bool remove_diagonal = true, + bool transpose = false) +{ + lno_t ncol; //will discard + return read_mtx(fileName, nv, &ncol, ne, xadj, adj, ew, symmetrize, remove_diagonal, transpose); +} + template void read_matrix(lno_t *nv, size_type *ne,size_type **xadj, lno_t **adj, scalar_t **ew, const char *filename){ @@ -1270,6 +1285,8 @@ void read_matrix(lno_t *nv, size_type *ne,size_type **xadj, lno_t **adj, scalar_ template crsMat_t read_kokkos_crst_matrix(const char * filename_){ + std::string strfilename(filename_); + bool isMatrixMarket = endswith(strfilename, ".mtx") || endswith(strfilename, ".mm"); typedef typename crsMat_t::StaticCrsGraphType graph_t; typedef typename graph_t::row_map_type::non_const_type row_map_view_t; @@ -1280,43 +1297,45 @@ crsMat_t read_kokkos_crst_matrix(const char * filename_){ typedef typename cols_view_t::value_type lno_t; typedef typename values_view_t::value_type scalar_t; - - lno_t nv, *adj; + lno_t nr, nc, *adj; size_type *xadj, nnzA; scalar_t *values; - read_matrix( - &nv, &nnzA, &xadj, &adj, &values, filename_); - row_map_view_t rowmap_view("rowmap_view", nv+1); + if(isMatrixMarket) + { + //MatrixMarket file contains the exact number of columns + read_mtx ( + filename_, &nr, &nc, &nnzA, &xadj, &adj, &values, false, false, false); + } + else + { + //.crs and .bin files don't contain #cols, so will compute it later based on the entries + read_matrix( + &nr, &nnzA, &xadj, &adj, &values, filename_); + } + + row_map_view_t rowmap_view("rowmap_view", nr + 1); cols_view_t columns_view("colsmap_view", nnzA); values_view_t values_view("values_view", nnzA); - { - typename row_map_view_t::HostMirror hr = Kokkos::create_mirror_view (rowmap_view); - typename cols_view_t::HostMirror hc = Kokkos::create_mirror_view (columns_view); - typename values_view_t::HostMirror hv = Kokkos::create_mirror_view (values_view); - - for (lno_t i = 0; i <= nv; ++i){ - hr(i) = xadj[i]; - } - - for (size_type i = 0; i < nnzA; ++i){ - hc(i) = adj[i]; - hv(i) = values[i]; - } + Kokkos::View> hr(xadj, nr + 1); + Kokkos::View> hc(adj, nnzA); + Kokkos::View> hv(values, nnzA); Kokkos::deep_copy (rowmap_view , hr); Kokkos::deep_copy (columns_view , hc); Kokkos::deep_copy (values_view , hv); } - lno_t ncols = 0; - KokkosKernels::Impl::kk_view_reduce_max - (nnzA, columns_view, ncols); - ncols += 1; + if(!isMatrixMarket) + { + KokkosKernels::Impl::kk_view_reduce_max + (nnzA, columns_view, nc); + nc++; + } graph_t static_graph (columns_view, rowmap_view); - crsMat_t crsmat("CrsMatrix", ncols, values_view, static_graph); + crsMat_t crsmat("CrsMatrix", nc, values_view, static_graph); delete [] xadj; delete [] adj; delete [] values; return crsmat; } From d33691b58628b8a440923cb3a49f5dde05900621 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 6 Apr 2021 14:51:13 -0600 Subject: [PATCH 4/9] SpAdd: inst TeamPol SortedCountEntries for GPU only to reduce code size. --- src/sparse/KokkosSparse_spadd.hpp | 200 ++++++++++++++++++++++-------- 1 file changed, 148 insertions(+), 52 deletions(-) diff --git a/src/sparse/KokkosSparse_spadd.hpp b/src/sparse/KokkosSparse_spadd.hpp index 59ad6b9e28..b92c9ab928 100644 --- a/src/sparse/KokkosSparse_spadd.hpp +++ b/src/sparse/KokkosSparse_spadd.hpp @@ -73,8 +73,59 @@ and Bpos each contain the final index within C row where the A/B entry belongs template -struct SortedCountEntries { - SortedCountEntries(ordinal_type nrows_, +struct SortedCountEntriesRange { + SortedCountEntriesRange(ordinal_type nrows_, + const typename ARowPtrsT::const_type& Arowptrs_, + const AColIndsT& Acolinds_, + const typename BRowPtrsT::const_type& Browptrs_, + const BColIndsT& Bcolinds_, const CRowPtrsT& Crowcounts_) + : nrows(nrows_), + Arowptrs(Arowptrs_), + Acolinds(Acolinds_), + Browptrs(Browptrs_), + Bcolinds(Bcolinds_), + Crowcounts(Crowcounts_) {} + + KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const + { + const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); + + // count the union of nonzeros in Arow and Brow + size_type numEntries = 0; + size_type ai = 0; + size_type bi = 0; + size_type Arowstart = Arowptrs(i); + size_type Arowlen = Arowptrs(i + 1) - Arowstart; + size_type Browstart = Browptrs(i); + size_type Browlen = Browptrs(i + 1) - Browstart; + ordinal_type Acol = (Arowlen == 0) ? ORDINAL_MAX : Acolinds(Arowstart); + ordinal_type Bcol = (Browlen == 0) ? ORDINAL_MAX : Bcolinds(Browstart); + while (Acol != ORDINAL_MAX || Bcol != ORDINAL_MAX) { + ordinal_type Ccol = (Acol < Bcol) ? Acol : Bcol; + numEntries++; + //Eat all entries in both A and B which have this column + //This also results in Acol/Bcol being updated to following entries for next loop iter + while(Acol == Ccol) + Acol = (ai == Arowlen) ? ORDINAL_MAX : Acolinds(Arowstart + ai++); + while(Bcol == Ccol) + Bcol = (bi == Browlen) ? ORDINAL_MAX : Bcolinds(Browstart + bi++); + } + Crowcounts(i) = numEntries; + } + + ordinal_type nrows; + const typename ARowPtrsT::const_type Arowptrs; + const AColIndsT Acolinds; + const typename BRowPtrsT::const_type Browptrs; + const BColIndsT Bcolinds; + CRowPtrsT Crowcounts; +}; + +template +struct SortedCountEntriesTeam { + SortedCountEntriesTeam(ordinal_type nrows_, const typename ARowPtrsT::const_type& Arowptrs_, const AColIndsT& Acolinds_, const typename BRowPtrsT::const_type& Browptrs_, @@ -89,7 +140,7 @@ struct SortedCountEntries { using TeamPol = Kokkos::TeamPolicy; using TeamMem = typename TeamPol::member_type; - KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type i) const + KOKKOS_INLINE_FUNCTION void longRowFallback(const ordinal_type i) const { const ordinal_type ORDINAL_MAX = Kokkos::ArithTraits::max(); @@ -135,7 +186,7 @@ struct SortedCountEntries { Kokkos::single(Kokkos::PerThread(t), [=]() { - this->operator()(i); + longRowFallback(i); }); return; } @@ -379,6 +430,92 @@ struct MergeEntriesFunctor { CcolindsT Bpos; }; +//Run SortedCountEntries: non-GPU, always uses the RangePolicy version. +template +void runSortedCountEntries( + const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, + const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, + const clno_row_view_t_& c_rowmap, + typename std::enable_if()>::type* = nullptr) +{ + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using execution_space = typename KernelHandle::SPADDHandleType::execution_space; + using range_type = Kokkos::RangePolicy; + auto nrows = c_rowmap.extent(0) - 1; + SortedCountEntriesRange + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + range_type(0, nrows), countEntries); +} + +//Run SortedCountEntries: GPU, uses the TeamPolicy or RangePolicy depending +// on average nz per row (a runtime decision) +template +void runSortedCountEntries( + const alno_row_view_t_& a_rowmap, const alno_nnz_view_t_& a_entries, + const blno_row_view_t_& b_rowmap, const blno_nnz_view_t_& b_entries, + const clno_row_view_t_& c_rowmap, + typename std::enable_if()>::type* = nullptr) +{ + using size_type = typename KernelHandle::size_type; + using ordinal_type = typename KernelHandle::nnz_lno_t; + using execution_space = typename KernelHandle::SPADDHandleType::execution_space; + using RangePol = Kokkos::RangePolicy; + using TeamPol = Kokkos::TeamPolicy; + auto nrows = c_rowmap.extent(0) - 1; + size_type c_est_nnz = 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; + if(c_est_nnz <= 512) + { + //Convert c_est_nnz to a power of 2 + size_type pot_est_nnz = 1; + while(pot_est_nnz < c_est_nnz) + pot_est_nnz *= 2; + //Estimate max number of uncompressed entries in each row of C + int vector_length = 1; + int vector_length_max = TeamPol::vector_length_max(); + while(vector_length * 2 <= vector_length_max && + (size_type) vector_length * 2 <= pot_est_nnz) + { + vector_length *= 2; + } + SortedCountEntriesTeam + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + countEntries.sharedPerThread = pot_est_nnz; + //compute largest possible team size + TeamPol testPolicy(1, 1, vector_length); + testPolicy.set_scratch_size(0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); + int team_size = testPolicy.team_size_recommended(countEntries, Kokkos::ParallelForTag()); + //construct real policy + int league_size = (nrows + team_size - 1) / team_size; + TeamPol policy(league_size, team_size, vector_length); + policy.set_scratch_size(0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); + countEntries.totalShared = countEntries.sharedPerThread * team_size * sizeof(ordinal_type); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + policy, countEntries); + } + else + { + SortedCountEntriesRange + countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + Kokkos::parallel_for( + "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", + RangePol(0, nrows), countEntries); + } +} + // Symbolic: count entries in each row in C to produce rowmap // kernel handle has information about whether it is sorted add or not. template range_type; using NoInitialize = Kokkos::ViewAllocateWithoutInitializing; if (addHandle->is_input_sorted()) { - // call entry count functor to get entry counts per row - size_type c_est_nnz = 1.4 * (a_entries.extent(0) + b_entries.extent(0)) / nrows; - if(KokkosKernels::Impl::kk_is_gpu_exec_space() && c_est_nnz <= 512) - { - //Convert c_est_nnz to a power of 2 - size_type pot_est_nnz = 1; - while(pot_est_nnz < c_est_nnz) - pot_est_nnz *= 2; - using TeamPol = Kokkos::TeamPolicy; - //Estimate max number of uncompressed entries in each row of C - int vector_length = 1; - int vector_length_max = TeamPol::vector_length_max(); - while(vector_length * 2 <= vector_length_max && - (size_type) vector_length * 2 <= pot_est_nnz) - { - vector_length *= 2; - } - SortedCountEntries - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - countEntries.sharedPerThread = pot_est_nnz; - //compute largest possible team size - TeamPol testPolicy(1, 1, vector_length); - testPolicy.set_scratch_size(0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); - int team_size = testPolicy.team_size_recommended(countEntries, Kokkos::ParallelForTag()); - //construct real policy - int league_size = (nrows + team_size - 1) / team_size; - TeamPol policy(league_size, team_size, vector_length); - policy.set_scratch_size(0, Kokkos::PerThread(pot_est_nnz * sizeof(ordinal_type))); - countEntries.totalShared = countEntries.sharedPerThread * team_size * sizeof(ordinal_type); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - policy, countEntries); - } - else - { - SortedCountEntries - countEntries(nrows, a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); - Kokkos::parallel_for( - "KokkosSparse::SpAdd::Symbolic::InputSorted::CountEntries", - range_type(0, nrows), countEntries); - } - KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum( - nrows + 1, c_rowmap); + runSortedCountEntries + + (a_rowmap, a_entries, b_rowmap, b_entries, c_rowmap); + KokkosKernels::Impl::kk_exclusive_parallel_prefix_sum + + (nrows + 1, c_rowmap); } else { // note: scoping individual parts of the process to free views sooner, // minimizing peak memory usage run the unsorted c_rowmap upper bound From 0c8d300560366e9dbc2df0e523e7b52a3a2bc305 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 6 Apr 2021 15:33:44 -0600 Subject: [PATCH 5/9] Updated spadd perf test help text --- perf_test/sparse/KokkosSparse_spadd.cpp | 30 ++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 323d1af240..c233fbee47 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -373,7 +373,19 @@ void print_options(){ std::cerr << "\t[Optional] --amtx :: 1st input matrix" << std::endl; std::cerr << "\t[Optional] --bmtx :: 2nd input matrix" << std::endl; std::cerr << "\t[Optional] --cmtx :: output matrix for C = A+B" << std::endl; - std::cerr << "\t[Optional] Verbose Output: '--verbose'" << std::endl; + std::cerr << "\t[Optional] --mkl :: run SpAdd from MKL" << std::endl; + std::cerr << "\t[Optional] --cusparse :: run SpAdd from cuSPARSE " << std::endl; + std::cerr << "\t[Optional] --sorted :: sort rows of inputs, and run the sorted algorithm" << std::endl; + std::cerr << "\t[Optional] --unsorted :: run the unsorted algorithm" << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall spadd (symbolic + repeated numeric)" << std::endl; + std::cerr << "\t[Optional] --numeric-repeat :: how many times to repeat numeric per symbolic" << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" << std::endl; + std::cerr << "\nSettings for randomly generated A/B matrices" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" << std::endl; + std::cerr << "\t[Optional] --nnz :: number of entries per row to generate" << std::endl; + std::cerr << "\t[Optional] --nnz :: number of entries per row to generate" << std::endl; + std::cerr << "\t[Optional] --bdiag :: generate B as a diagonal matrix" << std::endl; } int parse_inputs (Params& params, int argc, char **argv){ @@ -463,8 +475,24 @@ int main (int argc, char ** argv){ Kokkos::initialize( Kokkos::InitArguments( num_threads, -1, device_id ) ); //Kokkos::print_configuration(std::cout); + //First, make sure that requested TPL (if any) is actually available +#if !defined(KOKKOSKERNELS_ENABLE_TPL_MKL) + if(params.use_mkl) + throw std::invalid_argument("To run MKL SpAdd, must enable the MKL TPL in cmake"); +#endif +#if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) + if(params.use_cusparse) + throw std::invalid_argument("To run cuSPARSE SpAdd, must enable the cuSPARSE TPL in cmake"); +#endif + bool useOMP = params.use_openmp != 0; bool useCUDA = params.use_cuda != 0; + + if(params.use_cusparse && !useCUDA) + { + throw std::invalid_argument("To run cuSPARSE SpAdd, must supply the '--cuda ' flag"); + } + bool useSerial = !useOMP && !useCUDA; if(useOMP) From 1ad52735713de49c3b993818b52d05697d053e39 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 6 Apr 2021 16:55:09 -0600 Subject: [PATCH 6/9] spadd perftest: don't allow outputting c with mkl --- perf_test/sparse/KokkosSparse_spadd.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index c233fbee47..cd96a1162f 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -242,7 +242,7 @@ void run_experiment(const Params& params) #endif char* cusparseBuffer; - int c_nnz; + int c_nnz = 0; const double alphabeta = 1.0; for(int sumRep = 0; sumRep < params.repeat; sumRep++) @@ -493,6 +493,11 @@ int main (int argc, char ** argv){ throw std::invalid_argument("To run cuSPARSE SpAdd, must supply the '--cuda ' flag"); } + if(params.cmtx.length() && params.use_mkl) + { + throw std::invalid_argument("If running MKL, can't output the result to file"); + } + bool useSerial = !useOMP && !useCUDA; if(useOMP) From 98f2aabcbb96c9753acb776476644ca7e3e2397b Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Tue, 6 Apr 2021 18:23:01 -0600 Subject: [PATCH 7/9] Fix unused var warnings --- perf_test/sparse/KokkosSparse_spadd.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index cd96a1162f..b1155d8af6 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -210,6 +210,9 @@ void run_experiment(const Params& params) cusparseMatDescr_t A_cusparse; cusparseMatDescr_t B_cusparse; cusparseMatDescr_t C_cusparse; + char* cusparseBuffer; + const double alphabeta = 1.0; + if(params.use_cusparse) { cusparseCreate(&cusparseHandle); @@ -241,9 +244,7 @@ void run_experiment(const Params& params) } #endif - char* cusparseBuffer; int c_nnz = 0; - const double alphabeta = 1.0; for(int sumRep = 0; sumRep < params.repeat; sumRep++) { From f8917fa1e143041f47d04a6623f7912d53ee4827 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 9 Apr 2021 12:21:58 -0600 Subject: [PATCH 8/9] Improve TPL error checking (spadd perftest) --- perf_test/sparse/KokkosSparse_spadd.cpp | 81 ++++++++++++++----------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index b1155d8af6..2e11da6b20 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -46,6 +46,7 @@ #include "KokkosKernels_config.h" #include "KokkosKernels_Handle.hpp" #include "KokkosKernels_IOUtils.hpp" +#include "KokkosKernels_SparseUtils_cusparse.hpp" #include "KokkosSparse_spadd.hpp" #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE @@ -55,6 +56,19 @@ #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL #include #include + +inline void spadd_mkl_internal_safe_call(sparse_status_t mklStatus, + const char* name, + const char* file = nullptr, + const int line = 0) { + if (SPARSE_STATUS_SUCCESS != mklStatus) { + std::cerr << "MKL call \"" << name << "\" encountered error at " << file << ":" << line << '\n'; + Kokkos::abort(); + } +} + +#define SPADD_MKL_SAFE_CALL(call) \ + spadd_mkl_internal_safe_call(call, #call, __FILE__, __LINE__) #endif #if defined(KOKKOSKERNELS_INST_DOUBLE) && \ @@ -215,32 +229,30 @@ void run_experiment(const Params& params) if(params.use_cusparse) { - cusparseCreate(&cusparseHandle); - cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST); - cusparseCreateMatDescr(&A_cusparse); - cusparseCreateMatDescr(&B_cusparse); - cusparseCreateMatDescr(&C_cusparse); - cusparseSetMatType(A_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatType(B_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatType(C_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatDiagType(A_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT); - cusparseSetMatDiagType(B_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT); - cusparseSetMatDiagType(C_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT); - cusparseSetMatIndexBase(A_cusparse, CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatIndexBase(B_cusparse, CUSPARSE_INDEX_BASE_ZERO); - cusparseSetMatIndexBase(C_cusparse, CUSPARSE_INDEX_BASE_ZERO); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreate(&cusparseHandle)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetPointerMode(cusparseHandle, CUSPARSE_POINTER_MODE_HOST)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&A_cusparse)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&B_cusparse)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseCreateMatDescr(&C_cusparse)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(A_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(B_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatType(C_cusparse, CUSPARSE_MATRIX_TYPE_GENERAL)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatDiagType(A_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatDiagType(B_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatDiagType(C_cusparse, CUSPARSE_DIAG_TYPE_NON_UNIT)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(A_cusparse, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(B_cusparse, CUSPARSE_INDEX_BASE_ZERO)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseSetMatIndexBase(C_cusparse, CUSPARSE_INDEX_BASE_ZERO)); } #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL sparse_matrix_t Amkl, Bmkl, Cmkl; if(params.use_mkl) { - if(SPARSE_STATUS_SUCCESS != mkl_sparse_d_create_csr(&Amkl, SPARSE_INDEX_BASE_ZERO, m, n, - (int*) A.graph.row_map.data(), (int*) A.graph.row_map.data() + 1, A.graph.entries.data(), A.values.data())) - throw std::runtime_error("Failed to create A MKL handle"); - if(SPARSE_STATUS_SUCCESS != mkl_sparse_d_create_csr(&Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, - (int*) B.graph.row_map.data(), (int*) B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())) - throw std::runtime_error("Failed to create B MKL handle"); + SPADD_MKL_SAFE_CALL(mkl_sparse_d_create_csr(&Amkl, SPARSE_INDEX_BASE_ZERO, m, n, + (int*) A.graph.row_map.data(), (int*) A.graph.row_map.data() + 1, A.graph.entries.data(), A.values.data())); + SPADD_MKL_SAFE_CALL(mkl_sparse_d_create_csr(&Bmkl, SPARSE_INDEX_BASE_ZERO, m, n, + (int*) B.graph.row_map.data(), (int*) B.graph.row_map.data() + 1, B.graph.entries.data(), B.values.data())); } #endif @@ -263,20 +275,20 @@ void run_experiment(const Params& params) #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE //Symbolic phase: compute buffer size, then compute nnz size_t bufferSize; - cusparseDcsrgeam2_bufferSizeExt(cusparseHandle, + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2_bufferSizeExt(cusparseHandle, A.numRows(), A.numCols(), &alphabeta, A_cusparse, A.nnz(), A.values.data(), A.graph.row_map.data(), A.graph.entries.data(), &alphabeta, B_cusparse, B.nnz(), B.values.data(), B.graph.row_map.data(), B.graph.entries.data(), - C_cusparse, NULL, row_mapC.data(), NULL, &bufferSize); + C_cusparse, NULL, row_mapC.data(), NULL, &bufferSize)); //Allocate work buffer - cudaMalloc((void**) &cusparseBuffer, bufferSize); - cusparseXcsrgeam2Nnz(cusparseHandle, m, n, + CUDA_SAFE_CALL(cudaMalloc((void**) &cusparseBuffer, bufferSize)); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseXcsrgeam2Nnz(cusparseHandle, m, n, A_cusparse, A.nnz(), A.graph.row_map.data(), A.graph.entries.data(), B_cusparse, B.nnz(), B.graph.row_map.data(), B.graph.entries.data(), C_cusparse, row_mapC.data(), &c_nnz, - cusparseBuffer); + cusparseBuffer)); #endif } if(!params.use_mkl) @@ -294,23 +306,20 @@ void run_experiment(const Params& params) if(params.use_cusparse) { #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE - cusparseDcsrgeam2(cusparseHandle, m, n, + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDcsrgeam2(cusparseHandle, m, n, &alphabeta, A_cusparse, A.nnz(), A.values.data(), A.graph.row_map.data(), A.graph.entries.data(), &alphabeta, B_cusparse, B.nnz(), B.values.data(), B.graph.row_map.data(), B.graph.entries.data(), C_cusparse, valuesC.data(), row_mapC.data(), entriesC.data(), - cusparseBuffer); + cusparseBuffer)); #endif } else if(params.use_mkl) { #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL - if(SPARSE_STATUS_SUCCESS != mkl_sparse_d_add(SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)) - { - throw std::runtime_error("MKL spadd failed"); - } - mkl_sparse_destroy(Cmkl); + SPADD_MKL_SAFE_CALL(mkl_sparse_d_add(SPARSE_OPERATION_NON_TRANSPOSE, Amkl, 1.0, Bmkl, &Cmkl)); + SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Cmkl)); #endif } else @@ -324,20 +333,20 @@ void run_experiment(const Params& params) numericTime += timer.seconds(); #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE if(params.use_cusparse) - cudaFree(cusparseBuffer); + CUDA_SAFE_CALL(cudaFree(cusparseBuffer)); #endif } #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE if(params.use_cusparse) - cusparseDestroy(cusparseHandle); + KOKKOS_CUSPARSE_SAFE_CALL(cusparseDestroy(cusparseHandle)); #endif #ifdef KOKKOSKERNELS_ENABLE_TPL_MKL if(params.use_mkl) { - mkl_sparse_destroy(Amkl); - mkl_sparse_destroy(Bmkl); + SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Amkl)); + SPADD_MKL_SAFE_CALL(mkl_sparse_destroy(Bmkl)); } #endif From f53f18b02f0cf1fc79cd0a6c89b1c6894501ac11 Mon Sep 17 00:00:00 2001 From: Brian Kelley Date: Fri, 9 Apr 2021 12:47:18 -0600 Subject: [PATCH 9/9] Fix mkl error checking --- perf_test/sparse/KokkosSparse_spadd.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/perf_test/sparse/KokkosSparse_spadd.cpp b/perf_test/sparse/KokkosSparse_spadd.cpp index 2e11da6b20..ee60789b6c 100644 --- a/perf_test/sparse/KokkosSparse_spadd.cpp +++ b/perf_test/sparse/KokkosSparse_spadd.cpp @@ -62,8 +62,9 @@ inline void spadd_mkl_internal_safe_call(sparse_status_t mklStatus, const char* file = nullptr, const int line = 0) { if (SPARSE_STATUS_SUCCESS != mklStatus) { - std::cerr << "MKL call \"" << name << "\" encountered error at " << file << ":" << line << '\n'; - Kokkos::abort(); + std::ostringstream oss; + oss << "MKL call \"" << name << "\" encountered error at " << file << ":" << line << '\n'; + Kokkos::abort(oss.str().c_str()); } }