From 9095beb5cbb8a34aab5362daa9cd1f14bbf10d85 Mon Sep 17 00:00:00 2001 From: Luc Berger-Vergiat Date: Wed, 21 Sep 2022 09:50:34 -0600 Subject: [PATCH] MDF: improving performance and adding performance test The performance test can generate random matrices and random diagonal matrices, or read a matrix from file. It collects timings for the handle creation, symbolic phase and numeric phase of the MDF algorithm. A small change in the method names is made to make MDF more uniform with the rest of the library. The unit-test is improved by checking the results in L and U against the analytical solution. Performance is improved mostly by changing the way the discarded fill is computed at each factorization step, only selecting rows that were impacted by the last factorized row. --- perf_test/sparse/CMakeLists.txt | 5 + perf_test/sparse/KokkosSparse_mdf.cpp | 320 +++++++++++++++++++++++++ sparse/impl/KokkosSparse_mdf_impl.hpp | 236 ++++++++++++++++-- sparse/src/KokkosSparse_mdf.hpp | 67 ++++-- sparse/src/KokkosSparse_mdf_handle.hpp | 25 +- sparse/unit_test/Test_Sparse_mdf.hpp | 87 ++++++- 6 files changed, 688 insertions(+), 52 deletions(-) create mode 100644 perf_test/sparse/KokkosSparse_mdf.cpp diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index a574ed101f..6eac716aca 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -110,3 +110,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE( sparse_spiluk SOURCES KokkosSparse_spiluk.cpp ) + +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_mdf + SOURCES KokkosSparse_mdf.cpp +) diff --git a/perf_test/sparse/KokkosSparse_mdf.cpp b/perf_test/sparse/KokkosSparse_mdf.cpp new file mode 100644 index 0000000000..ca48df8fd2 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_mdf.cpp @@ -0,0 +1,320 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). 
+// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_config.h" +#include "KokkosKernels_Handle.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosSparse_mdf.hpp" +#include "KokkosKernels_TestUtils.hpp" + +struct Params { + int use_cuda = 0; + int use_hip = 0; + int use_sycl = 0; + int use_openmp = 0; + int use_threads = 0; + std::string amtx; + int m = 10000; + int n = 10000; + int nnzPerRow = 30; + bool diag = false; // Whether B should be diagonal only (requires A square) + bool verbose = false; + int repeat = 1; +}; + +template +struct diag_generator_functor { + using size_type = typename row_map_t::non_const_value_type; + + row_map_t row_map; + entries_t entries; + + diag_generator_functor(row_map_t row_map_, entries_t entries_) + : row_map(row_map_), entries(entries_){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type rowIdx) const { + row_map(rowIdx + 1) = rowIdx + 1; + entries(rowIdx) = rowIdx; + } +}; + +template +void run_experiment(const Params& params) { + using size_type = typename crsMat_t::size_type; + using lno_t = typename crsMat_t::ordinal_type; + using scalar_t = typename crsMat_t::value_type; + using device_t = typename crsMat_t::device_type; + using exec_space = typename device_t::execution_space; + + using graph_t = typename crsMat_t::StaticCrsGraphType; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + + std::cout << "************************************* \n"; + std::cout << "************************************* 
\n"; + crsMat_t A; + lno_t m = params.m; + lno_t n = params.n; + if (params.amtx.length()) { + std::cout << "Loading A from " << params.amtx << '\n'; + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.amtx.c_str()); + m = A.numRows(); + n = A.numCols(); + } else { + if (params.diag) { + std::cout << "Randomly generating diag matrix\n"; + rowmap_t rowmapA("A row map", m + 1); + entries_t entriesA("A entries", m); + values_t valuesA("A values", m); + + // Generate the graph of A + diag_generator_functor diag_generator(rowmapA, entriesA); + Kokkos::parallel_for(Kokkos::RangePolicy(0, m), + diag_generator); + + // Generate the values of A + Kokkos::Random_XorShift64_Pool rand_pool(13718); + Kokkos::fill_random(valuesA, rand_pool, + 10 * Kokkos::ArithTraits::one()); + + // Actually put A together + graph_t graph(entriesA, rowmapA); + A = crsMat_t("A matrix", m, valuesA, graph); + } else { + std::cout << "Randomly generating matrix\n"; + size_type nnzUnused = m * params.nnzPerRow; + A = KokkosSparse::Impl::kk_generate_sparse_matrix( + m, n, nnzUnused, 0, (n + 3) / 3); + } + } + + if (params.verbose) { + std::cout << "Matrix A" << std::endl; + std::cout << " row_map A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.graph.row_map); + std::cout << " entries A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.graph.entries); + std::cout << " values A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.values); + std::cout << std::endl; + } + + Kokkos::Timer timer; + double handleTime = 0; + double symbolicTime = 0; + double numericTime = 0; + + timer.reset(); + KokkosSparse::Experimental::MDF_handle handle(A); + handle.set_verbosity(0); + handleTime += timer.seconds(); + + for (int sumRep = 0; sumRep < params.repeat; sumRep++) { + timer.reset(); + KokkosSparse::Experimental::mdf_symbolic(A, handle); + Kokkos::fence(); + symbolicTime += timer.seconds(); + + timer.reset(); + KokkosSparse::Experimental::mdf_numeric(A, handle); + Kokkos::fence(); + 
numericTime += timer.seconds(); + } + + std::cout << "Mean total time: " + << handleTime + (symbolicTime / params.repeat) + + (numericTime / params.repeat) + << std::endl + << "Handle time: " << handleTime << std::endl + << "Mean symbolic time: " << (symbolicTime / params.repeat) + << std::endl + << "Mean numeric time: " << (numericTime / params.repeat) + << std::endl; + + if (params.verbose) { + entries_t permutation = handle.get_permutation(); + + std::cout << "MDF permutation:" << std::endl; + KokkosKernels::Impl::print_1Dview(permutation); + } +} // run_experiment + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr + << "\t[Required] BACKEND: '--threads[numThreads]' | '--openmp " + "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" + " | '--sycl [syclDeviceIndex]'" + << std::endl; + + std::cerr << "\t[Optional] --amtx :: input matrix" << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + "MDF" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\nSettings for randomly generated A matrix" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; + std::cerr + << "\t[Optional] --nnz :: number of entries per row to generate" + << std::endl; + std::cerr << "\t[Optional] --diag :: generate a diagonal matrix" + << std::endl; +} // print_options + +int parse_inputs(Params& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "--threads")) { + params.use_threads = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { + params.use_openmp = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { + params.use_cuda = atoi(argv[++i]) + 1; + } else if (0 == 
Test::string_compare_no_case(argv[i], "--hip")) { + params.use_hip = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { + params.use_sycl = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { + params.amtx = argv[++i]; + } else if (0 == Test::string_compare_no_case(argv[i], "--m")) { + params.m = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { + params.n = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--nnz")) { + params.nnzPerRow = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--diag")) { + params.diag = true; + } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { + params.repeat = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { + params.verbose = true; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +int main(int argc, char** argv) { + Params params; + + if (parse_inputs(params, argc, argv)) { + return 1; + } + const int num_threads = + std::max(params.use_openmp, + params.use_threads); // Assumption is that use_openmp variable + // is provided as number of threads + + // If cuda, hip or sycl is used, set device_id + int device_id = 0; + if (params.use_cuda > 0) { + device_id = params.use_cuda - 1; + } + if (params.use_hip > 0) { + device_id = params.use_hip - 1; + } + if (params.use_sycl > 0) { + device_id = params.use_sycl - 1; + } + + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); + + bool useOMP = params.use_openmp != 0; + bool useThreads = params.use_threads != 0; + bool useCUDA = params.use_cuda != 0; + bool useHIP = params.use_hip != 0; + bool useSYCL = params.use_sycl != 0; + bool useSerial = !useOMP && !useCUDA && !useHIP && !useSYCL; + + if 
(useOMP) { +#if defined(KOKKOS_ENABLE_OPENMP) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + if (useThreads) { +#if defined(KOKKOS_ENABLE_THREADS) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + if (useCUDA) { +#if defined(KOKKOS_ENABLE_CUDA) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + if (useHIP) { +#if defined(KOKKOS_ENABLE_HIP) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + if (useSYCL) { +#if defined(KOKKOS_ENABLE_SYCL) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + if (useSerial) { +#if defined(KOKKOS_ENABLE_SERIAL) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } + Kokkos::finalize(); + return 0; +} // main diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index feee2d765b..b8a25485f5 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -155,12 +155,133 @@ struct MDF_discarded_fill_norm { A.graph.row_map(rowIdx) - 1); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - rowIdx, KAS::sqrt(discard_norm), deficiency(rowIdx), degree); + static_cast(rowIdx), + static_cast(KAS::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); } } }; // MDF_discarded_fill_norm +template +struct 
MDF_selective_discarded_fill_norm { + using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; + using col_ind_type = + typename static_crs_graph_type::entries_type::non_const_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using size_type = typename crs_matrix_type::size_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + + const scalar_type zero = KAS::zero(); + + crs_matrix_type A, At; + ordinal_type factorization_step; + col_ind_type permutation; + col_ind_type update_list; + + values_type discarded_fill; + col_ind_type deficiency; + int verbosity; + + MDF_selective_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, + ordinal_type factorization_step_, + col_ind_type permutation_, + col_ind_type update_list_, + values_type discarded_fill_, + col_ind_type deficiency_, int verbosity_) + : A(A_), + At(At_), + factorization_step(factorization_step_), + permutation(permutation_), + update_list(update_list_), + discarded_fill(discarded_fill_), + deficiency(deficiency_), + verbosity(verbosity_){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type i) const { + ordinal_type rowIdx = permutation(update_list(i)); + scalar_type discard_norm = zero, diag_val = zero; + bool entryIsDiscarded = true; + ordinal_type numFillEntries = 0; + for (size_type alphaIdx = At.graph.row_map(rowIdx); + alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { + ordinal_type fillRowIdx = At.graph.entries(alphaIdx); + bool row_not_eliminated = true; + for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { + if (fillRowIdx == permutation(stepIdx)) { + row_not_eliminated = false; + } + } + + if (fillRowIdx != rowIdx && row_not_eliminated) { + for (size_type betaIdx = A.graph.row_map(rowIdx); + betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { + ordinal_type fillColIdx = 
A.graph.entries(betaIdx); + bool col_not_eliminated = true; + for (ordinal_type stepIdx = 0; stepIdx < factorization_step; + ++stepIdx) { + if (fillColIdx == permutation(stepIdx)) { + col_not_eliminated = false; + } + } + + if (fillColIdx != rowIdx && col_not_eliminated) { + entryIsDiscarded = true; + for (size_type entryIdx = A.graph.row_map(fillRowIdx); + entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { + if (A.graph.entries(entryIdx) == fillColIdx) { + entryIsDiscarded = false; + } + } + if (entryIsDiscarded) { + numFillEntries += 1; + discard_norm += + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); + if (verbosity > 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Adding value A[%d,%d]=%f to discard norm of row %d\n", + static_cast(At.graph.entries(alphaIdx)), + static_cast(A.graph.entries(betaIdx)), + static_cast( + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), + static_cast(rowIdx)); + } + } + } + } + } else if (fillRowIdx == rowIdx) { + diag_val = At.values(alphaIdx); + if (verbosity > 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value dected, values(%d)=%f\n", + static_cast(rowIdx), static_cast(alphaIdx), + static_cast(At.values(alphaIdx))); + } + } + } + + // TODO add a check on `diag_val == zero` + discard_norm = discard_norm / (diag_val * diag_val); + discarded_fill(rowIdx) = discard_norm; + deficiency(rowIdx) = numFillEntries; + if (verbosity > 0) { + const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - + A.graph.row_map(rowIdx) - 1); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", + static_cast(rowIdx), + static_cast(KAS::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); + } + } + +}; // MDF_selective_discarded_fill_norm + template struct MDF_select_row { using values_type = typename 
crs_matrix_type::values_type::non_const_type; @@ -294,6 +415,8 @@ struct MDF_factorize_row { values_type valuesU; col_ind_type permutation, permutation_inv; + values_type discarded_fill; + col_ind_type factored; ordinal_type selected_row_idx, factorization_step; int verbosity; @@ -303,6 +426,7 @@ struct MDF_factorize_row { values_type valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, col_ind_type permutation_inv_, + values_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, ordinal_type factorization_step_, int verbosity_) : A(A_), @@ -315,6 +439,8 @@ struct MDF_factorize_row { valuesU(valuesU_), permutation(permutation_), permutation_inv(permutation_inv_), + discarded_fill(discarded_fill_), + factored(factored_), selected_row_idx(selected_row_idx_), factorization_step(factorization_step_), verbosity(verbosity_){}; @@ -322,6 +448,7 @@ struct MDF_factorize_row { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type /* idx */) const { const ordinal_type selected_row = permutation(selected_row_idx); + discarded_fill(selected_row) = Kokkos::ArithTraits::max(); // Swap entries in permutation vectors permutation(selected_row_idx) = permutation(factorization_step); @@ -332,7 +459,8 @@ struct MDF_factorize_row { if (verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", permutation(rowIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(permutation(rowIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -356,23 +484,27 @@ struct MDF_factorize_row { if (verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", - selected_row, diag); + static_cast(selected_row), + static_cast(diag)); } if (verbosity > 2) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; ++rowIdx) { - 
KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", int(row_mapU(rowIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(row_mapU(rowIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); for (size_type entryIdx = row_mapU(0); entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", int(entriesU(entryIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesU(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); for (size_type entryIdx = row_mapU(0); entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", valuesU(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesU(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -397,17 +529,21 @@ struct MDF_factorize_row { if (verbosity > 2) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", - int(factorization_step), int(factorization_step), - int(factorization_step + 1), int(row_mapL(factorization_step)), - int(row_mapL(factorization_step + 1))); + static_cast(factorization_step), + static_cast(factorization_step), + static_cast(factorization_step + 1), + static_cast(row_mapL(factorization_step)), + static_cast(row_mapL(factorization_step + 1))); for (size_type entryIdx = row_mapL(factorization_step); entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", int(entriesL(entryIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesL(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); for (size_type entryIdx = row_mapL(factorization_step); entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", valuesL(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesL(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -466,8 +602,10 @@ struct MDF_factorize_row { if (verbosity > 1) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "A[%d, %d] -= %f\n", 
int(fillRowIdx), int(fillColIdx), - At.values(alphaIdx) * A.values(betaIdx) / diag_val); + "A[%d, %d] -= %f\n", static_cast(fillRowIdx), + static_cast(fillColIdx), + static_cast(At.values(alphaIdx) * + A.values(betaIdx) / diag_val)); } } } @@ -484,15 +622,19 @@ struct MDF_factorize_row { } } + factored(selected_row) = 1; + if (verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", A.values(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(A.values(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", At.values(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(At.values(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -500,6 +642,70 @@ struct MDF_factorize_row { }; // MDF_factorize_row +template +struct MDF_compute_list_length { + using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: + entries_type::non_const_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; + + ordinal_type selected_row_idx; + crs_matrix_type A; + crs_matrix_type At; + col_ind_type permutation; + col_ind_type factored; + col_ind_type update_list_length; + col_ind_type update_list; + + MDF_compute_list_length(const ordinal_type rowIdx_, const crs_matrix_type& A_, + const crs_matrix_type& At_, + const col_ind_type& permutation_, + const col_ind_type factored_, + col_ind_type& update_list_length_, + col_ind_type& update_list_) + : selected_row_idx(rowIdx_), + A(A_), + At(At_), + permutation(permutation_), + factored(factored_), + update_list_length(update_list_length_), + update_list(update_list_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type /*idx*/) const { + const ordinal_type 
selected_row = permutation(selected_row_idx); + + size_type updateIdx = 0; + for (size_type entryIdx = A.graph.row_map(selected_row); + entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { + if ((A.graph.entries(entryIdx) != selected_row) && + (factored(A.graph.entries(entryIdx)) != 1)) { + update_list(updateIdx) = A.graph.entries(entryIdx); + ++updateIdx; + } + } + size_type update_rows = updateIdx; + for (size_type entryIdx = At.graph.row_map(selected_row); + entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { + if ((At.graph.entries(entryIdx) != selected_row) && + (factored(A.graph.entries(entryIdx)) != 1)) { + bool already_updated = false; + for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { + if (At.graph.entries(entryIdx) == update_list(checkIdx)) { + already_updated = true; + break; + } + } + if (already_updated == false) { + update_list(updateIdx) = At.graph.entries(entryIdx); + ++updateIdx; + } + } + } + update_list_length(0) = updateIdx; + } +}; + template struct MDF_reindex_matrix { col_ind_type permutation_inv; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 33229b6cdb..90fa3beeef 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -34,7 +34,7 @@ namespace KokkosSparse { namespace Experimental { template -void mdf_symbolic_phase(crs_matrix_type& A, MDF_handle& handle) { +void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; @@ -60,10 +60,10 @@ void mdf_symbolic_phase(crs_matrix_type& A, MDF_handle& handle) { } return; -} // mdf_symbolic_phase +} // mdf_symbolic template -void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { +void mdf_numeric(crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; using values_type = typename 
crs_matrix_type::values_type::non_const_type; @@ -78,13 +78,26 @@ void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { // compute discarded fill of each row // selected pivot based on MDF // factorize pivot row of A - crs_matrix_type Atmp = crs_matrix_type("A fill", A); + const int verbosity_level = handle.verbosity; + crs_matrix_type Atmp = crs_matrix_type("A fill", A); crs_matrix_type At = KokkosSparse::Impl::transpose_matrix(A); KokkosSparse::sort_crs_matrix(At); values_type discarded_fill("discarded fill", A.numRows()); col_ind_type deficiency("deficiency", A.numRows()); + col_ind_type update_list_length("update list length", 1); + typename col_ind_type::HostMirror update_list_length_host = + Kokkos::create_mirror_view(update_list_length); + col_ind_type update_list("update list", A.numRows()); + col_ind_type factored("factored rows", A.numRows()); + Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); + Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); + + KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( + Atmp, At, 0, handle.permutation, discarded_fill, deficiency, + verbosity_level); + Kokkos::parallel_for("MDF: initial fill computation", + range_policy_type(0, Atmp.numRows()), MDF_df_norm); - const int verbosity_level = handle.verbosity; for (ordinal_type factorization_step = 0; factorization_step < A.numRows(); ++factorization_step) { if (verbosity_level > 0) { @@ -92,44 +105,58 @@ void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { static_cast(factorization_step)); } - range_policy_type stepPolicy(factorization_step, Atmp.numRows()); - Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); - Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); - KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( - Atmp, At, factorization_step, handle.permutation, discarded_fill, - deficiency, verbosity_level); - Kokkos::parallel_for(stepPolicy, MDF_df_norm); + 
Kokkos::deep_copy(update_list_length_host, update_list_length); + range_policy_type updatePolicy(0, update_list_length_host(0)); + KokkosSparse::Impl::MDF_selective_discarded_fill_norm + MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, + update_list, discarded_fill, deficiency, + verbosity_level); + Kokkos::parallel_for("MDF: updating fill norms", updatePolicy, + MDF_update_df_norm); + range_policy_type stepPolicy(factorization_step, Atmp.numRows()); ordinal_type selected_row_idx = 0; KokkosSparse::Impl::MDF_select_row MDF_row_selector( factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, handle.permutation); - Kokkos::parallel_reduce(stepPolicy, MDF_row_selector, selected_row_idx); + Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, + selected_row_idx); + + KokkosSparse::Impl::MDF_compute_list_length + compute_list_length(selected_row_idx, Atmp, At, handle.permutation, + factored, update_list_length, update_list); + Kokkos::parallel_for("MDF: compute update list", range_policy_type(0, 1), + compute_list_length); KokkosSparse::Impl::MDF_factorize_row factorize_row( Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, selected_row_idx, factorization_step, - verbosity_level); - Kokkos::parallel_for(range_policy_type(0, 1), factorize_row); + handle.permutation_inv, discarded_fill, factored, selected_row_idx, + factorization_step, verbosity_level); + Kokkos::parallel_for("MDF: factorize row", range_policy_type(0, 1), + factorize_row); if (verbosity_level > 0) { printf("\n"); } - } + } // Loop over factorization steps KokkosSparse::Impl::MDF_reindex_matrix reindex_U( handle.permutation_inv, handle.entriesU); - Kokkos::parallel_for(range_policy_type(0, handle.entriesU.extent(0)), + Kokkos::parallel_for("MDF: re-index U", + range_policy_type(0, handle.entriesU.extent(0)), reindex_U); 
KokkosSparse::Impl::MDF_reindex_matrix reindex_L( handle.permutation_inv, handle.entriesL); - Kokkos::parallel_for(range_policy_type(0, handle.entriesL.extent(0)), + Kokkos::parallel_for("MDF: re-index L", + range_policy_type(0, handle.entriesL.extent(0)), reindex_L); + handle.L = KokkosSparse::Impl::transpose_matrix(handle.L); + return; -} // mdf_numeric_phase +} // mdf_numeric } // namespace Experimental } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index eb44657337..6f6f2658be 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -60,6 +60,8 @@ struct MDF_handle { int verbosity; + crs_matrix_type L, U; + MDF_handle(const crs_matrix_type A) : numRows(A.numRows()), permutation(col_ind_type("row permutation", A.numRows())), @@ -74,31 +76,28 @@ struct MDF_handle { entriesL = col_ind_type("entries L", nnzL); valuesL = values_type("values L", nnzL); + L = crs_matrix_type("L", numRows, numRows, nnzL, valuesL, row_mapL, + entriesL); + // Allocate U row_mapU = row_map_type("row map U", numRows + 1); entriesU = col_ind_type("entries U", nnzU); valuesU = values_type("values U", nnzU); + + U = crs_matrix_type("U", numRows, numRows, nnzU, valuesU, row_mapU, + entriesU); } col_ind_type get_permutation() { return permutation; } void sort_factors() { - KokkosSparse::sort_crs_matrix(row_mapL, entriesL, valuesL); - KokkosSparse::sort_crs_matrix(row_mapU, entriesU, valuesU); + KokkosSparse::sort_crs_matrix(L); + KokkosSparse::sort_crs_matrix(U); } - crs_matrix_type getL() { - return KokkosSparse::Impl::transpose_matrix( - crs_matrix_type("L", numRows, numRows, entriesL.extent(0), valuesL, - row_mapL, entriesL)); - } + crs_matrix_type getL() { return L; } - crs_matrix_type getU() { - return crs_matrix_type("U", numRows, numRows, entriesU.extent(0), valuesU, - row_mapU, entriesU); - } + crs_matrix_type getU() { return U; } }; } // namespace Experimental diff --git 
a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index 3fcd827292..41204c9b4d 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -32,6 +32,8 @@ void run_test_mdf() { using values_type = typename crs_matrix_type::values_type::non_const_type; using value_type = typename crs_matrix_type::value_type; + const value_type four = static_cast(4.0); + constexpr ordinal_type numRows = 16; constexpr ordinal_type numCols = 16; constexpr size_type numNonZeros = 64; @@ -70,8 +72,8 @@ void run_test_mdf() { KokkosSparse::Experimental::MDF_handle handle(A); handle.set_verbosity(0); - mdf_symbolic_phase(A, handle); - mdf_numeric_phase(A, handle); + KokkosSparse::Experimental::mdf_symbolic(A, handle); + KokkosSparse::Experimental::mdf_numeric(A, handle); col_ind_type permutation = handle.get_permutation(); @@ -83,20 +85,97 @@ void run_test_mdf() { 7, 11, 13, 14, 5, 6, 9, 10}; printf("MDF ordering: { "); for (ordinal_type idx = 0; idx < A.numRows(); ++idx) { - ; printf("%d ", static_cast(permutation_h(idx))); if (permutation_h(idx) != permutation_ref[idx]) { success = false; } } printf("}\n"); - EXPECT_TRUE(success) << "The permutation computed is different from the reference solution!"; + // Check the factors L and U handle.sort_factors(); crs_matrix_type U = handle.getU(); crs_matrix_type L = handle.getL(); + + EXPECT_TRUE(U.numRows() == 16); + EXPECT_TRUE(U.nnz() == 40); + + { + auto row_map_U = Kokkos::create_mirror(U.graph.row_map); + Kokkos::deep_copy(row_map_U, U.graph.row_map); + auto entries_U = Kokkos::create_mirror(U.graph.entries); + Kokkos::deep_copy(entries_U, U.graph.entries); + auto values_U = Kokkos::create_mirror(U.values); + Kokkos::deep_copy(values_U, U.values); + + const size_type row_map_U_ref[17] = {0, 3, 6, 9, 12, 15, 17, 20, 22, + 25, 27, 30, 32, 35, 37, 39, 40}; + const ordinal_type entries_U_ref[40] = { + 0, 4, 6, 1, 5, 8, 2, 7, 10, 3, 9, 11, 4, 5, + 12, 5, 13, 6, 7, 12, 7, 14, 8, 9, 
13, 9, 15, 10, + 11, 14, 11, 15, 12, 13, 14, 13, 15, 14, 15, 15}; + + const scalar_type val0 = static_cast(15. / 4.); + const scalar_type val1 = static_cast(val0 - 1 / val0); + const scalar_type val2 = static_cast(4 - 2 / val0); + const scalar_type val3 = + static_cast(4 - 1 / val0 - 1 / val1 - 1 / val2); + const scalar_type val4 = static_cast(4 - 2 / val1 - 2 / val3); + const scalar_type values_U_ref[40] = { + 4, -1, -1, 4, -1, -1, 4, -1, -1, 4, -1, -1, val0, -1, -1, + val1, -1, val0, -1, -1, val1, -1, val0, -1, -1, val1, -1, val0, -1, -1, + val1, -1, val2, -1, -1, val3, -1, val3, -1, val4}; + + for (int idx = 0; idx < 17; ++idx) { + EXPECT_TRUE(row_map_U_ref[idx] == row_map_U(idx)) + << "rowmap_U(" << idx << ") is wrong!"; + } + for (int idx = 0; idx < 40; ++idx) { + EXPECT_TRUE(entries_U_ref[idx] == entries_U(idx)) + << "entries_U(" << idx << ") is wrong!"; + EXPECT_NEAR_KK(values_U_ref[idx], values_U(idx), + 10 * Kokkos::ArithTraits::eps(), + "An entry in U.values is wrong!"); + } + + auto row_map_L = Kokkos::create_mirror(L.graph.row_map); + Kokkos::deep_copy(row_map_L, L.graph.row_map); + auto entries_L = Kokkos::create_mirror(L.graph.entries); + Kokkos::deep_copy(entries_L, L.graph.entries); + auto values_L = Kokkos::create_mirror(L.values); + Kokkos::deep_copy(values_L, L.values); + + const size_type row_map_L_ref[17] = {0, 1, 2, 3, 4, 6, 9, 11, 14, + 16, 19, 21, 24, 27, 31, 35, 40}; + const ordinal_type entries_L_ref[40] = { + 0, 1, 2, 3, 0, 4, 1, 4, 5, 0, 6, 2, 6, 7, + 1, 8, 3, 8, 9, 2, 10, 3, 10, 11, 4, 6, 12, 5, + 8, 12, 13, 7, 10, 12, 14, 9, 11, 13, 14, 15}; + const scalar_type values_L_ref[40] = { + 1, 1, 1, 1, -1 / four, 1, + -1 / four, -1 / val0, 1, -1 / four, 1, -1 / four, + -1 / val0, 1, -1 / four, 1, -1 / four, -1 / val0, + 1, -1 / four, 1, -1 / four, -1 / val0, 1, + -1 / val0, -1 / val0, 1, -1 / val1, -1 / val0, -1 / val2, + 1, -1 / val1, -1 / val0, -1 / val2, 1, -1 / val1, + -1 / val1, -1 / val3, -1 / val3, 1}; + + for (int idx = 0; idx < 17; 
++idx) { + EXPECT_TRUE(row_map_L_ref[idx] == row_map_L(idx)) + << "rowmap_L(" << idx << ")=" << row_map_L(idx) << " is wrong!"; + } + for (int idx = 0; idx < 40; ++idx) { + EXPECT_TRUE(entries_L_ref[idx] == entries_L(idx)) + << "entries_L(" << idx << ")=" << entries_L(idx) + << " is wrong, entries_L_ref[" << idx << "]=" << entries_L_ref[idx] + << "!"; + EXPECT_NEAR_KK(values_L_ref[idx], values_L(idx), + 10 * Kokkos::ArithTraits::eps(), + "An entry in L.values is wrong!"); + } + } } } // namespace Test