diff --git a/perf_test/sparse/CMakeLists.txt b/perf_test/sparse/CMakeLists.txt index a574ed101f..6eac716aca 100644 --- a/perf_test/sparse/CMakeLists.txt +++ b/perf_test/sparse/CMakeLists.txt @@ -110,3 +110,8 @@ KOKKOSKERNELS_ADD_EXECUTABLE( sparse_spiluk SOURCES KokkosSparse_spiluk.cpp ) + +KOKKOSKERNELS_ADD_EXECUTABLE( + sparse_mdf + SOURCES KokkosSparse_mdf.cpp +) diff --git a/perf_test/sparse/KokkosSparse_mdf.cpp b/perf_test/sparse/KokkosSparse_mdf.cpp new file mode 100644 index 0000000000..ca48df8fd2 --- /dev/null +++ b/perf_test/sparse/KokkosSparse_mdf.cpp @@ -0,0 +1,320 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include "KokkosKernels_config.h" +#include "KokkosKernels_Handle.hpp" +#include "KokkosSparse_IOUtils.hpp" +#include "KokkosSparse_Utils_cusparse.hpp" +#include "KokkosSparse_mdf.hpp" +#include "KokkosKernels_TestUtils.hpp" + +struct Params { + int use_cuda = 0; + int use_hip = 0; + int use_sycl = 0; + int use_openmp = 0; + int use_threads = 0; + std::string amtx; + int m = 10000; + int n = 10000; + int nnzPerRow = 30; + bool diag = false; // Whether B should be diagonal only (requires A square) + bool verbose = false; + int repeat = 1; +}; + +template +struct diag_generator_functor { + using size_type = typename row_map_t::non_const_value_type; + + row_map_t row_map; + entries_t entries; + + diag_generator_functor(row_map_t row_map_, entries_t entries_) + : row_map(row_map_), entries(entries_){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type rowIdx) const { + row_map(rowIdx + 1) = rowIdx + 1; + entries(rowIdx) = rowIdx; + } +}; + +template +void run_experiment(const Params& params) { + using size_type = typename crsMat_t::size_type; + using lno_t = typename crsMat_t::ordinal_type; + using scalar_t = typename crsMat_t::value_type; + using device_t = typename crsMat_t::device_type; + using exec_space = typename device_t::execution_space; + + using graph_t = typename crsMat_t::StaticCrsGraphType; + using rowmap_t = typename graph_t::row_map_type::non_const_type; + using entries_t = typename graph_t::entries_type::non_const_type; + using values_t = typename crsMat_t::values_type::non_const_type; + + std::cout << "************************************* \n"; + std::cout << "************************************* \n"; + crsMat_t A; + lno_t m = params.m; + lno_t n = params.n; + if (params.amtx.length()) { + std::cout << "Loading A from " << params.amtx << '\n'; + A = KokkosSparse::Impl::read_kokkos_crst_matrix( + params.amtx.c_str()); + m = A.numRows(); + n = A.numCols(); 
+ } else { + if (params.diag) { + std::cout << "Randomly generating diag matrix\n"; + rowmap_t rowmapA("A row map", m + 1); + entries_t entriesA("A entries", m); + values_t valuesA("A values", m); + + // Generate the graph of A + diag_generator_functor diag_generator(rowmapA, entriesA); + Kokkos::parallel_for(Kokkos::RangePolicy(0, m), + diag_generator); + + // Generate the values of A + Kokkos::Random_XorShift64_Pool rand_pool(13718); + Kokkos::fill_random(valuesA, rand_pool, + 10 * Kokkos::ArithTraits::one()); + + // Actually put A together + graph_t graph(entriesA, rowmapA); + A = crsMat_t("A matrix", m, valuesA, graph); + } else { + std::cout << "Randomly generating matrix\n"; + size_type nnzUnused = m * params.nnzPerRow; + A = KokkosSparse::Impl::kk_generate_sparse_matrix( + m, n, nnzUnused, 0, (n + 3) / 3); + } + } + + if (params.verbose) { + std::cout << "Matrix A" << std::endl; + std::cout << " row_map A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.graph.row_map); + std::cout << " entries A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.graph.entries); + std::cout << " values A:" << std::endl; + KokkosKernels::Impl::print_1Dview(A.values); + std::cout << std::endl; + } + + Kokkos::Timer timer; + double handleTime = 0; + double symbolicTime = 0; + double numericTime = 0; + + timer.reset(); + KokkosSparse::Experimental::MDF_handle handle(A); + handle.set_verbosity(0); + handleTime += timer.seconds(); + + for (int sumRep = 0; sumRep < params.repeat; sumRep++) { + timer.reset(); + KokkosSparse::Experimental::mdf_symbolic(A, handle); + Kokkos::fence(); + symbolicTime += timer.seconds(); + + timer.reset(); + KokkosSparse::Experimental::mdf_numeric(A, handle); + Kokkos::fence(); + numericTime += timer.seconds(); + } + + std::cout << "Mean total time: " + << handleTime + (symbolicTime / params.repeat) + + (numericTime / params.repeat) + << std::endl + << "Handle time: " << handleTime << std::endl + << "Mean symbolic time: " << (symbolicTime / 
params.repeat) + << std::endl + << "Mean numeric time: " << (numericTime / params.repeat) + << std::endl; + + if (params.verbose) { + entries_t permutation = handle.get_permutation(); + + std::cout << "MDF permutation:" << std::endl; + KokkosKernels::Impl::print_1Dview(permutation); + } +} // run_experiment + +void print_options() { + std::cerr << "Options\n" << std::endl; + + std::cerr + << "\t[Required] BACKEND: '--threads [numThreads]' | '--openmp " + "[numThreads]' | '--cuda [cudaDeviceIndex]' | '--hip [hipDeviceIndex]'" + " | '--sycl [syclDeviceIndex]'" + << std::endl; + + std::cerr << "\t[Optional] --amtx :: input matrix" << std::endl; + std::cerr << "\t[Optional] --repeat :: how many times to repeat overall " + "MDF" + << std::endl; + std::cerr << "\t[Optional] --verbose :: enable verbose output" + << std::endl; + std::cerr << "\nSettings for randomly generated A matrix" << std::endl; + std::cerr << "\t[Optional] --m :: number of rows to generate" + << std::endl; + std::cerr << "\t[Optional] --n :: number of cols to generate" + << std::endl; + std::cerr + << "\t[Optional] --nnz :: number of entries per row to generate" + << std::endl; + std::cerr << "\t[Optional] --diag :: generate a diagonal matrix" + << std::endl; +} // print_options + +int parse_inputs(Params& params, int argc, char** argv) { + for (int i = 1; i < argc; ++i) { + if (0 == Test::string_compare_no_case(argv[i], "--threads")) { + params.use_threads = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--openmp")) { + params.use_openmp = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--cuda")) { + params.use_cuda = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--hip")) { + params.use_hip = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--sycl")) { + params.use_sycl = atoi(argv[++i]) + 1; + } else if (0 == Test::string_compare_no_case(argv[i], "--amtx")) { + params.amtx = argv[++i]; + 
} else if (0 == Test::string_compare_no_case(argv[i], "--m")) { + params.m = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--n")) { + params.n = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--nnz")) { + params.nnzPerRow = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--diag")) { + params.diag = true; + } else if (0 == Test::string_compare_no_case(argv[i], "--repeat")) { + params.repeat = atoi(argv[++i]); + } else if (0 == Test::string_compare_no_case(argv[i], "--verbose")) { + params.verbose = true; + } else { + std::cerr << "Unrecognized command line argument #" << i << ": " + << argv[i] << std::endl; + print_options(); + return 1; + } + } + return 0; +} // parse_inputs + +int main(int argc, char** argv) { + Params params; + + if (parse_inputs(params, argc, argv)) { + return 1; + } + const int num_threads = + std::max(params.use_openmp, + params.use_threads); // Assumption is that use_openmp variable + // is provided as number of threads + + // If cuda, hip or sycl is used, set device_id + int device_id = 0; + if (params.use_cuda > 0) { + device_id = params.use_cuda - 1; + } + if (params.use_hip > 0) { + device_id = params.use_hip - 1; + } + if (params.use_sycl > 0) { + device_id = params.use_sycl - 1; + } + + Kokkos::initialize(Kokkos::InitializationSettings() + .set_num_threads(num_threads) + .set_device_id(device_id)); + + bool useOMP = params.use_openmp != 0; + bool useThreads = params.use_threads != 0; + bool useCUDA = params.use_cuda != 0; + bool useHIP = params.use_hip != 0; + bool useSYCL = params.use_sycl != 0; + bool useSerial = !useOMP && !useThreads && !useCUDA && !useHIP && !useSYCL; // Serial is the fallback only when no other backend was requested + + if (useOMP) { +#if defined(KOKKOS_ENABLE_OPENMP) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: OpenMP requested, but not available.\n"; + return 1; +#endif + } + if (useThreads) { +#if defined(KOKKOS_ENABLE_THREADS) + using crsMat_t = + 
KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: Threads requested, but not available.\n"; + return 1; +#endif + } + if (useCUDA) { +#if defined(KOKKOS_ENABLE_CUDA) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: CUDA requested, but not available.\n"; + return 1; +#endif + } + if (useHIP) { +#if defined(KOKKOS_ENABLE_HIP) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: HIP requested, but not available.\n"; + return 1; +#endif + } + if (useSYCL) { +#if defined(KOKKOS_ENABLE_SYCL) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: SYCL requested, but not available.\n"; + return 1; +#endif + } + if (useSerial) { +#if defined(KOKKOS_ENABLE_SERIAL) + using crsMat_t = + KokkosSparse::CrsMatrix; + run_experiment(params); +#else + std::cout << "ERROR: Serial device requested, but not available.\n"; + return 1; +#endif + } + Kokkos::finalize(); + return 0; +} // main diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index feee2d765b..b8a25485f5 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -155,12 +155,133 @@ struct MDF_discarded_fill_norm { A.graph.row_map(rowIdx) - 1); KOKKOS_IMPL_DO_NOT_USE_PRINTF( "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - rowIdx, KAS::sqrt(discard_norm), deficiency(rowIdx), degree); + static_cast(rowIdx), + static_cast(KAS::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); } } }; // MDF_discarded_fill_norm +template +struct MDF_selective_discarded_fill_norm { + using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; + using col_ind_type = + typename static_crs_graph_type::entries_type::non_const_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using size_type = 
typename crs_matrix_type::size_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using scalar_type = typename crs_matrix_type::value_type; + using KAS = typename Kokkos::ArithTraits; + + const scalar_type zero = KAS::zero(); + + crs_matrix_type A, At; + ordinal_type factorization_step; + col_ind_type permutation; + col_ind_type update_list; + + values_type discarded_fill; + col_ind_type deficiency; + int verbosity; + + MDF_selective_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, + ordinal_type factorization_step_, + col_ind_type permutation_, + col_ind_type update_list_, + values_type discarded_fill_, + col_ind_type deficiency_, int verbosity_) + : A(A_), + At(At_), + factorization_step(factorization_step_), + permutation(permutation_), + update_list(update_list_), + discarded_fill(discarded_fill_), + deficiency(deficiency_), + verbosity(verbosity_){}; + + KOKKOS_INLINE_FUNCTION + void operator()(const ordinal_type i) const { + ordinal_type rowIdx = permutation(update_list(i)); + scalar_type discard_norm = zero, diag_val = zero; + bool entryIsDiscarded = true; + ordinal_type numFillEntries = 0; + for (size_type alphaIdx = At.graph.row_map(rowIdx); + alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { + ordinal_type fillRowIdx = At.graph.entries(alphaIdx); + bool row_not_eliminated = true; + for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { + if (fillRowIdx == permutation(stepIdx)) { + row_not_eliminated = false; + } + } + + if (fillRowIdx != rowIdx && row_not_eliminated) { + for (size_type betaIdx = A.graph.row_map(rowIdx); + betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { + ordinal_type fillColIdx = A.graph.entries(betaIdx); + bool col_not_eliminated = true; + for (ordinal_type stepIdx = 0; stepIdx < factorization_step; + ++stepIdx) { + if (fillColIdx == permutation(stepIdx)) { + col_not_eliminated = false; + } + } + + if (fillColIdx != rowIdx && col_not_eliminated) { + entryIsDiscarded = true; + 
for (size_type entryIdx = A.graph.row_map(fillRowIdx); + entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { + if (A.graph.entries(entryIdx) == fillColIdx) { + entryIsDiscarded = false; + } + } + if (entryIsDiscarded) { + numFillEntries += 1; + discard_norm += + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); + if (verbosity > 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Adding value A[%d,%d]=%f to discard norm of row %d\n", + static_cast(At.graph.entries(alphaIdx)), + static_cast(A.graph.entries(betaIdx)), + static_cast( + KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * + KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), + static_cast(rowIdx)); + } + } + } + } + } else if (fillRowIdx == rowIdx) { + diag_val = At.values(alphaIdx); + if (verbosity > 1) { + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d diagonal value detected, values(%d)=%f\n", + static_cast(rowIdx), static_cast(alphaIdx), + static_cast(At.values(alphaIdx))); + } + } + } + + // TODO add a check on `diag_val == zero` + discard_norm = discard_norm / (diag_val * diag_val); + discarded_fill(rowIdx) = discard_norm; + deficiency(rowIdx) = numFillEntries; + if (verbosity > 0) { + const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - + A.graph.row_map(rowIdx) - 1); + KOKKOS_IMPL_DO_NOT_USE_PRINTF( + "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", + static_cast(rowIdx), + static_cast(KAS::sqrt(discard_norm)), + static_cast(deficiency(rowIdx)), static_cast(degree)); + } + } + +}; // MDF_selective_discarded_fill_norm + template struct MDF_select_row { using values_type = typename crs_matrix_type::values_type::non_const_type; @@ -294,6 +415,8 @@ struct MDF_factorize_row { values_type valuesU; col_ind_type permutation, permutation_inv; + values_type discarded_fill; + col_ind_type factored; ordinal_type selected_row_idx, factorization_step; int verbosity; @@ -303,6 +426,7 @@ struct MDF_factorize_row { values_type 
valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, col_ind_type permutation_inv_, + values_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, ordinal_type factorization_step_, int verbosity_) : A(A_), @@ -315,6 +439,8 @@ struct MDF_factorize_row { valuesU(valuesU_), permutation(permutation_), permutation_inv(permutation_inv_), + discarded_fill(discarded_fill_), + factored(factored_), selected_row_idx(selected_row_idx_), factorization_step(factorization_step_), verbosity(verbosity_){}; @@ -322,6 +448,7 @@ struct MDF_factorize_row { KOKKOS_INLINE_FUNCTION void operator()(const ordinal_type /* idx */) const { const ordinal_type selected_row = permutation(selected_row_idx); + discarded_fill(selected_row) = Kokkos::ArithTraits::max(); // Swap entries in permutation vectors permutation(selected_row_idx) = permutation(factorization_step); @@ -332,7 +459,8 @@ struct MDF_factorize_row { if (verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", permutation(rowIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(permutation(rowIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -356,23 +484,27 @@ struct MDF_factorize_row { if (verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", - selected_row, diag); + static_cast(selected_row), + static_cast(diag)); } if (verbosity > 2) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", int(row_mapU(rowIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(row_mapU(rowIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); for (size_type entryIdx = row_mapU(0); entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", 
int(entriesU(entryIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesU(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); for (size_type entryIdx = row_mapU(0); entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", valuesU(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesU(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -397,17 +529,21 @@ struct MDF_factorize_row { if (verbosity > 2) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", - int(factorization_step), int(factorization_step), - int(factorization_step + 1), int(row_mapL(factorization_step)), - int(row_mapL(factorization_step + 1))); + static_cast(factorization_step), + static_cast(factorization_step), + static_cast(factorization_step + 1), + static_cast(row_mapL(factorization_step)), + static_cast(row_mapL(factorization_step + 1))); for (size_type entryIdx = row_mapL(factorization_step); entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", int(entriesL(entryIdx))); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", + static_cast(entriesL(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); for (size_type entryIdx = row_mapL(factorization_step); entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", valuesL(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(valuesL(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -466,8 +602,10 @@ struct MDF_factorize_row { if (verbosity > 1) { KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "A[%d, %d] -= %f\n", int(fillRowIdx), int(fillColIdx), - At.values(alphaIdx) * A.values(betaIdx) / diag_val); + "A[%d, %d] -= %f\n", static_cast(fillRowIdx), + static_cast(fillColIdx), + static_cast(At.values(alphaIdx) * + A.values(betaIdx) / diag_val)); } } } @@ -484,15 +622,19 @@ struct MDF_factorize_row { } } + factored(selected_row) = 1; + if 
(verbosity > 0) { KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", A.values(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(A.values(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", At.values(entryIdx)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", + static_cast(At.values(entryIdx))); } KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); } @@ -500,6 +642,70 @@ struct MDF_factorize_row { }; // MDF_factorize_row +template +struct MDF_compute_list_length { + using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: + entries_type::non_const_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; + + ordinal_type selected_row_idx; + crs_matrix_type A; + crs_matrix_type At; + col_ind_type permutation; + col_ind_type factored; + col_ind_type update_list_length; + col_ind_type update_list; + + MDF_compute_list_length(const ordinal_type rowIdx_, const crs_matrix_type& A_, + const crs_matrix_type& At_, + const col_ind_type& permutation_, + const col_ind_type factored_, + col_ind_type& update_list_length_, + col_ind_type& update_list_) + : selected_row_idx(rowIdx_), + A(A_), + At(At_), + permutation(permutation_), + factored(factored_), + update_list_length(update_list_length_), + update_list(update_list_) {} + + KOKKOS_INLINE_FUNCTION + void operator()(const size_type /*idx*/) const { + const ordinal_type selected_row = permutation(selected_row_idx); + + size_type updateIdx = 0; + for (size_type entryIdx = A.graph.row_map(selected_row); + entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { + if ((A.graph.entries(entryIdx) != selected_row) && + (factored(A.graph.entries(entryIdx)) != 1)) { + update_list(updateIdx) = 
A.graph.entries(entryIdx); + ++updateIdx; + } + } + size_type update_rows = updateIdx; + for (size_type entryIdx = At.graph.row_map(selected_row); + entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { + if ((At.graph.entries(entryIdx) != selected_row) && + (factored(At.graph.entries(entryIdx)) != 1)) { // entryIdx indexes At in this loop, so the neighbor is read from At + bool already_updated = false; + for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { + if (At.graph.entries(entryIdx) == update_list(checkIdx)) { + already_updated = true; + break; + } + } + if (already_updated == false) { + update_list(updateIdx) = At.graph.entries(entryIdx); + ++updateIdx; + } + } + } + update_list_length(0) = updateIdx; + } +}; + template struct MDF_reindex_matrix { col_ind_type permutation_inv; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 33229b6cdb..90fa3beeef 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -34,7 +34,7 @@ namespace KokkosSparse { namespace Experimental { template -void mdf_symbolic_phase(crs_matrix_type& A, MDF_handle& handle) { +void mdf_symbolic(crs_matrix_type& A, MDF_handle& handle) { using size_type = typename crs_matrix_type::size_type; using ordinal_type = typename crs_matrix_type::ordinal_type; @@ -60,10 +60,10 @@ void mdf_symbolic_phase(crs_matrix_type& A, MDF_handle& handle) { } return; -} // mdf_symbolic_phase +} // mdf_symbolic template -void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { +void mdf_numeric(crs_matrix_type& A, MDF_handle& handle) { using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; using values_type = typename crs_matrix_type::values_type::non_const_type; @@ -78,13 +78,26 @@ void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { // compute discarded fill of each row // selected pivot based on MDF // factorize pivot row of A - crs_matrix_type Atmp = crs_matrix_type("A fill", A); + const int verbosity_level = handle.verbosity; + 
crs_matrix_type Atmp = crs_matrix_type("A fill", A); crs_matrix_type At = KokkosSparse::Impl::transpose_matrix(A); KokkosSparse::sort_crs_matrix(At); values_type discarded_fill("discarded fill", A.numRows()); col_ind_type deficiency("deficiency", A.numRows()); + col_ind_type update_list_length("update list length", 1); + typename col_ind_type::HostMirror update_list_length_host = + Kokkos::create_mirror_view(update_list_length); + col_ind_type update_list("update list", A.numRows()); + col_ind_type factored("factored rows", A.numRows()); + Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); + Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); + + KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( + Atmp, At, 0, handle.permutation, discarded_fill, deficiency, + verbosity_level); + Kokkos::parallel_for("MDF: initial fill computation", + range_policy_type(0, Atmp.numRows()), MDF_df_norm); - const int verbosity_level = handle.verbosity; for (ordinal_type factorization_step = 0; factorization_step < A.numRows(); ++factorization_step) { if (verbosity_level > 0) { @@ -92,44 +105,58 @@ void mdf_numeric_phase(crs_matrix_type& A, MDF_handle& handle) { static_cast(factorization_step)); } - range_policy_type stepPolicy(factorization_step, Atmp.numRows()); - Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); - Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); - KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( - Atmp, At, factorization_step, handle.permutation, discarded_fill, - deficiency, verbosity_level); - Kokkos::parallel_for(stepPolicy, MDF_df_norm); + Kokkos::deep_copy(update_list_length_host, update_list_length); + range_policy_type updatePolicy(0, update_list_length_host(0)); + KokkosSparse::Impl::MDF_selective_discarded_fill_norm + MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, + update_list, discarded_fill, deficiency, + verbosity_level); + Kokkos::parallel_for("MDF: updating fill norms", 
updatePolicy, + MDF_update_df_norm); + range_policy_type stepPolicy(factorization_step, Atmp.numRows()); ordinal_type selected_row_idx = 0; KokkosSparse::Impl::MDF_select_row MDF_row_selector( factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, handle.permutation); - Kokkos::parallel_reduce(stepPolicy, MDF_row_selector, selected_row_idx); + Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, + selected_row_idx); + + KokkosSparse::Impl::MDF_compute_list_length + compute_list_length(selected_row_idx, Atmp, At, handle.permutation, + factored, update_list_length, update_list); + Kokkos::parallel_for("MDF: compute update list", range_policy_type(0, 1), + compute_list_length); KokkosSparse::Impl::MDF_factorize_row factorize_row( Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, selected_row_idx, factorization_step, - verbosity_level); - Kokkos::parallel_for(range_policy_type(0, 1), factorize_row); + handle.permutation_inv, discarded_fill, factored, selected_row_idx, + factorization_step, verbosity_level); + Kokkos::parallel_for("MDF: factorize row", range_policy_type(0, 1), + factorize_row); if (verbosity_level > 0) { printf("\n"); } - } + } // Loop over factorization steps KokkosSparse::Impl::MDF_reindex_matrix reindex_U( handle.permutation_inv, handle.entriesU); - Kokkos::parallel_for(range_policy_type(0, handle.entriesU.extent(0)), + Kokkos::parallel_for("MDF: re-index U", + range_policy_type(0, handle.entriesU.extent(0)), reindex_U); KokkosSparse::Impl::MDF_reindex_matrix reindex_L( handle.permutation_inv, handle.entriesL); - Kokkos::parallel_for(range_policy_type(0, handle.entriesL.extent(0)), + Kokkos::parallel_for("MDF: re-index L", + range_policy_type(0, handle.entriesL.extent(0)), reindex_L); + handle.L = KokkosSparse::Impl::transpose_matrix(handle.L); + return; -} // mdf_numeric_phase +} // mdf_numeric } // namespace 
Experimental } // namespace KokkosSparse diff --git a/sparse/src/KokkosSparse_mdf_handle.hpp b/sparse/src/KokkosSparse_mdf_handle.hpp index eb44657337..6f6f2658be 100644 --- a/sparse/src/KokkosSparse_mdf_handle.hpp +++ b/sparse/src/KokkosSparse_mdf_handle.hpp @@ -60,6 +60,8 @@ struct MDF_handle { int verbosity; + crs_matrix_type L, U; + MDF_handle(const crs_matrix_type A) : numRows(A.numRows()), permutation(col_ind_type("row permutation", A.numRows())), @@ -74,31 +76,28 @@ struct MDF_handle { entriesL = col_ind_type("entries L", nnzL); valuesL = values_type("values L", nnzL); + L = crs_matrix_type("L", numRows, numRows, nnzL, valuesL, row_mapL, + entriesL); + // Allocate U row_mapU = row_map_type("row map U", numRows + 1); entriesU = col_ind_type("entries U", nnzU); valuesU = values_type("values U", nnzU); + + U = crs_matrix_type("U", numRows, numRows, nnzU, valuesU, row_mapU, + entriesU); } col_ind_type get_permutation() { return permutation; } void sort_factors() { - KokkosSparse::sort_crs_matrix(row_mapL, entriesL, valuesL); - KokkosSparse::sort_crs_matrix(row_mapU, entriesU, valuesU); + KokkosSparse::sort_crs_matrix(L); + KokkosSparse::sort_crs_matrix(U); } - crs_matrix_type getL() { - return KokkosSparse::Impl::transpose_matrix( - crs_matrix_type("L", numRows, numRows, entriesL.extent(0), valuesL, - row_mapL, entriesL)); - } + crs_matrix_type getL() { return L; } - crs_matrix_type getU() { - return crs_matrix_type("U", numRows, numRows, entriesU.extent(0), valuesU, - row_mapU, entriesU); - } + crs_matrix_type getU() { return U; } }; } // namespace Experimental diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index 3fcd827292..41204c9b4d 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -32,6 +32,8 @@ void run_test_mdf() { using values_type = typename crs_matrix_type::values_type::non_const_type; using value_type = typename crs_matrix_type::value_type; + const value_type four 
= static_cast(4.0); + constexpr ordinal_type numRows = 16; constexpr ordinal_type numCols = 16; constexpr size_type numNonZeros = 64; @@ -70,8 +72,8 @@ void run_test_mdf() { KokkosSparse::Experimental::MDF_handle handle(A); handle.set_verbosity(0); - mdf_symbolic_phase(A, handle); - mdf_numeric_phase(A, handle); + KokkosSparse::Experimental::mdf_symbolic(A, handle); + KokkosSparse::Experimental::mdf_numeric(A, handle); col_ind_type permutation = handle.get_permutation(); @@ -83,20 +85,97 @@ void run_test_mdf() { 7, 11, 13, 14, 5, 6, 9, 10}; printf("MDF ordering: { "); for (ordinal_type idx = 0; idx < A.numRows(); ++idx) { - ; printf("%d ", static_cast(permutation_h(idx))); if (permutation_h(idx) != permutation_ref[idx]) { success = false; } } printf("}\n"); - EXPECT_TRUE(success) << "The permutation computed is different from the reference solution!"; + // Check the factors L and U handle.sort_factors(); crs_matrix_type U = handle.getU(); crs_matrix_type L = handle.getL(); + + EXPECT_TRUE(U.numRows() == 16); + EXPECT_TRUE(U.nnz() == 40); + + { + auto row_map_U = Kokkos::create_mirror(U.graph.row_map); + Kokkos::deep_copy(row_map_U, U.graph.row_map); + auto entries_U = Kokkos::create_mirror(U.graph.entries); + Kokkos::deep_copy(entries_U, U.graph.entries); + auto values_U = Kokkos::create_mirror(U.values); + Kokkos::deep_copy(values_U, U.values); + + const size_type row_map_U_ref[17] = {0, 3, 6, 9, 12, 15, 17, 20, 22, + 25, 27, 30, 32, 35, 37, 39, 40}; + const ordinal_type entries_U_ref[40] = { + 0, 4, 6, 1, 5, 8, 2, 7, 10, 3, 9, 11, 4, 5, + 12, 5, 13, 6, 7, 12, 7, 14, 8, 9, 13, 9, 15, 10, + 11, 14, 11, 15, 12, 13, 14, 13, 15, 14, 15, 15}; + + const scalar_type val0 = static_cast(15. 
/ 4.); + const scalar_type val1 = static_cast(val0 - 1 / val0); + const scalar_type val2 = static_cast(4 - 2 / val0); + const scalar_type val3 = + static_cast(4 - 1 / val0 - 1 / val1 - 1 / val2); + const scalar_type val4 = static_cast(4 - 2 / val1 - 2 / val3); + const scalar_type values_U_ref[40] = { + 4, -1, -1, 4, -1, -1, 4, -1, -1, 4, -1, -1, val0, -1, -1, + val1, -1, val0, -1, -1, val1, -1, val0, -1, -1, val1, -1, val0, -1, -1, + val1, -1, val2, -1, -1, val3, -1, val3, -1, val4}; + + for (int idx = 0; idx < 17; ++idx) { + EXPECT_TRUE(row_map_U_ref[idx] == row_map_U(idx)) + << "rowmap_U(" << idx << ") is wrong!"; + } + for (int idx = 0; idx < 40; ++idx) { + EXPECT_TRUE(entries_U_ref[idx] == entries_U(idx)) + << "entries_U(" << idx << ") is wrong!"; + EXPECT_NEAR_KK(values_U_ref[idx], values_U(idx), + 10 * Kokkos::ArithTraits::eps(), + "An entry in U.values is wrong!"); + } + + auto row_map_L = Kokkos::create_mirror(L.graph.row_map); + Kokkos::deep_copy(row_map_L, L.graph.row_map); + auto entries_L = Kokkos::create_mirror(L.graph.entries); + Kokkos::deep_copy(entries_L, L.graph.entries); + auto values_L = Kokkos::create_mirror(L.values); + Kokkos::deep_copy(values_L, L.values); + + const size_type row_map_L_ref[17] = {0, 1, 2, 3, 4, 6, 9, 11, 14, + 16, 19, 21, 24, 27, 31, 35, 40}; + const ordinal_type entries_L_ref[40] = { + 0, 1, 2, 3, 0, 4, 1, 4, 5, 0, 6, 2, 6, 7, + 1, 8, 3, 8, 9, 2, 10, 3, 10, 11, 4, 6, 12, 5, + 8, 12, 13, 7, 10, 12, 14, 9, 11, 13, 14, 15}; + const scalar_type values_L_ref[40] = { + 1, 1, 1, 1, -1 / four, 1, + -1 / four, -1 / val0, 1, -1 / four, 1, -1 / four, + -1 / val0, 1, -1 / four, 1, -1 / four, -1 / val0, + 1, -1 / four, 1, -1 / four, -1 / val0, 1, + -1 / val0, -1 / val0, 1, -1 / val1, -1 / val0, -1 / val2, + 1, -1 / val1, -1 / val0, -1 / val2, 1, -1 / val1, + -1 / val1, -1 / val3, -1 / val3, 1}; + + for (int idx = 0; idx < 17; ++idx) { + EXPECT_TRUE(row_map_L_ref[idx] == row_map_L(idx)) + << "rowmap_L(" << idx << ")=" << row_map_L(idx) 
<< " is wrong!"; + } + for (int idx = 0; idx < 40; ++idx) { + EXPECT_TRUE(entries_L_ref[idx] == entries_L(idx)) + << "entries_L(" << idx << ")=" << entries_L(idx) + << " is wrong, entries_L_ref[" << idx << "]=" << entries_L_ref[idx] + << "!"; + EXPECT_NEAR_KK(values_L_ref[idx], values_L(idx), + 10 * Kokkos::ArithTraits::eps(), + "An entry in L.values is wrong!"); + } + } } } // namespace Test