From 54d372005cf8d30328f6383f2de8b5a827702b82 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 7 Jul 2023 10:14:32 -0600 Subject: [PATCH 1/3] mdf: move most expensive kernels over to hierarchical parallism --- sparse/impl/KokkosSparse_mdf_impl.hpp | 1348 +++++++++++++++++++------ sparse/src/KokkosSparse_mdf.hpp | 91 +- sparse/unit_test/Test_Sparse_mdf.hpp | 187 +++- 3 files changed, 1267 insertions(+), 359 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index d8754e591c..51f3ae98c3 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -18,6 +18,8 @@ #define KOKKOSSPARSE_MDF_IMPL_HPP_ #include +#include "KokkosKernels_Sorting.hpp" +#include "KokkosSparse_findRelOffset.hpp" #include #include "Kokkos_ArithTraits.hpp" @@ -63,7 +65,7 @@ struct MDF_count_lower { }; // MDF_count_lower -template +template struct MDF_discarded_fill_norm { using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = @@ -80,6 +82,7 @@ struct MDF_discarded_fill_norm { crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; + col_ind_type update_list; values_mag_type discarded_fill; col_ind_type deficiency; @@ -89,107 +92,289 @@ struct MDF_discarded_fill_norm { ordinal_type factorization_step_, col_ind_type permutation_, values_mag_type discarded_fill_, - col_ind_type deficiency_, int verbosity_) + col_ind_type deficiency_, int verbosity_, + col_ind_type update_list_ = col_ind_type{}) : A(A_), At(At_), factorization_step(factorization_step_), permutation(permutation_), + update_list(update_list_), discarded_fill(discarded_fill_), deficiency(deficiency_), verbosity(verbosity_){}; - KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type i) const { - ordinal_type rowIdx = permutation(i); - scalar_mag_type discard_norm = KAM::zero(); - scalar_type diag_val = KAS::zero(); - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; - for (size_type alphaIdx = At.graph.row_map(rowIdx); - alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + + struct DiscNormReducer { + using reducer = DiscNormReducer; + struct value_type { + scalar_mag_type discarded_norm; + ordinal_type numFillEntries; + scalar_type diag_val; + }; + using result_view_type = Kokkos::View; + + private: + result_view_type value; + + public: + KOKKOS_INLINE_FUNCTION + DiscNormReducer(value_type& value_) : value(&value_) {} + + KOKKOS_INLINE_FUNCTION + static void join(value_type& dest, const value_type& src) { + dest.discarded_norm += src.discarded_norm; + dest.numFillEntries += src.numFillEntries; + if (dest.diag_val == KAS::zero()) dest.diag_val = src.diag_val; + } - if (fillRowIdx != rowIdx && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(rowIdx); - betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } + KOKKOS_INLINE_FUNCTION + static void init(value_type& val) { + val.discarded_norm = Kokkos::reduction_identity::sum(); + val.numFillEntries = Kokkos::reduction_identity::sum(); + val.diag_val = KAS::zero(); + } - if (fillColIdx != rowIdx && col_not_eliminated) { - entryIsDiscarded = true; - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - entryIsDiscarded = false; - } - } - if (entryIsDiscarded) { - numFillEntries += 1; - discard_norm += - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - int(At.graph.entries(alphaIdx)), - int(A.graph.entries(betaIdx)), - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), - int(rowIdx)); - } - } - } + KOKKOS_INLINE_FUNCTION + static value_type init() { + value_type out; + init(out); + return out; + } + + KOKKOS_INLINE_FUNCTION + value_type& reference() const { return *value.data(); } + + KOKKOS_INLINE_FUNCTION + result_view_type view() const { return value; } + }; + + KOKKOS_INLINE_FUNCTION + void operator()(team_member_t team) const { + const ordinal_type rowIdx = + is_initial_fill ? permutation(team.league_rank()) + : permutation(update_list(team.league_rank())); + const auto colView = At.rowConst(rowIdx); + const auto rowView = A.rowConst(rowIdx); + + using reduction_val_t = typename DiscNormReducer::value_type; + reduction_val_t reduction_val = DiscNormReducer::init(); + Kokkos::parallel_reduce( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, reduction_val_t& running_disc_norm) { + const ordinal_type fillRowIdx = colView.colidx(alpha); + + // Record diagonal term + if (fillRowIdx == rowIdx) { + Kokkos::single(Kokkos::PerThread(team), [&] { + running_disc_norm.diag_val = colView.value(alpha); + }); + return; } - } - } else if (fillRowIdx == rowIdx) { - diag_val = At.values(alphaIdx); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value detected, values(%d)=%f\n", int(rowIdx), - int(alphaIdx), At.values(alphaIdx)); - } else if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value detected, |values(%d)|=%f\n", - int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); + + // Check if row already eliminated + if constexpr (!is_initial_fill) { + bool row_eliminated = false; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, factorization_step), + [&](const ordinal_type stepIdx, bool& running_row_eliminated) { + running_row_eliminated |= fillRowIdx == permutation(stepIdx); + }, + Kokkos::LOr(row_eliminated)); + + if (row_eliminated) return; } - } - } - } - // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / KAS::abs(diag_val * diag_val); - discarded_fill(rowIdx) = discard_norm; - deficiency(rowIdx) = numFillEntries; + const auto fillRowView = A.rowConst(fillRowIdx); + reduction_val_t local_reduction_val = DiscNormReducer::init(); + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, rowView.length), + [&](const ordinal_type beta, + reduction_val_t& vect_running_disc_norm) { + const ordinal_type fillColIdx = rowView.colidx(beta); + + if (fillColIdx == rowIdx) return; + + if constexpr (!is_initial_fill) { + bool col_eliminated = false; + for (ordinal_type stepIdx = 0; stepIdx < factorization_step; + ++stepIdx) { + col_eliminated |= fillColIdx == permutation(stepIdx); + } - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAM::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); - } - } - } + if (col_eliminated) return; + } + bool entryIsDiscarded = true; + for (ordinal_type gamma = 0; gamma < fillRowView.length; + ++gamma) { + if (fillRowView.colidx(gamma) == fillColIdx) { + entryIsDiscarded = false; + } + } + if (entryIsDiscarded) { + vect_running_disc_norm.numFillEntries += 1; + vect_running_disc_norm.discarded_norm += + KAS::abs(colView.value(alpha) * rowView.value(beta)) * + KAS::abs(colView.value(alpha) * rowView.value(beta)); + } + }, + DiscNormReducer(local_reduction_val)); + + Kokkos::single(Kokkos::PerThread(team), [&] { + running_disc_norm.discarded_norm += + local_reduction_val.discarded_norm; + running_disc_norm.numFillEntries += + local_reduction_val.numFillEntries; + }); + }, + DiscNormReducer(reduction_val)); + + Kokkos::single(Kokkos::PerTeam(team), [&] { + const scalar_mag_type& discard_norm = reduction_val.discarded_norm; + const ordinal_type& numFillEntries = reduction_val.numFillEntries; + const scalar_type& diag_val = reduction_val.diag_val; + + // TODO add a check on `diag_val == zero` + discarded_fill(rowIdx) = discard_norm / KAS::abs(diag_val * diag_val); + deficiency(rowIdx) = numFillEntries; + }); + } }; // MDF_discarded_fill_norm +// template +// struct MDF_discarded_fill_norm_old { +// using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; +// using col_ind_type = +// typename static_crs_graph_type::entries_type::non_const_type; +// using values_type = typename +// crs_matrix_type::values_type::non_const_type; using values_mag_type = +// typename MDF_types::values_mag_type; using size_type +// = typename crs_matrix_type::size_type; using ordinal_type = typename +// crs_matrix_type::ordinal_type; using scalar_type = typename +// crs_matrix_type::value_type; using KAS = typename +// Kokkos::ArithTraits; using scalar_mag_type = typename +// KAS::mag_type; using KAM = typename +// Kokkos::ArithTraits; + +// crs_matrix_type A, At; +// ordinal_type factorization_step; +// col_ind_type permutation; + +// values_mag_type discarded_fill; +// col_ind_type deficiency; +// int verbosity; + +// MDF_discarded_fill_norm_old(crs_matrix_type A_, crs_matrix_type At_, +// ordinal_type factorization_step_, +// col_ind_type permutation_, +// values_mag_type discarded_fill_, +// col_ind_type deficiency_, int verbosity_) +// : A(A_), +// At(At_), +// factorization_step(factorization_step_), +// permutation(permutation_), +// discarded_fill(discarded_fill_), +// deficiency(deficiency_), +// verbosity(verbosity_){}; + +// KOKKOS_INLINE_FUNCTION +// void operator()(const ordinal_type i) const { +// ordinal_type rowIdx = permutation(i); +// scalar_mag_type discard_norm = KAM::zero(); +// scalar_type diag_val = KAS::zero(); +// bool entryIsDiscarded = true; +// ordinal_type numFillEntries = 0; +// for (size_type alphaIdx = At.graph.row_map(rowIdx); +// alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { +// ordinal_type fillRowIdx = At.graph.entries(alphaIdx); +// bool row_not_eliminated = true; +// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) +// { +// if (fillRowIdx == permutation(stepIdx)) { +// row_not_eliminated = false; +// } +// } + +// if (fillRowIdx != rowIdx && row_not_eliminated) { +// for (size_type betaIdx = A.graph.row_map(rowIdx); +// betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { +// ordinal_type fillColIdx = A.graph.entries(betaIdx); +// bool col_not_eliminated = true; +// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; +// ++stepIdx) { +// if (fillColIdx == permutation(stepIdx)) { +// col_not_eliminated = false; +// } +// } + +// if (fillColIdx != rowIdx && col_not_eliminated) { +// entryIsDiscarded = true; +// for (size_type entryIdx = A.graph.row_map(fillRowIdx); +// entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { +// if (A.graph.entries(entryIdx) == fillColIdx) { +// entryIsDiscarded = false; +// } +// } +// if (entryIsDiscarded) { +// numFillEntries += 1; +// discard_norm += +// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * +// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); +// if (verbosity > 1) { +// if constexpr (std::is_arithmetic_v) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "Adding value A[%d,%d]=%f to discard norm of row %d\n", +// int(At.graph.entries(alphaIdx)), +// int(A.graph.entries(betaIdx)), +// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * +// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), +// int(rowIdx)); +// } +// } +// } +// } +// } +// } else if (fillRowIdx == rowIdx) { +// diag_val = At.values(alphaIdx); +// if (verbosity > 1) { +// if constexpr (std::is_arithmetic_v) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "Row %d diagonal value detected, values(%d)=%f\n", +// int(rowIdx), int(alphaIdx), At.values(alphaIdx)); +// } else if constexpr (std::is_arithmetic_v) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "Row %d diagonal value detected, |values(%d)|=%f\n", +// int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); +// } +// } +// } +// } + +// // TODO add a check on `diag_val == zero` +// discard_norm = discard_norm / KAS::abs(diag_val * diag_val); +// discarded_fill(rowIdx) = discard_norm; +// deficiency(rowIdx) = numFillEntries; + +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 0) { +// const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) +// - +// A.graph.row_map(rowIdx) - +// 1); +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "Row %d has discarded fill of %f, deficiency of %d and degree +// %d\n", static_cast(rowIdx), +// static_cast(KAM::sqrt(discard_norm)), +// static_cast(deficiency(rowIdx)), static_cast(degree)); +// } +// } +// } + +// }; // MDF_discarded_fill_norm_old + template struct MDF_selective_discarded_fill_norm { using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; @@ -435,6 +620,14 @@ struct MDF_select_row { }; // MDF_select_row +template +KOKKOS_INLINE_FUNCTION bool sorted_view_contains( + const view_type& values, const ordinal_type size, + typename view_type::const_value_type search_val) { + return KokkosSparse::findRelOffset(values, size, search_val, size, true) != + size; +} + template struct MDF_factorize_row { using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -463,8 +656,14 @@ struct MDF_factorize_row { col_ind_type factored; ordinal_type selected_row_idx, factorization_step; + col_ind_type update_list; + int verbosity; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + MDF_factorize_row(crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, @@ -472,7 +671,8 @@ struct MDF_factorize_row { col_ind_type permutation_, col_ind_type permutation_inv_, values_mag_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, - ordinal_type factorization_step_, int verbosity_) + ordinal_type factorization_step_, + col_ind_type& update_list_, int verbosity_) : A(A_), At(At_), row_mapL(row_mapL_), @@ -487,276 +687,782 @@ struct MDF_factorize_row { factored(factored_), selected_row_idx(selected_row_idx_), factorization_step(factorization_step_), + update_list(update_list_), verbosity(verbosity_){}; + // Phase 2, do facrotization KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type /* idx */) const { - const ordinal_type selected_row = permutation(selected_row_idx); - discarded_fill(selected_row) = Kokkos::ArithTraits::max(); - - // Swap entries in permutation vectors - permutation(selected_row_idx) = permutation(factorization_step); - permutation(factorization_step) = selected_row; - permutation_inv(permutation(factorization_step)) = factorization_step; - permutation_inv(permutation(selected_row_idx)) = selected_row_idx; - - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); - for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(permutation(rowIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - - // Insert the upper part of the selected row in U - // including the diagonal term. - value_type diag = Kokkos::ArithTraits::zero(); - size_type U_entryIdx = row_mapU(factorization_step); - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - if (permutation_inv(A.graph.entries(entryIdx)) >= factorization_step) { - entriesU(U_entryIdx) = A.graph.entries(entryIdx); - valuesU(U_entryIdx) = A.values(entryIdx); - ++U_entryIdx; - if (A.graph.entries(entryIdx) == selected_row) { - diag = A.values(entryIdx); - } - } - } - row_mapU(factorization_step + 1) = U_entryIdx; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", - static_cast(selected_row), - static_cast(diag)); - } - - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); - for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; - ++rowIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(row_mapU(rowIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesU(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapU(0); - entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesU(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - - // Insert the lower part of the selected column of A - // divided by its the diagonal value to obtain a unit - // diagonal value in L. - size_type L_entryIdx = row_mapL(factorization_step); - entriesL(L_entryIdx) = selected_row; - valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); - ++L_entryIdx; - for (size_type entryIdx = At.graph.row_map(selected_row); - entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { - if (permutation_inv(At.graph.entries(entryIdx)) > factorization_step) { - entriesL(L_entryIdx) = At.graph.entries(entryIdx); - valuesL(L_entryIdx) = At.values(entryIdx) / diag; - ++L_entryIdx; - } + void operator()(team_member_t team) const { + const auto alpha = team.league_rank(); + const ordinal_type selected_row = permutation(factorization_step); + const auto colView = At.rowConst(selected_row); + + const auto rowInd = colView.colidx(alpha); + if (rowInd == selected_row) return; + + { + bool row_eliminated = false; + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, factorization_step), + [&](const ordinal_type step, bool& partial) { + partial |= rowInd == permutation(step); + }, + Kokkos::LOr(row_eliminated)); + + if (row_eliminated) return; } - row_mapL(factorization_step + 1) = L_entryIdx; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 2) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", - static_cast(factorization_step), - static_cast(factorization_step), - static_cast(factorization_step + 1), - static_cast(row_mapL(factorization_step)), - static_cast(row_mapL(factorization_step + 1))); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", - static_cast(entriesL(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); - for (size_type entryIdx = row_mapL(factorization_step); - entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", - static_cast(valuesL(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - - // If this was the last row no need to update A and At! - if (factorization_step == A.numRows() - 1) { - return; - } - - // Finally we want to update A and At with the values - // that where not discarded during factorization. - // Note: this is almost the same operation as computing - // the norm of the discarded fill... - - // First step: find the diagonal entry in selected_row - value_type diag_val = Kokkos::ArithTraits::zero(); - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - ordinal_type colIdx = A.graph.entries(entryIdx); - if (selected_row == colIdx) { - diag_val = A.values(entryIdx); - } - } + // Only one of the values will match selected so can just sum all contribs + const auto rowView = A.rowConst(selected_row); + value_type diag = Kokkos::ArithTraits::zero(); + Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team, rowView.length), + [&](const size_type ind, value_type& running_diag) { + if (rowView.colidx(ind) == selected_row) + running_diag = rowView.value(ind); + }, + Kokkos::Sum(diag)); // Extract alpha and beta vectors // Then insert alpha*beta/diag_val if the corresponding // entry in A is non-zero. - for (size_type alphaIdx = At.graph.row_map(selected_row); - alphaIdx < At.graph.row_map(selected_row + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } - - if ((fillRowIdx != selected_row) && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(selected_row); - betaIdx < A.graph.row_map(selected_row + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } - - if ((fillColIdx != selected_row) && col_not_eliminated) { - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - A.values(entryIdx) -= - At.values(alphaIdx) * A.values(betaIdx) / diag_val; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 1) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "A[%d, %d] -= %f\n", static_cast(fillRowIdx), - static_cast(fillColIdx), - static_cast(At.values(alphaIdx) * - A.values(betaIdx) / diag_val)); - } - } - } - } - - for (size_type entryIdx = At.graph.row_map(fillColIdx); - entryIdx < At.graph.row_map(fillColIdx + 1); ++entryIdx) { - if (At.graph.entries(entryIdx) == fillRowIdx) { - At.values(entryIdx) -= - At.values(alphaIdx) * A.values(betaIdx) / diag_val; - } - } + auto fillRowView = A.row(rowInd); + Kokkos::parallel_for( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const ordinal_type beta) { + const auto colInd = rowView.colidx(beta); + + if (colInd == selected_row) return; + + { + bool col_eliminated = false; + Kokkos::parallel_reduce( + Kokkos::ThreadVectorRange(team, factorization_step), + [&](const ordinal_type step, bool& partial) { + partial |= colInd == permutation(step); + }, + Kokkos::LOr(col_eliminated)); + + if (col_eliminated) return; } - } - } - } - factored(selected_row) = 1; + const auto subVal = colView.value(alpha) * rowView.value(beta) / diag; - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); - for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "%f ", static_cast(A.values(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); - for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "%f ", static_cast(At.values(entryIdx))); - } - KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); - } - } - } // operator() + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, fillRowView.length), + [&](const ordinal_type gamma) { + if (colInd == fillRowView.colidx(gamma)) { + Kokkos::atomic_sub(&fillRowView.value(gamma), subVal); + } + }); + + auto fillColView = At.row(colInd); + Kokkos::parallel_for( + Kokkos::ThreadVectorRange(team, fillColView.length), + [&](const ordinal_type delt) { + if (rowInd == fillColView.colidx(delt)) { + Kokkos::atomic_sub(&fillColView.value(delt), subVal); + } + }); + }); + } +}; -}; // MDF_factorize_row +// template +// struct MDF_factorize_row_heir_old { +// using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: +// row_map_type::non_const_type; +// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: +// entries_type::non_const_type; +// using values_type = typename +// crs_matrix_type::values_type::non_const_type; using ordinal_type = +// typename crs_matrix_type::ordinal_type; using size_type = typename +// crs_matrix_type::size_type; using value_type = typename +// crs_matrix_type::value_type; using values_mag_type = typename +// MDF_types::values_mag_type; using value_mag_type = +// typename values_mag_type::value_type; + +// crs_matrix_type A, At; + +// row_map_type row_mapL; +// col_ind_type entriesL; +// values_type valuesL; + +// row_map_type row_mapU; +// col_ind_type entriesU; +// values_type valuesU; + +// col_ind_type permutation, permutation_inv; +// values_mag_type discarded_fill; +// col_ind_type factored; +// ordinal_type selected_row_idx, factorization_step; + +// col_ind_type update_list; + +// int verbosity; + +// using execution_space = typename crs_matrix_type::execution_space; +// using team_policy_t = Kokkos::TeamPolicy; +// using team_member_t = typename team_policy_t::member_type; + +// MDF_factorize_row_heir_old(crs_matrix_type A_, crs_matrix_type At_, +// row_map_type row_mapL_, col_ind_type entriesL_, +// values_type valuesL_, row_map_type row_mapU_, +// col_ind_type entriesU_, values_type valuesU_, +// col_ind_type permutation_, col_ind_type permutation_inv_, +// values_mag_type discarded_fill_, col_ind_type factored_, +// ordinal_type selected_row_idx_, +// ordinal_type factorization_step_, col_ind_type& +// update_list_, int verbosity_) +// : A(A_), +// At(At_), +// row_mapL(row_mapL_), +// entriesL(entriesL_), +// valuesL(valuesL_), +// row_mapU(row_mapU_), +// entriesU(entriesU_), +// valuesU(valuesU_), +// permutation(permutation_), +// permutation_inv(permutation_inv_), +// discarded_fill(discarded_fill_), +// factored(factored_), +// selected_row_idx(selected_row_idx_), +// factorization_step(factorization_step_), +// update_list(update_list_), +// verbosity(verbosity_){}; + +// //Phase 2, do facrotization +// KOKKOS_INLINE_FUNCTION +// void operator()(team_member_t team) const{ +// const ordinal_type selected_row = permutation(factorization_step); +// const auto rowView = A.rowConst(selected_row); +// const auto colView = At.rowConst(selected_row); + +// // If this was the last row no need to update A and At! +// if (factorization_step == A.numRows() - 1) { +// return; +// } + +// // Only one of the values will match selected so can just sum all +// contribs value_type diag = Kokkos::ArithTraits::zero(); +// Kokkos::parallel_reduce( +// Kokkos::TeamVectorRange(team,rowView.length), +// [&](const size_type alpha,value_type & running_diag){ +// if (rowView.colidx(alpha) == selected_row) +// running_diag = rowView.value(alpha); +// }, +// Kokkos::Sum(diag) +// ); + +// // Extract alpha and beta vectors +// // Then insert alpha*beta/diag_val if the corresponding +// // entry in A is non-zero. +// Kokkos::parallel_for( +// Kokkos::TeamThreadRange(team,colView.length), +// [&](const ordinal_type alpha){ +// const auto rowInd = colView.colidx(alpha); +// auto fillRowView = A.row(rowInd); + +// if (rowInd == selected_row) return; + +// bool row_eliminated = false; +// Kokkos::parallel_reduce( +// Kokkos::ThreadVectorRange(team,factorization_step), +// [&](const ordinal_type step, bool & partial){ +// partial |= rowInd == permutation(step); +// }, +// Kokkos::LOr(row_eliminated) +// ); + +// if (row_eliminated) return; + +// Kokkos::parallel_for( +// Kokkos::ThreadVectorRange(team,rowView.length), +// [&](const ordinal_type beta){ +// const auto colInd = rowView.colidx(beta); + +// if (colInd == selected_row) return; + +// bool col_eliminated = false; +// for (ordinal_type step = 0; step < factorization_step; ++step){ +// col_eliminated |= colInd == permutation(step); +// } + +// if (col_eliminated) return; + +// const auto subVal = colView.colidx(alpha) * rowView.colidx(beta) +// / diag; for (ordinal_type gamma = 0; gamma < fillRowView.length; +// ++gamma){ +// if (colInd == fillRowView.colidx(gamma)){ +// Kokkos::atomic_sub( +// &fillRowView.value(gamma), +// subVal +// ); +// } +// } +// auto fillColView = At.row(colInd); +// for (ordinal_type delt = 0; delt < fillColView.length; ++delt){ +// if (rowInd == fillColView.colidx(delt)){ +// Kokkos::atomic_sub( +// &fillColView.value(delt), +// subVal +// ); +// } +// } +// }); +// } +// ); +// } +// }; template struct MDF_compute_list_length { + using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: + row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: entries_type::non_const_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using size_type = typename crs_matrix_type::size_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using ordinal_type = typename crs_matrix_type::ordinal_type; + using size_type = typename crs_matrix_type::size_type; + using value_type = typename crs_matrix_type::value_type; + using values_mag_type = typename MDF_types::values_mag_type; + using value_mag_type = typename values_mag_type::value_type; - ordinal_type selected_row_idx; - crs_matrix_type A; - crs_matrix_type At; - col_ind_type permutation; + crs_matrix_type A, At; + + row_map_type row_mapL; + col_ind_type entriesL; + values_type valuesL; + + row_map_type row_mapU; + col_ind_type entriesU; + values_type valuesU; + + col_ind_type permutation, permutation_inv; + values_mag_type discarded_fill; col_ind_type factored; - col_ind_type update_list_length; + ordinal_type selected_row_idx, factorization_step; + col_ind_type update_list; - MDF_compute_list_length(const ordinal_type rowIdx_, const crs_matrix_type& A_, - const crs_matrix_type& At_, - const col_ind_type& permutation_, - const col_ind_type factored_, - col_ind_type& update_list_length_, - col_ind_type& update_list_) - : selected_row_idx(rowIdx_), - A(A_), + int verbosity; + + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + + MDF_compute_list_length( + crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, + col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, + col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, + col_ind_type permutation_inv_, values_mag_type discarded_fill_, + col_ind_type factored_, ordinal_type selected_row_idx_, + ordinal_type factorization_step_, col_ind_type& update_list_, + int verbosity_) + : A(A_), At(At_), + row_mapL(row_mapL_), + entriesL(entriesL_), + valuesL(valuesL_), + row_mapU(row_mapU_), + entriesU(entriesU_), + valuesU(valuesU_), permutation(permutation_), + permutation_inv(permutation_inv_), + discarded_fill(discarded_fill_), factored(factored_), - update_list_length(update_list_length_), - update_list(update_list_) {} + selected_row_idx(selected_row_idx_), + factorization_step(factorization_step_), + update_list(update_list_), + verbosity(verbosity_){}; + // Phase 1, update list length KOKKOS_INLINE_FUNCTION - void operator()(const size_type /*idx*/) const { + void operator()(const team_member_t team, ordinal_type& update_list_len, + ordinal_type& selected_row_len) const { const ordinal_type selected_row = permutation(selected_row_idx); - size_type updateIdx = 0; - for (size_type entryIdx = A.graph.row_map(selected_row); - entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { - if ((A.graph.entries(entryIdx) != selected_row) && - (factored(A.graph.entries(entryIdx)) != 1)) { - update_list(updateIdx) = A.graph.entries(entryIdx); - ++updateIdx; - } + const auto rowView = A.rowConst(selected_row); + const auto colView = At.rowConst(selected_row); + + size_type U_entryIdx = row_mapU(factorization_step); + size_type L_entryIdx = row_mapL(factorization_step); + + Kokkos::single(Kokkos::PerTeam(team), [&] { + discarded_fill(selected_row) = Kokkos::ArithTraits::max(); + + // Swap entries in permutation vectors + permutation(selected_row_idx) = permutation(factorization_step); + permutation(factorization_step) = selected_row; + permutation_inv(permutation(factorization_step)) = factorization_step; + permutation_inv(permutation(selected_row_idx)) = selected_row_idx; + + // Diagonal value of L + entriesL(L_entryIdx) = selected_row; + valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); + }); + ++L_entryIdx; + + // Insert the upper part of the selected row in U + // including the diagonal term. + ordinal_type updateIdx = 0; + value_type diag = Kokkos::ArithTraits::zero(); + { + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const size_type alpha, ordinal_type& running_update, + bool is_final) { + const auto colInd = rowView.colidx(alpha); + if ((colInd != selected_row) && (factored(colInd) != 1)) { + if (is_final) { + update_list(running_update) = colInd; + ++updateIdx; + } + ++running_update; + } + } + // ,updateIdx + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(updateIdx)); + + // Sort update list + KokkosKernels::TeamBitonicSort(&update_list(0), updateIdx, team); } - size_type update_rows = updateIdx; - for (size_type entryIdx = At.graph.row_map(selected_row); - entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { - if ((At.graph.entries(entryIdx) != selected_row) && - (factored(A.graph.entries(entryIdx)) != 1)) { - bool already_updated = false; - for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { - if (At.graph.entries(entryIdx) == update_list(checkIdx)) { - already_updated = true; - break; + { + size_type numEntrU = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, rowView.length), + [&](const size_type alpha, size_type& running_nEntr, bool is_final) { + const auto colInd = rowView.colidx(alpha); + if (permutation_inv(colInd) >= factorization_step) { + if (is_final) { + ++numEntrU; + entriesU(U_entryIdx + running_nEntr) = colInd; + valuesU(U_entryIdx + running_nEntr) = rowView.value(alpha); + if (colInd == selected_row) diag = rowView.value(alpha); + } + ++running_nEntr; + } } - } - if (already_updated == false) { - update_list(updateIdx) = At.graph.entries(entryIdx); - ++updateIdx; - } - } + // , numEntrU + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numEntrU)); + + U_entryIdx += numEntrU; } - update_list_length(0) = updateIdx; + + // Only one thread found diagonal so just sum over all + team.team_reduce(Kokkos::Sum(diag)); + + // Insert the lower part of the selected column of A + // divided by its the diagonal value to obtain a unit + // diagonal value in L. + { + size_type numEntrL = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, size_type& running_nEntr, bool is_final) { + const auto rowInd = colView.colidx(alpha); + if (permutation_inv(rowInd) > factorization_step) { + if (is_final) { + ++numEntrL; + entriesL(L_entryIdx + running_nEntr) = rowInd; + valuesL(L_entryIdx + running_nEntr) = + colView.value(alpha) / diag; + } + ++running_nEntr; + } + } + // , numEntrL + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numEntrL)); + + L_entryIdx += numEntrL; + } + { + ordinal_type numUpdateL = 0; + Kokkos::parallel_scan( + Kokkos::TeamThreadRange(team, colView.length), + [&](const size_type alpha, ordinal_type& running_update, + bool is_final) { + const auto rowInd = colView.colidx(alpha); + if ((rowInd != selected_row) && (factored(rowInd) != 1)) { + // updateIdx currently holds the rows that were updated. don't add + // duplicates + const size_type& update_rows = updateIdx; + + const bool already_updated = + sorted_view_contains(update_list, update_rows, rowInd); + + if (!already_updated) { + // Cannot make use of vector ranges until + // https://github.com/kokkos/kokkos/issues/6259 is resolved + // Kokkos::single(Kokkos::PerThread(team),[&]{ + if (is_final) { + update_list(updateIdx + running_update) = rowInd; + ++numUpdateL; + } + ++running_update; + // }); + } + } + } + // , numUpdateL + ); + + // Until https://github.com/kokkos/kokkos/issues/6259 is resolved, do + // reduction outside of parallel_scan + team.team_reduce(Kokkos::Sum(numUpdateL)); + + updateIdx += numUpdateL; + } + + Kokkos::single(Kokkos::PerTeam(team), [&] { + row_mapU(factorization_step + 1) = U_entryIdx; + row_mapL(factorization_step + 1) = L_entryIdx; + + update_list_len = updateIdx; + selected_row_len = rowView.length; + + factored(selected_row) = 1; + }); } }; +// template +// struct MDF_factorize_row_old { +// using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: +// row_map_type::non_const_type; +// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: +// entries_type::non_const_type; +// using values_type = typename +// crs_matrix_type::values_type::non_const_type; using ordinal_type = +// typename crs_matrix_type::ordinal_type; using size_type = typename +// crs_matrix_type::size_type; using value_type = typename +// crs_matrix_type::value_type; using values_mag_type = typename +// MDF_types::values_mag_type; using value_mag_type = +// typename values_mag_type::value_type; + +// crs_matrix_type A, At; + +// row_map_type row_mapL; +// col_ind_type entriesL; +// values_type valuesL; + +// row_map_type row_mapU; +// col_ind_type entriesU; +// values_type valuesU; + +// col_ind_type permutation, permutation_inv; +// values_mag_type discarded_fill; +// col_ind_type factored; +// ordinal_type selected_row_idx, factorization_step; + +// int verbosity; + +// MDF_factorize_row_old(crs_matrix_type A_, crs_matrix_type At_, +// row_map_type row_mapL_, col_ind_type entriesL_, +// values_type valuesL_, row_map_type row_mapU_, +// col_ind_type entriesU_, values_type valuesU_, +// col_ind_type permutation_, col_ind_type permutation_inv_, +// values_mag_type discarded_fill_, col_ind_type factored_, +// ordinal_type selected_row_idx_, +// ordinal_type factorization_step_, int verbosity_) +// : A(A_), +// At(At_), +// row_mapL(row_mapL_), +// entriesL(entriesL_), +// valuesL(valuesL_), +// row_mapU(row_mapU_), +// entriesU(entriesU_), +// valuesU(valuesU_), +// permutation(permutation_), +// permutation_inv(permutation_inv_), +// discarded_fill(discarded_fill_), +// factored(factored_), +// selected_row_idx(selected_row_idx_), +// factorization_step(factorization_step_), +// verbosity(verbosity_){}; + +// KOKKOS_INLINE_FUNCTION +// void operator()(const ordinal_type /* idx */) const { +// const ordinal_type selected_row = permutation(selected_row_idx); +// discarded_fill(selected_row) = +// Kokkos::ArithTraits::max(); + +// // Swap entries in permutation vectors +// permutation(selected_row_idx) = permutation(factorization_step); +// permutation(factorization_step) = selected_row; +// permutation_inv(permutation(factorization_step)) = factorization_step; +// permutation_inv(permutation(selected_row_idx)) = selected_row_idx; + +// if (verbosity > 0) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); +// for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", +// static_cast(permutation(rowIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// } + +// // Insert the upper part of the selected row in U +// // including the diagonal term. +// value_type diag = Kokkos::ArithTraits::zero(); +// size_type U_entryIdx = row_mapU(factorization_step); +// for (size_type entryIdx = A.graph.row_map(selected_row); +// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { +// if (permutation_inv(A.graph.entries(entryIdx)) >= factorization_step) { +// entriesU(U_entryIdx) = A.graph.entries(entryIdx); +// valuesU(U_entryIdx) = A.values(entryIdx); +// ++U_entryIdx; +// if (A.graph.entries(entryIdx) == selected_row) { +// diag = A.values(entryIdx); +// } +// } +// } +// row_mapU(factorization_step + 1) = U_entryIdx; +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 0) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", +// static_cast(selected_row), +// static_cast(diag)); +// } + +// if (verbosity > 2) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); +// for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; +// ++rowIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", +// static_cast(row_mapU(rowIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); +// for (size_type entryIdx = row_mapU(0); +// entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", +// static_cast(entriesU(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); +// for (size_type entryIdx = row_mapU(0); +// entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", +// static_cast(valuesU(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// } +// } + +// // Insert the lower part of the selected column of A +// // divided by its the diagonal value to obtain a unit +// // diagonal value in L. +// size_type L_entryIdx = row_mapL(factorization_step); +// entriesL(L_entryIdx) = selected_row; +// valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); +// ++L_entryIdx; +// for (size_type entryIdx = At.graph.row_map(selected_row); +// entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { +// if (permutation_inv(At.graph.entries(entryIdx)) > factorization_step) { +// entriesL(L_entryIdx) = At.graph.entries(entryIdx); +// valuesL(L_entryIdx) = At.values(entryIdx) / diag; +// ++L_entryIdx; +// } +// } +// row_mapL(factorization_step + 1) = L_entryIdx; + +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 2) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", +// static_cast(factorization_step), +// static_cast(factorization_step), +// static_cast(factorization_step + 1), +// static_cast(row_mapL(factorization_step)), +// static_cast(row_mapL(factorization_step + 1))); +// for (size_type entryIdx = row_mapL(factorization_step); +// entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", +// static_cast(entriesL(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); +// for (size_type entryIdx = row_mapL(factorization_step); +// entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", +// static_cast(valuesL(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// } +// } + +// // If this was the last row no need to update A and At! +// if (factorization_step == A.numRows() - 1) { +// return; +// } + +// // Finally we want to update A and At with the values +// // that where not discarded during factorization. +// // Note: this is almost the same operation as computing +// // the norm of the discarded fill... + +// // First step: find the diagonal entry in selected_row +// value_type diag_val = Kokkos::ArithTraits::zero(); +// for (size_type entryIdx = A.graph.row_map(selected_row); +// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { +// ordinal_type colIdx = A.graph.entries(entryIdx); +// if (selected_row == colIdx) { +// diag_val = A.values(entryIdx); +// } +// } + +// // Extract alpha and beta vectors +// // Then insert alpha*beta/diag_val if the corresponding +// // entry in A is non-zero. +// for (size_type alphaIdx = At.graph.row_map(selected_row); +// alphaIdx < At.graph.row_map(selected_row + 1); ++alphaIdx) { +// ordinal_type fillRowIdx = At.graph.entries(alphaIdx); +// bool row_not_eliminated = true; +// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) +// { +// if (fillRowIdx == permutation(stepIdx)) { +// row_not_eliminated = false; +// } +// } + +// if ((fillRowIdx != selected_row) && row_not_eliminated) { +// for (size_type betaIdx = A.graph.row_map(selected_row); +// betaIdx < A.graph.row_map(selected_row + 1); ++betaIdx) { +// ordinal_type fillColIdx = A.graph.entries(betaIdx); +// bool col_not_eliminated = true; +// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; +// ++stepIdx) { +// if (fillColIdx == permutation(stepIdx)) { +// col_not_eliminated = false; +// } +// } + +// if ((fillColIdx != selected_row) && col_not_eliminated) { +// for (size_type entryIdx = A.graph.row_map(fillRowIdx); +// entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { +// if (A.graph.entries(entryIdx) == fillColIdx) { +// A.values(entryIdx) -= +// At.values(alphaIdx) * A.values(betaIdx) / diag_val; +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 1) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "A[%d, %d] -= %f\n", static_cast(fillRowIdx), +// static_cast(fillColIdx), +// static_cast(At.values(alphaIdx) * +// A.values(betaIdx) / diag_val)); +// } +// } +// } +// } + +// for (size_type entryIdx = At.graph.row_map(fillColIdx); +// entryIdx < At.graph.row_map(fillColIdx + 1); ++entryIdx) { +// if (At.graph.entries(entryIdx) == fillRowIdx) { +// At.values(entryIdx) -= +// At.values(alphaIdx) * A.values(betaIdx) / diag_val; +// } +// } +// } +// } +// } +// } + +// factored(selected_row) = 1; + +// if constexpr (std::is_arithmetic_v) { +// if (verbosity > 0) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); +// for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "%f ", static_cast(A.values(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); +// for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { +// KOKKOS_IMPL_DO_NOT_USE_PRINTF( +// "%f ", static_cast(At.values(entryIdx))); +// } +// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); +// } +// } +// } // operator() + +// }; // MDF_factorize_row_old + +// template +// struct MDF_compute_list_length_old { +// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: +// entries_type::non_const_type; +// using ordinal_type = typename crs_matrix_type::ordinal_type; +// using size_type = typename crs_matrix_type::size_type; + +// ordinal_type selected_row_idx; +// crs_matrix_type A; +// crs_matrix_type At; +// col_ind_type permutation; +// col_ind_type factored; +// col_ind_type update_list_length; +// col_ind_type update_list; + +// MDF_compute_list_length_old(const ordinal_type rowIdx_, const +// crs_matrix_type& A_, +// const crs_matrix_type& At_, +// const col_ind_type& permutation_, +// const col_ind_type factored_, +// col_ind_type& update_list_length_, +// col_ind_type& update_list_) +// : selected_row_idx(rowIdx_), +// A(A_), +// At(At_), +// permutation(permutation_), +// factored(factored_), +// update_list_length(update_list_length_), +// update_list(update_list_) {} + +// KOKKOS_INLINE_FUNCTION +// void operator()(const size_type /*idx*/) const { +// const ordinal_type selected_row = permutation(selected_row_idx); + +// size_type updateIdx = 0; +// for (size_type entryIdx = A.graph.row_map(selected_row); +// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { +// if ((A.graph.entries(entryIdx) != selected_row) && +// (factored(A.graph.entries(entryIdx)) != 1)) { +// update_list(updateIdx) = A.graph.entries(entryIdx); +// ++updateIdx; +// } +// } +// size_type update_rows = updateIdx; +// for (size_type entryIdx = At.graph.row_map(selected_row); +// entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { +// if ((At.graph.entries(entryIdx) != selected_row) && +// (factored(A.graph.entries(entryIdx)) != 1)) { +// bool already_updated = false; +// for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { +// if (At.graph.entries(entryIdx) == update_list(checkIdx)) { +// already_updated = true; +// break; +// } +// } +// if (already_updated == false) { +// update_list(updateIdx) = At.graph.entries(entryIdx); +// ++updateIdx; +// } +// } +// } +// update_list_length(0) = updateIdx; +// } +// }; + template struct MDF_reindex_matrix { col_ind_type permutation_inv; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index 1c5216bfe5..a69e7a0e75 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -73,6 +73,7 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using execution_space = typename crs_matrix_type::execution_space; using range_policy_type = Kokkos::RangePolicy; + using team_range_policy_type = Kokkos::TeamPolicy; // Numerical phase: // loop over rows @@ -85,19 +86,19 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { KokkosSparse::sort_crs_matrix(At); values_mag_type discarded_fill("discarded fill", A.numRows()); col_ind_type deficiency("deficiency", A.numRows()); - col_ind_type update_list_length("update list length", 1); - typename col_ind_type::HostMirror update_list_length_host = - Kokkos::create_mirror_view(update_list_length); + ordinal_type update_list_len = 0; col_ind_type update_list("update list", A.numRows()); col_ind_type factored("factored rows", A.numRows()); Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); - KokkosSparse::Impl::MDF_discarded_fill_norm MDF_df_norm( - Atmp, At, 0, handle.permutation, discarded_fill, deficiency, - verbosity_level); - Kokkos::parallel_for("MDF: initial fill computation", - range_policy_type(0, Atmp.numRows()), MDF_df_norm); + KokkosSparse::Impl::MDF_discarded_fill_norm + MDF_df_norm(Atmp, At, 0, handle.permutation, discarded_fill, deficiency, + verbosity_level); + Kokkos::parallel_for( + "MDF: initial fill computation", + team_range_policy_type(Atmp.numRows(), Kokkos::AUTO, Kokkos::AUTO), + MDF_df_norm); for (ordinal_type factorization_step = 0; factorization_step < A.numRows(); ++factorization_step) { @@ -106,36 +107,54 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { static_cast(factorization_step)); } - Kokkos::deep_copy(update_list_length_host, update_list_length); - range_policy_type updatePolicy(0, update_list_length_host(0)); - KokkosSparse::Impl::MDF_selective_discarded_fill_norm - MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, - update_list, discarded_fill, deficiency, - verbosity_level); - Kokkos::parallel_for("MDF: updating fill norms", updatePolicy, - MDF_update_df_norm); + { + team_range_policy_type updatePolicy(update_list_len, Kokkos::AUTO, + Kokkos::AUTO); + KokkosSparse::Impl::MDF_discarded_fill_norm + MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, + discarded_fill, deficiency, verbosity_level, + update_list); + Kokkos::parallel_for("MDF: updating fill norms", updatePolicy, + MDF_update_df_norm); + } - range_policy_type stepPolicy(factorization_step, Atmp.numRows()); ordinal_type selected_row_idx = 0; - KokkosSparse::Impl::MDF_select_row MDF_row_selector( - factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, - handle.permutation); - Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, - selected_row_idx); - - KokkosSparse::Impl::MDF_compute_list_length - compute_list_length(selected_row_idx, Atmp, At, handle.permutation, - factored, update_list_length, update_list); - Kokkos::parallel_for("MDF: compute update list", range_policy_type(0, 1), - compute_list_length); - - KokkosSparse::Impl::MDF_factorize_row factorize_row( - Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, - handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, discarded_fill, factored, selected_row_idx, - factorization_step, verbosity_level); - Kokkos::parallel_for("MDF: factorize row", range_policy_type(0, 1), - factorize_row); + { + range_policy_type stepPolicy(factorization_step, Atmp.numRows()); + KokkosSparse::Impl::MDF_select_row MDF_row_selector( + factorization_step, discarded_fill, deficiency, Atmp.graph.row_map, + handle.permutation); + Kokkos::parallel_reduce("MDF: select pivot", stepPolicy, MDF_row_selector, + selected_row_idx); + } + + ordinal_type selected_row_len = 0; + { + team_range_policy_type updateListPolicy( + 1, Kokkos::AUTO); // (vector overloads required for scans to use + // vector parallel not provided by kokkos yet) + KokkosSparse::Impl::MDF_compute_list_length updateList( + Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, + handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, + handle.permutation_inv, discarded_fill, factored, selected_row_idx, + factorization_step, update_list, verbosity_level); + update_list_len = 0; + Kokkos::parallel_reduce("MDF: compute update list", updateListPolicy, + updateList, update_list_len, selected_row_len); + } + + // If this was the last row no need to update A and At! + if (factorization_step < A.numRows() - 1) { + team_range_policy_type factorizePolicy(selected_row_len, Kokkos::AUTO, + Kokkos::AUTO); + KokkosSparse::Impl::MDF_factorize_row factorize_row( + Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, + handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, + handle.permutation_inv, discarded_fill, factored, selected_row_idx, + factorization_step, update_list, verbosity_level); + Kokkos::parallel_for("MDF: factorize row", factorizePolicy, + factorize_row); + } if (verbosity_level > 0) { printf("\n"); diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index f6e4d0bc84..67aee2cbdc 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -16,14 +16,197 @@ #include #include - #include "KokkosSparse_mdf.hpp" +#include "KokkosSparse_CrsMatrix.hpp" namespace Test { +// void foo(){ + +// // const value_type four = static_cast(4.0); + +// constexpr ordinal_type numRows = 100; +// constexpr ordinal_type numCols = numRows; +// row_map_type row_map(Kokkos::ViewAllocateWithoutInitializing("row map"), +// numRows + 1); Kokkos::deep_copy(row_map,0); + +// constexpr value_type perc_fill = 0.3; +// constexpr size_type targetNonZerosPerRow = numRows*perc_fill; +// constexpr value_type num_fill_scl = 0.6; + +// Kokkos::Random_XorShift64_Pool random(13718 + 3); +// Kokkos::fill_random(row_map, random, +// size_type(targetNonZerosPerRow*num_fill_scl), +// value_type(targetNonZerosPerRow/num_fill_scl)); + +// size_type numNonZeros = 0; +// Kokkos::parallel_scan( +// Kokkos::RangePolicy(0,numRows+1), +// KOKKOS_LAMBDA(ordinal_type i,bool is_final,size_type & runningNZ){ +// if (is_final) { +// const auto curr_val = row_map[i]; +// row_map[i] = runningNZ; +// if (i < numRows) runningNZ += curr_val; +// } +// else { +// runningNZ += row_map[i]; +// } +// }, +// numNonZeros +// ); + +// // constexpr size_type numNonZeros = 64; +// // row_map_type row_map("row map", numRows + 1); +// col_ind_type col_ind("column indices", numNonZeros); +// values_type values("values", numNonZeros); +// Kokkos::fill_random(values, random, value_type(1.0), value_type(10.)); + +// } + +template +KokkosSparse::CrsMatrix +make_adv_diffusion_matrix(const scalar_type beta, const scalar_type vel_mag, + const size_type Nx, const size_type Ny) { + using crs_matrix_type = KokkosSparse::CrsMatrix; + using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; + using row_map_type = typename crs_graph_type::row_map_type::non_const_type; + using col_ind_type = typename crs_graph_type::entries_type::non_const_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using value_type = typename crs_matrix_type::value_type; + using execution_space = typename crs_matrix_type::execution_space; + + const ordinal_type numRows = Nx * Ny; + const ordinal_type& numCols = numRows; + row_map_type row_map(Kokkos::ViewAllocateWithoutInitializing("row map"), + numRows + 1); + + ordinal_type numNonZeros = 0; + Kokkos::parallel_scan( + Kokkos::RangePolicy(ordinal_type(0), + ordinal_type(numRows + 1)), + KOKKOS_LAMBDA(ordinal_type i, ordinal_type & runningNZ, bool is_final) { + const auto curr_val = (i == 0) ? 1 : 5; + if (is_final) row_map[i] = runningNZ; + if (i < numRows) runningNZ += curr_val; + }, + numNonZeros); + + col_ind_type col_ind("column indices", numNonZeros); + values_type values("values", numNonZeros); + Kokkos::parallel_for( + Kokkos::MDRangePolicy >({0, 0}, {Nx, Ny}), + KOKKOS_LAMBDA(ordinal_type iX, ordinal_type iY) { + const ordinal_type row_XY = iX + Nx * iY; + auto map_ind = row_map(row_XY); + if (row_XY == 0) { + col_ind(map_ind) = row_XY; + values(map_ind) = 1.; + return; + } + + const ordinal_type nX = (iX + Nx - 1) % Nx; + const ordinal_type pX = (iX + 1) % Nx; + const ordinal_type nY = (iY + Ny - 1) % Ny; + const ordinal_type pY = (iY + 1) % Ny; + + const ordinal_type row_pXY = pX + Nx * iY; + const ordinal_type row_nXY = nX + Nx * iY; + const ordinal_type row_XpY = iX + Nx * pY; + const ordinal_type row_XnY = iX + Nx * nY; + + // Negative y dir + col_ind(map_ind) = row_XnY; + values(map_ind) = beta; + ++map_ind; + // Negative x dir + col_ind(map_ind) = row_nXY; + values(map_ind) = beta - vel_mag; + ++map_ind; + // Middle + col_ind(map_ind) = row_XY; + values(map_ind) = -4.0 * beta + vel_mag; + ++map_ind; + // Positive x dir + col_ind(map_ind) = row_pXY; + values(map_ind) = beta; + ++map_ind; + // Positive y dir + col_ind(map_ind) = row_XpY; + values(map_ind) = beta; + }); + + return crs_matrix_type("A", numRows, numCols, numNonZeros, values, row_map, + col_ind); +} + +template +void run_test_mdf_recr_issue() { // + + // using execution_space = Kokkos::Serial; + using execution_space = typename device::execution_space; + + constexpr int num_teams = 10; + constexpr int num_per_team = 10; + Kokkos::View m_data( + Kokkos::ViewAllocateWithoutInitializing("data"), num_teams, num_per_team); + Kokkos::View m_num_entr( + Kokkos::ViewAllocateWithoutInitializing("data"), num_teams); + + using team_policy_t = Kokkos::TeamPolicy; + using member_t = typename team_policy_t::member_type; + + Kokkos::parallel_for(team_policy_t(num_teams, Kokkos::AUTO, Kokkos::AUTO), + KOKKOS_LAMBDA(member_t team) { + const auto iTeam = team.league_rank(); + + // int num_added; + Kokkos::parallel_scan( + Kokkos::TeamVectorRange(team, num_per_team), + [&](int i, int& partial_num, bool final) { + if (final) m_data(iTeam, i) = partial_num; + partial_num += i; + }); + + // // Do something with num_entr ... + // Kokkos::single(Kokkos::PerTeam(team),[&]{ + // m_num_entr(iTeam) = num_added; + // }); + }); +} + +template +void run_test_mdf() { //_timing + using crs_matrix_type = KokkosSparse::CrsMatrix; + using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; + using row_map_type = typename crs_graph_type::row_map_type::non_const_type; + using col_ind_type = typename crs_graph_type::entries_type::non_const_type; + using values_type = typename crs_matrix_type::values_type::non_const_type; + using value_type = typename crs_matrix_type::value_type; + using execution_space = typename crs_matrix_type::execution_space; + + const scalar_type beta = 1.0; + const scalar_type vel_mag = 0.5; + const size_type Nx = 400; + const size_type Ny = 400; + + crs_matrix_type A = + make_adv_diffusion_matrix( + beta, vel_mag, Nx, Ny); + + KokkosSparse::Experimental::MDF_handle handle(A); + handle.set_verbosity(0); + KokkosSparse::Experimental::mdf_symbolic(A, handle); + KokkosSparse::Experimental::mdf_numeric(A, handle); +} + template -void run_test_mdf() { +void run_test_mdf_real() { // using crs_matrix_type = KokkosSparse::CrsMatrix; using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; From 64c08fcab406a91323635ee7580f1d453e072c47 Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 7 Jul 2023 12:22:20 -0600 Subject: [PATCH 2/3] move to unordered_set for factored rows --- sparse/impl/KokkosSparse_mdf_impl.hpp | 828 ++------------------------ sparse/src/KokkosSparse_mdf.hpp | 28 +- sparse/unit_test/Test_Sparse_mdf.hpp | 185 +----- 3 files changed, 59 insertions(+), 982 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index 51f3ae98c3..1042c453f9 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -18,6 +18,7 @@ #define KOKKOSSPARSE_MDF_IMPL_HPP_ #include +#include #include "KokkosKernels_Sorting.hpp" #include "KokkosSparse_findRelOffset.hpp" #include @@ -67,6 +68,8 @@ struct MDF_count_lower { template struct MDF_discarded_fill_norm { + using device_type = typename crs_matrix_type::device_type; + using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; using col_ind_type = typename static_crs_graph_type::entries_type::non_const_type; @@ -78,10 +81,13 @@ struct MDF_discarded_fill_norm { using KAS = typename Kokkos::ArithTraits; using scalar_mag_type = typename KAS::mag_type; using KAM = typename Kokkos::ArithTraits; + using permutation_set_type = + Kokkos::UnorderedMap; crs_matrix_type A, At; ordinal_type factorization_step; col_ind_type permutation; + permutation_set_type permutation_set; col_ind_type update_list; values_mag_type discarded_fill; @@ -91,6 +97,7 @@ struct MDF_discarded_fill_norm { MDF_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, ordinal_type factorization_step_, col_ind_type permutation_, + permutation_set_type permutation_set_, values_mag_type discarded_fill_, col_ind_type deficiency_, int verbosity_, col_ind_type update_list_ = col_ind_type{}) @@ -98,6 +105,7 @@ struct MDF_discarded_fill_norm { At(At_), factorization_step(factorization_step_), permutation(permutation_), + permutation_set(permutation_set_), update_list(update_list_), discarded_fill(discarded_fill_), deficiency(deficiency_), @@ -176,15 +184,7 @@ struct MDF_discarded_fill_norm { // Check if row already eliminated if constexpr (!is_initial_fill) { - bool row_eliminated = false; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, factorization_step), - [&](const ordinal_type stepIdx, bool& running_row_eliminated) { - running_row_eliminated |= fillRowIdx == permutation(stepIdx); - }, - Kokkos::LOr(row_eliminated)); - - if (row_eliminated) return; + if (permutation_set.exists(fillRowIdx)) return; } const auto fillRowView = A.rowConst(fillRowIdx); @@ -198,13 +198,7 @@ struct MDF_discarded_fill_norm { if (fillColIdx == rowIdx) return; if constexpr (!is_initial_fill) { - bool col_eliminated = false; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - col_eliminated |= fillColIdx == permutation(stepIdx); - } - - if (col_eliminated) return; + if (permutation_set.exists(fillColIdx)) return; } bool entryIsDiscarded = true; @@ -244,270 +238,6 @@ struct MDF_discarded_fill_norm { } }; // MDF_discarded_fill_norm -// template -// struct MDF_discarded_fill_norm_old { -// using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; -// using col_ind_type = -// typename static_crs_graph_type::entries_type::non_const_type; -// using values_type = typename -// crs_matrix_type::values_type::non_const_type; using values_mag_type = -// typename MDF_types::values_mag_type; using size_type -// = typename crs_matrix_type::size_type; using ordinal_type = typename -// crs_matrix_type::ordinal_type; using scalar_type = typename -// crs_matrix_type::value_type; using KAS = typename -// Kokkos::ArithTraits; using scalar_mag_type = typename -// KAS::mag_type; using KAM = typename -// Kokkos::ArithTraits; - -// crs_matrix_type A, At; -// ordinal_type factorization_step; -// col_ind_type permutation; - -// values_mag_type discarded_fill; -// col_ind_type deficiency; -// int verbosity; - -// MDF_discarded_fill_norm_old(crs_matrix_type A_, crs_matrix_type At_, -// ordinal_type factorization_step_, -// col_ind_type permutation_, -// values_mag_type discarded_fill_, -// col_ind_type deficiency_, int verbosity_) -// : A(A_), -// At(At_), -// factorization_step(factorization_step_), -// permutation(permutation_), -// discarded_fill(discarded_fill_), -// deficiency(deficiency_), -// verbosity(verbosity_){}; - -// KOKKOS_INLINE_FUNCTION -// void operator()(const ordinal_type i) const { -// ordinal_type rowIdx = permutation(i); -// scalar_mag_type discard_norm = KAM::zero(); -// scalar_type diag_val = KAS::zero(); -// bool entryIsDiscarded = true; -// ordinal_type numFillEntries = 0; -// for (size_type alphaIdx = At.graph.row_map(rowIdx); -// alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { -// ordinal_type fillRowIdx = At.graph.entries(alphaIdx); -// bool row_not_eliminated = true; -// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) -// { -// if (fillRowIdx == permutation(stepIdx)) { -// row_not_eliminated = false; -// } -// } - -// if (fillRowIdx != rowIdx && row_not_eliminated) { -// for (size_type betaIdx = A.graph.row_map(rowIdx); -// betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { -// ordinal_type fillColIdx = A.graph.entries(betaIdx); -// bool col_not_eliminated = true; -// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; -// ++stepIdx) { -// if (fillColIdx == permutation(stepIdx)) { -// col_not_eliminated = false; -// } -// } - -// if (fillColIdx != rowIdx && col_not_eliminated) { -// entryIsDiscarded = true; -// for (size_type entryIdx = A.graph.row_map(fillRowIdx); -// entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { -// if (A.graph.entries(entryIdx) == fillColIdx) { -// entryIsDiscarded = false; -// } -// } -// if (entryIsDiscarded) { -// numFillEntries += 1; -// discard_norm += -// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * -// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); -// if (verbosity > 1) { -// if constexpr (std::is_arithmetic_v) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "Adding value A[%d,%d]=%f to discard norm of row %d\n", -// int(At.graph.entries(alphaIdx)), -// int(A.graph.entries(betaIdx)), -// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * -// KAS::abs(At.values(alphaIdx) * A.values(betaIdx)), -// int(rowIdx)); -// } -// } -// } -// } -// } -// } else if (fillRowIdx == rowIdx) { -// diag_val = At.values(alphaIdx); -// if (verbosity > 1) { -// if constexpr (std::is_arithmetic_v) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "Row %d diagonal value detected, values(%d)=%f\n", -// int(rowIdx), int(alphaIdx), At.values(alphaIdx)); -// } else if constexpr (std::is_arithmetic_v) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "Row %d diagonal value detected, |values(%d)|=%f\n", -// int(rowIdx), int(alphaIdx), KAS::abs(At.values(alphaIdx))); -// } -// } -// } -// } - -// // TODO add a check on `diag_val == zero` -// discard_norm = discard_norm / KAS::abs(diag_val * diag_val); -// discarded_fill(rowIdx) = discard_norm; -// deficiency(rowIdx) = numFillEntries; - -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 0) { -// const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) -// - -// A.graph.row_map(rowIdx) - -// 1); -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "Row %d has discarded fill of %f, deficiency of %d and degree -// %d\n", static_cast(rowIdx), -// static_cast(KAM::sqrt(discard_norm)), -// static_cast(deficiency(rowIdx)), static_cast(degree)); -// } -// } -// } - -// }; // MDF_discarded_fill_norm_old - -template -struct MDF_selective_discarded_fill_norm { - using static_crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; - using col_ind_type = - typename static_crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using size_type = typename crs_matrix_type::size_type; - using ordinal_type = typename crs_matrix_type::ordinal_type; - using scalar_type = typename crs_matrix_type::value_type; - using KAS = typename Kokkos::ArithTraits; - using scalar_mag_type = typename KAS::mag_type; - using KAM = typename Kokkos::ArithTraits; - using values_mag_type = typename MDF_types::values_mag_type; - - crs_matrix_type A, At; - ordinal_type factorization_step; - col_ind_type permutation; - col_ind_type update_list; - - values_mag_type discarded_fill; - col_ind_type deficiency; - int verbosity; - - MDF_selective_discarded_fill_norm(crs_matrix_type A_, crs_matrix_type At_, - ordinal_type factorization_step_, - col_ind_type permutation_, - col_ind_type update_list_, - values_mag_type discarded_fill_, - col_ind_type deficiency_, int verbosity_) - : A(A_), - At(At_), - factorization_step(factorization_step_), - permutation(permutation_), - update_list(update_list_), - discarded_fill(discarded_fill_), - deficiency(deficiency_), - verbosity(verbosity_){}; - - KOKKOS_INLINE_FUNCTION - void operator()(const ordinal_type i) const { - ordinal_type rowIdx = permutation(update_list(i)); - scalar_mag_type discard_norm = KAM::zero(); - scalar_type diag_val = KAS::zero(); - bool entryIsDiscarded = true; - ordinal_type numFillEntries = 0; - for (size_type alphaIdx = At.graph.row_map(rowIdx); - alphaIdx < At.graph.row_map(rowIdx + 1); ++alphaIdx) { - ordinal_type fillRowIdx = At.graph.entries(alphaIdx); - bool row_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) { - if (fillRowIdx == permutation(stepIdx)) { - row_not_eliminated = false; - } - } - - if (fillRowIdx != rowIdx && row_not_eliminated) { - for (size_type betaIdx = A.graph.row_map(rowIdx); - betaIdx < A.graph.row_map(rowIdx + 1); ++betaIdx) { - ordinal_type fillColIdx = A.graph.entries(betaIdx); - bool col_not_eliminated = true; - for (ordinal_type stepIdx = 0; stepIdx < factorization_step; - ++stepIdx) { - if (fillColIdx == permutation(stepIdx)) { - col_not_eliminated = false; - } - } - - if (fillColIdx != rowIdx && col_not_eliminated) { - entryIsDiscarded = true; - for (size_type entryIdx = A.graph.row_map(fillRowIdx); - entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { - if (A.graph.entries(entryIdx) == fillColIdx) { - entryIsDiscarded = false; - } - } - if (entryIsDiscarded) { - numFillEntries += 1; - discard_norm += - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Adding value A[%d,%d]=%f to discard norm of row %d\n", - static_cast(At.graph.entries(alphaIdx)), - static_cast(A.graph.entries(betaIdx)), - static_cast( - KAS::abs(At.values(alphaIdx) * A.values(betaIdx)) * - KAS::abs(At.values(alphaIdx) * A.values(betaIdx))), - static_cast(rowIdx)); - } - } - } - } - } - } else if (fillRowIdx == rowIdx) { - diag_val = At.values(alphaIdx); - if (verbosity > 1) { - if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, values(%d)=%f\n", - static_cast(rowIdx), static_cast(alphaIdx), - static_cast(At.values(alphaIdx))); - } else if constexpr (std::is_arithmetic_v) { - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d diagonal value dected, |values(%d)|=%f\n", - static_cast(rowIdx), static_cast(alphaIdx), - static_cast(KAS::abs(At.values(alphaIdx)))); - } - } - } - } - - // TODO add a check on `diag_val == zero` - discard_norm = discard_norm / KAS::abs(diag_val * diag_val); - discarded_fill(rowIdx) = discard_norm; - deficiency(rowIdx) = numFillEntries; - - if constexpr (std::is_arithmetic_v) { - if (verbosity > 0) { - const ordinal_type degree = ordinal_type(A.graph.row_map(rowIdx + 1) - - A.graph.row_map(rowIdx) - 1); - KOKKOS_IMPL_DO_NOT_USE_PRINTF( - "Row %d has discarded fill of %f, deficiency of %d and degree %d\n", - static_cast(rowIdx), - static_cast(KAM::sqrt(discard_norm)), - static_cast(deficiency(rowIdx)), static_cast(degree)); - } - } - } - -}; // MDF_selective_discarded_fill_norm - template struct MDF_select_row { using values_type = typename crs_matrix_type::values_type::non_const_type; @@ -630,6 +360,11 @@ KOKKOS_INLINE_FUNCTION bool sorted_view_contains( template struct MDF_factorize_row { + using device_type = typename crs_matrix_type::device_type; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -640,6 +375,8 @@ struct MDF_factorize_row { using value_type = typename crs_matrix_type::value_type; using values_mag_type = typename MDF_types::values_mag_type; using value_mag_type = typename values_mag_type::value_type; + using permutation_set_type = + Kokkos::UnorderedMap; crs_matrix_type A, At; @@ -652,6 +389,7 @@ struct MDF_factorize_row { values_type valuesU; col_ind_type permutation, permutation_inv; + permutation_set_type permutation_set; values_mag_type discarded_fill; col_ind_type factored; ordinal_type selected_row_idx, factorization_step; @@ -660,15 +398,12 @@ struct MDF_factorize_row { int verbosity; - using execution_space = typename crs_matrix_type::execution_space; - using team_policy_t = Kokkos::TeamPolicy; - using team_member_t = typename team_policy_t::member_type; - MDF_factorize_row(crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, col_ind_type permutation_inv_, + permutation_set_type permutation_set_, values_mag_type discarded_fill_, col_ind_type factored_, ordinal_type selected_row_idx_, ordinal_type factorization_step_, @@ -683,6 +418,7 @@ struct MDF_factorize_row { valuesU(valuesU_), permutation(permutation_), permutation_inv(permutation_inv_), + permutation_set(permutation_set_), discarded_fill(discarded_fill_), factored(factored_), selected_row_idx(selected_row_idx_), @@ -700,17 +436,7 @@ struct MDF_factorize_row { const auto rowInd = colView.colidx(alpha); if (rowInd == selected_row) return; - { - bool row_eliminated = false; - Kokkos::parallel_reduce( - Kokkos::TeamVectorRange(team, factorization_step), - [&](const ordinal_type step, bool& partial) { - partial |= rowInd == permutation(step); - }, - Kokkos::LOr(row_eliminated)); - - if (row_eliminated) return; - } + if (permutation_set.exists(rowInd)) return; // Only one of the values will match selected so can just sum all contribs const auto rowView = A.rowConst(selected_row); @@ -733,17 +459,7 @@ struct MDF_factorize_row { if (colInd == selected_row) return; - { - bool col_eliminated = false; - Kokkos::parallel_reduce( - Kokkos::ThreadVectorRange(team, factorization_step), - [&](const ordinal_type step, bool& partial) { - partial |= colInd == permutation(step); - }, - Kokkos::LOr(col_eliminated)); - - if (col_eliminated) return; - } + if (permutation_set.exists(colInd)) return; const auto subVal = colView.value(alpha) * rowView.value(beta) / diag; @@ -767,155 +483,13 @@ struct MDF_factorize_row { } }; -// template -// struct MDF_factorize_row_heir_old { -// using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: -// row_map_type::non_const_type; -// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: -// entries_type::non_const_type; -// using values_type = typename -// crs_matrix_type::values_type::non_const_type; using ordinal_type = -// typename crs_matrix_type::ordinal_type; using size_type = typename -// crs_matrix_type::size_type; using value_type = typename -// crs_matrix_type::value_type; using values_mag_type = typename -// MDF_types::values_mag_type; using value_mag_type = -// typename values_mag_type::value_type; - -// crs_matrix_type A, At; - -// row_map_type row_mapL; -// col_ind_type entriesL; -// values_type valuesL; - -// row_map_type row_mapU; -// col_ind_type entriesU; -// values_type valuesU; - -// col_ind_type permutation, permutation_inv; -// values_mag_type discarded_fill; -// col_ind_type factored; -// ordinal_type selected_row_idx, factorization_step; - -// col_ind_type update_list; - -// int verbosity; - -// using execution_space = typename crs_matrix_type::execution_space; -// using team_policy_t = Kokkos::TeamPolicy; -// using team_member_t = typename team_policy_t::member_type; - -// MDF_factorize_row_heir_old(crs_matrix_type A_, crs_matrix_type At_, -// row_map_type row_mapL_, col_ind_type entriesL_, -// values_type valuesL_, row_map_type row_mapU_, -// col_ind_type entriesU_, values_type valuesU_, -// col_ind_type permutation_, col_ind_type permutation_inv_, -// values_mag_type discarded_fill_, col_ind_type factored_, -// ordinal_type selected_row_idx_, -// ordinal_type factorization_step_, col_ind_type& -// update_list_, int verbosity_) -// : A(A_), -// At(At_), -// row_mapL(row_mapL_), -// entriesL(entriesL_), -// valuesL(valuesL_), -// row_mapU(row_mapU_), -// entriesU(entriesU_), -// valuesU(valuesU_), -// permutation(permutation_), -// permutation_inv(permutation_inv_), -// discarded_fill(discarded_fill_), -// factored(factored_), -// selected_row_idx(selected_row_idx_), -// factorization_step(factorization_step_), -// update_list(update_list_), -// verbosity(verbosity_){}; - -// //Phase 2, do facrotization -// KOKKOS_INLINE_FUNCTION -// void operator()(team_member_t team) const{ -// const ordinal_type selected_row = permutation(factorization_step); -// const auto rowView = A.rowConst(selected_row); -// const auto colView = At.rowConst(selected_row); - -// // If this was the last row no need to update A and At! -// if (factorization_step == A.numRows() - 1) { -// return; -// } - -// // Only one of the values will match selected so can just sum all -// contribs value_type diag = Kokkos::ArithTraits::zero(); -// Kokkos::parallel_reduce( -// Kokkos::TeamVectorRange(team,rowView.length), -// [&](const size_type alpha,value_type & running_diag){ -// if (rowView.colidx(alpha) == selected_row) -// running_diag = rowView.value(alpha); -// }, -// Kokkos::Sum(diag) -// ); - -// // Extract alpha and beta vectors -// // Then insert alpha*beta/diag_val if the corresponding -// // entry in A is non-zero. -// Kokkos::parallel_for( -// Kokkos::TeamThreadRange(team,colView.length), -// [&](const ordinal_type alpha){ -// const auto rowInd = colView.colidx(alpha); -// auto fillRowView = A.row(rowInd); - -// if (rowInd == selected_row) return; - -// bool row_eliminated = false; -// Kokkos::parallel_reduce( -// Kokkos::ThreadVectorRange(team,factorization_step), -// [&](const ordinal_type step, bool & partial){ -// partial |= rowInd == permutation(step); -// }, -// Kokkos::LOr(row_eliminated) -// ); - -// if (row_eliminated) return; - -// Kokkos::parallel_for( -// Kokkos::ThreadVectorRange(team,rowView.length), -// [&](const ordinal_type beta){ -// const auto colInd = rowView.colidx(beta); - -// if (colInd == selected_row) return; - -// bool col_eliminated = false; -// for (ordinal_type step = 0; step < factorization_step; ++step){ -// col_eliminated |= colInd == permutation(step); -// } - -// if (col_eliminated) return; - -// const auto subVal = colView.colidx(alpha) * rowView.colidx(beta) -// / diag; for (ordinal_type gamma = 0; gamma < fillRowView.length; -// ++gamma){ -// if (colInd == fillRowView.colidx(gamma)){ -// Kokkos::atomic_sub( -// &fillRowView.value(gamma), -// subVal -// ); -// } -// } -// auto fillColView = At.row(colInd); -// for (ordinal_type delt = 0; delt < fillColView.length; ++delt){ -// if (rowInd == fillColView.colidx(delt)){ -// Kokkos::atomic_sub( -// &fillColView.value(delt), -// subVal -// ); -// } -// } -// }); -// } -// ); -// } -// }; - template struct MDF_compute_list_length { + using device_type = typename crs_matrix_type::device_type; + using execution_space = typename crs_matrix_type::execution_space; + using team_policy_t = Kokkos::TeamPolicy; + using team_member_t = typename team_policy_t::member_type; + using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: row_map_type::non_const_type; using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: @@ -927,6 +501,9 @@ struct MDF_compute_list_length { using values_mag_type = typename MDF_types::values_mag_type; using value_mag_type = typename values_mag_type::value_type; + using permutation_set_type = + Kokkos::UnorderedMap; + crs_matrix_type A, At; row_map_type row_mapL; @@ -938,6 +515,7 @@ struct MDF_compute_list_length { values_type valuesU; col_ind_type permutation, permutation_inv; + permutation_set_type permutation_set; values_mag_type discarded_fill; col_ind_type factored; ordinal_type selected_row_idx, factorization_step; @@ -946,18 +524,14 @@ struct MDF_compute_list_length { int verbosity; - using execution_space = typename crs_matrix_type::execution_space; - using team_policy_t = Kokkos::TeamPolicy; - using team_member_t = typename team_policy_t::member_type; - MDF_compute_list_length( crs_matrix_type A_, crs_matrix_type At_, row_map_type row_mapL_, col_ind_type entriesL_, values_type valuesL_, row_map_type row_mapU_, col_ind_type entriesU_, values_type valuesU_, col_ind_type permutation_, - col_ind_type permutation_inv_, values_mag_type discarded_fill_, - col_ind_type factored_, ordinal_type selected_row_idx_, - ordinal_type factorization_step_, col_ind_type& update_list_, - int verbosity_) + col_ind_type permutation_inv_, permutation_set_type permutation_set_, + values_mag_type discarded_fill_, col_ind_type factored_, + ordinal_type selected_row_idx_, ordinal_type factorization_step_, + col_ind_type& update_list_, int verbosity_) : A(A_), At(At_), row_mapL(row_mapL_), @@ -968,6 +542,7 @@ struct MDF_compute_list_length { valuesU(valuesU_), permutation(permutation_), permutation_inv(permutation_inv_), + permutation_set(permutation_set_), discarded_fill(discarded_fill_), factored(factored_), selected_row_idx(selected_row_idx_), @@ -999,6 +574,11 @@ struct MDF_compute_list_length { // Diagonal value of L entriesL(L_entryIdx) = selected_row; valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); + + // Insert into permutation set for later + const auto res = permutation_set.insert(selected_row); + (void)res; // avoid unused error + assert(res.success()); }); ++L_entryIdx; @@ -1137,332 +717,6 @@ struct MDF_compute_list_length { } }; -// template -// struct MDF_factorize_row_old { -// using row_map_type = typename crs_matrix_type::StaticCrsGraphType:: -// row_map_type::non_const_type; -// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: -// entries_type::non_const_type; -// using values_type = typename -// crs_matrix_type::values_type::non_const_type; using ordinal_type = -// typename crs_matrix_type::ordinal_type; using size_type = typename -// crs_matrix_type::size_type; using value_type = typename -// crs_matrix_type::value_type; using values_mag_type = typename -// MDF_types::values_mag_type; using value_mag_type = -// typename values_mag_type::value_type; - -// crs_matrix_type A, At; - -// row_map_type row_mapL; -// col_ind_type entriesL; -// values_type valuesL; - -// row_map_type row_mapU; -// col_ind_type entriesU; -// values_type valuesU; - -// col_ind_type permutation, permutation_inv; -// values_mag_type discarded_fill; -// col_ind_type factored; -// ordinal_type selected_row_idx, factorization_step; - -// int verbosity; - -// MDF_factorize_row_old(crs_matrix_type A_, crs_matrix_type At_, -// row_map_type row_mapL_, col_ind_type entriesL_, -// values_type valuesL_, row_map_type row_mapU_, -// col_ind_type entriesU_, values_type valuesU_, -// col_ind_type permutation_, col_ind_type permutation_inv_, -// values_mag_type discarded_fill_, col_ind_type factored_, -// ordinal_type selected_row_idx_, -// ordinal_type factorization_step_, int verbosity_) -// : A(A_), -// At(At_), -// row_mapL(row_mapL_), -// entriesL(entriesL_), -// valuesL(valuesL_), -// row_mapU(row_mapU_), -// entriesU(entriesU_), -// valuesU(valuesU_), -// permutation(permutation_), -// permutation_inv(permutation_inv_), -// discarded_fill(discarded_fill_), -// factored(factored_), -// selected_row_idx(selected_row_idx_), -// factorization_step(factorization_step_), -// verbosity(verbosity_){}; - -// KOKKOS_INLINE_FUNCTION -// void operator()(const ordinal_type /* idx */) const { -// const ordinal_type selected_row = permutation(selected_row_idx); -// discarded_fill(selected_row) = -// Kokkos::ArithTraits::max(); - -// // Swap entries in permutation vectors -// permutation(selected_row_idx) = permutation(factorization_step); -// permutation(factorization_step) = selected_row; -// permutation_inv(permutation(factorization_step)) = factorization_step; -// permutation_inv(permutation(selected_row_idx)) = selected_row_idx; - -// if (verbosity > 0) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("Permutation vector: { "); -// for (ordinal_type rowIdx = 0; rowIdx < A.numRows(); ++rowIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", -// static_cast(permutation(rowIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// } - -// // Insert the upper part of the selected row in U -// // including the diagonal term. -// value_type diag = Kokkos::ArithTraits::zero(); -// size_type U_entryIdx = row_mapU(factorization_step); -// for (size_type entryIdx = A.graph.row_map(selected_row); -// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { -// if (permutation_inv(A.graph.entries(entryIdx)) >= factorization_step) { -// entriesU(U_entryIdx) = A.graph.entries(entryIdx); -// valuesU(U_entryIdx) = A.values(entryIdx); -// ++U_entryIdx; -// if (A.graph.entries(entryIdx) == selected_row) { -// diag = A.values(entryIdx); -// } -// } -// } -// row_mapU(factorization_step + 1) = U_entryIdx; -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 0) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("Diagonal values of row %d is %f\n", -// static_cast(selected_row), -// static_cast(diag)); -// } - -// if (verbosity > 2) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("U, row_map={ "); -// for (ordinal_type rowIdx = 0; rowIdx < factorization_step + 1; -// ++rowIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", -// static_cast(row_mapU(rowIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, entries={ "); -// for (size_type entryIdx = row_mapU(0); -// entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", -// static_cast(entriesU(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); -// for (size_type entryIdx = row_mapU(0); -// entryIdx < row_mapU(factorization_step + 1); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", -// static_cast(valuesU(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// } -// } - -// // Insert the lower part of the selected column of A -// // divided by its the diagonal value to obtain a unit -// // diagonal value in L. -// size_type L_entryIdx = row_mapL(factorization_step); -// entriesL(L_entryIdx) = selected_row; -// valuesL(L_entryIdx) = Kokkos::ArithTraits::one(); -// ++L_entryIdx; -// for (size_type entryIdx = At.graph.row_map(selected_row); -// entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { -// if (permutation_inv(At.graph.entries(entryIdx)) > factorization_step) { -// entriesL(L_entryIdx) = At.graph.entries(entryIdx); -// valuesL(L_entryIdx) = At.values(entryIdx) / diag; -// ++L_entryIdx; -// } -// } -// row_mapL(factorization_step + 1) = L_entryIdx; - -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 2) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "L(%d), [row_map(%d), row_map(%d)[ = [%d, %d[, entries={ ", -// static_cast(factorization_step), -// static_cast(factorization_step), -// static_cast(factorization_step + 1), -// static_cast(row_mapL(factorization_step)), -// static_cast(row_mapL(factorization_step + 1))); -// for (size_type entryIdx = row_mapL(factorization_step); -// entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d ", -// static_cast(entriesL(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}, values={ "); -// for (size_type entryIdx = row_mapL(factorization_step); -// entryIdx < row_mapL(factorization_step + 1); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("%f ", -// static_cast(valuesL(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// } -// } - -// // If this was the last row no need to update A and At! -// if (factorization_step == A.numRows() - 1) { -// return; -// } - -// // Finally we want to update A and At with the values -// // that where not discarded during factorization. -// // Note: this is almost the same operation as computing -// // the norm of the discarded fill... - -// // First step: find the diagonal entry in selected_row -// value_type diag_val = Kokkos::ArithTraits::zero(); -// for (size_type entryIdx = A.graph.row_map(selected_row); -// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { -// ordinal_type colIdx = A.graph.entries(entryIdx); -// if (selected_row == colIdx) { -// diag_val = A.values(entryIdx); -// } -// } - -// // Extract alpha and beta vectors -// // Then insert alpha*beta/diag_val if the corresponding -// // entry in A is non-zero. -// for (size_type alphaIdx = At.graph.row_map(selected_row); -// alphaIdx < At.graph.row_map(selected_row + 1); ++alphaIdx) { -// ordinal_type fillRowIdx = At.graph.entries(alphaIdx); -// bool row_not_eliminated = true; -// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; ++stepIdx) -// { -// if (fillRowIdx == permutation(stepIdx)) { -// row_not_eliminated = false; -// } -// } - -// if ((fillRowIdx != selected_row) && row_not_eliminated) { -// for (size_type betaIdx = A.graph.row_map(selected_row); -// betaIdx < A.graph.row_map(selected_row + 1); ++betaIdx) { -// ordinal_type fillColIdx = A.graph.entries(betaIdx); -// bool col_not_eliminated = true; -// for (ordinal_type stepIdx = 0; stepIdx < factorization_step; -// ++stepIdx) { -// if (fillColIdx == permutation(stepIdx)) { -// col_not_eliminated = false; -// } -// } - -// if ((fillColIdx != selected_row) && col_not_eliminated) { -// for (size_type entryIdx = A.graph.row_map(fillRowIdx); -// entryIdx < A.graph.row_map(fillRowIdx + 1); ++entryIdx) { -// if (A.graph.entries(entryIdx) == fillColIdx) { -// A.values(entryIdx) -= -// At.values(alphaIdx) * A.values(betaIdx) / diag_val; -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 1) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "A[%d, %d] -= %f\n", static_cast(fillRowIdx), -// static_cast(fillColIdx), -// static_cast(At.values(alphaIdx) * -// A.values(betaIdx) / diag_val)); -// } -// } -// } -// } - -// for (size_type entryIdx = At.graph.row_map(fillColIdx); -// entryIdx < At.graph.row_map(fillColIdx + 1); ++entryIdx) { -// if (At.graph.entries(entryIdx) == fillRowIdx) { -// At.values(entryIdx) -= -// At.values(alphaIdx) * A.values(betaIdx) / diag_val; -// } -// } -// } -// } -// } -// } - -// factored(selected_row) = 1; - -// if constexpr (std::is_arithmetic_v) { -// if (verbosity > 0) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in A: { "); -// for (size_type entryIdx = 0; entryIdx < A.nnz(); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "%f ", static_cast(A.values(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("New values in At: { "); -// for (size_type entryIdx = 0; entryIdx < At.nnz(); ++entryIdx) { -// KOKKOS_IMPL_DO_NOT_USE_PRINTF( -// "%f ", static_cast(At.values(entryIdx))); -// } -// KOKKOS_IMPL_DO_NOT_USE_PRINTF("}\n"); -// } -// } -// } // operator() - -// }; // MDF_factorize_row_old - -// template -// struct MDF_compute_list_length_old { -// using col_ind_type = typename crs_matrix_type::StaticCrsGraphType:: -// entries_type::non_const_type; -// using ordinal_type = typename crs_matrix_type::ordinal_type; -// using size_type = typename crs_matrix_type::size_type; - -// ordinal_type selected_row_idx; -// crs_matrix_type A; -// crs_matrix_type At; -// col_ind_type permutation; -// col_ind_type factored; -// col_ind_type update_list_length; -// col_ind_type update_list; - -// MDF_compute_list_length_old(const ordinal_type rowIdx_, const -// crs_matrix_type& A_, -// const crs_matrix_type& At_, -// const col_ind_type& permutation_, -// const col_ind_type factored_, -// col_ind_type& update_list_length_, -// col_ind_type& update_list_) -// : selected_row_idx(rowIdx_), -// A(A_), -// At(At_), -// permutation(permutation_), -// factored(factored_), -// update_list_length(update_list_length_), -// update_list(update_list_) {} - -// KOKKOS_INLINE_FUNCTION -// void operator()(const size_type /*idx*/) const { -// const ordinal_type selected_row = permutation(selected_row_idx); - -// size_type updateIdx = 0; -// for (size_type entryIdx = A.graph.row_map(selected_row); -// entryIdx < A.graph.row_map(selected_row + 1); ++entryIdx) { -// if ((A.graph.entries(entryIdx) != selected_row) && -// (factored(A.graph.entries(entryIdx)) != 1)) { -// update_list(updateIdx) = A.graph.entries(entryIdx); -// ++updateIdx; -// } -// } -// size_type update_rows = updateIdx; -// for (size_type entryIdx = At.graph.row_map(selected_row); -// entryIdx < At.graph.row_map(selected_row + 1); ++entryIdx) { -// if ((At.graph.entries(entryIdx) != selected_row) && -// (factored(A.graph.entries(entryIdx)) != 1)) { -// bool already_updated = false; -// for (size_type checkIdx = 0; checkIdx < update_rows; ++checkIdx) { -// if (At.graph.entries(entryIdx) == update_list(checkIdx)) { -// already_updated = true; -// break; -// } -// } -// if (already_updated == false) { -// update_list(updateIdx) = At.graph.entries(entryIdx); -// ++updateIdx; -// } -// } -// } -// update_list_length(0) = updateIdx; -// } -// }; - template struct MDF_reindex_matrix { col_ind_type permutation_inv; diff --git a/sparse/src/KokkosSparse_mdf.hpp b/sparse/src/KokkosSparse_mdf.hpp index a69e7a0e75..272180debe 100644 --- a/sparse/src/KokkosSparse_mdf.hpp +++ b/sparse/src/KokkosSparse_mdf.hpp @@ -27,6 +27,7 @@ #ifndef KOKKOSSPARSE_MDF_HPP_ #define KOKKOSSPARSE_MDF_HPP_ +#include #include "KokkosSparse_mdf_handle.hpp" #include "KokkosSparse_mdf_impl.hpp" @@ -71,10 +72,14 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { using ordinal_type = typename crs_matrix_type::ordinal_type; using value_mag_type = typename values_mag_type::value_type; + using device_type = typename crs_matrix_type::device_type; using execution_space = typename crs_matrix_type::execution_space; using range_policy_type = Kokkos::RangePolicy; using team_range_policy_type = Kokkos::TeamPolicy; + using permutation_set_type = + Kokkos::UnorderedMap; + // Numerical phase: // loop over rows // compute discarded fill of each row @@ -91,10 +96,11 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { col_ind_type factored("factored rows", A.numRows()); Kokkos::deep_copy(discarded_fill, Kokkos::ArithTraits::max()); Kokkos::deep_copy(deficiency, Kokkos::ArithTraits::max()); + permutation_set_type permutation_set(A.numRows()); KokkosSparse::Impl::MDF_discarded_fill_norm - MDF_df_norm(Atmp, At, 0, handle.permutation, discarded_fill, deficiency, - verbosity_level); + MDF_df_norm(Atmp, At, 0, handle.permutation, permutation_set, + discarded_fill, deficiency, verbosity_level); Kokkos::parallel_for( "MDF: initial fill computation", team_range_policy_type(Atmp.numRows(), Kokkos::AUTO, Kokkos::AUTO), @@ -112,8 +118,8 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { Kokkos::AUTO); KokkosSparse::Impl::MDF_discarded_fill_norm MDF_update_df_norm(Atmp, At, factorization_step, handle.permutation, - discarded_fill, deficiency, verbosity_level, - update_list); + permutation_set, discarded_fill, deficiency, + verbosity_level, update_list); Kokkos::parallel_for("MDF: updating fill norms", updatePolicy, MDF_update_df_norm); } @@ -130,14 +136,14 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { ordinal_type selected_row_len = 0; { - team_range_policy_type updateListPolicy( - 1, Kokkos::AUTO); // (vector overloads required for scans to use - // vector parallel not provided by kokkos yet) + // vector overloads required for scans to use vector parallel not yet + // provided by kokkos (https://github.com/kokkos/kokkos/issues/6259) + team_range_policy_type updateListPolicy(1, Kokkos::AUTO); KokkosSparse::Impl::MDF_compute_list_length updateList( Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, discarded_fill, factored, selected_row_idx, - factorization_step, update_list, verbosity_level); + handle.permutation_inv, permutation_set, discarded_fill, factored, + selected_row_idx, factorization_step, update_list, verbosity_level); update_list_len = 0; Kokkos::parallel_reduce("MDF: compute update list", updateListPolicy, updateList, update_list_len, selected_row_len); @@ -150,8 +156,8 @@ void mdf_numeric(const crs_matrix_type& A, MDF_handle& handle) { KokkosSparse::Impl::MDF_factorize_row factorize_row( Atmp, At, handle.row_mapL, handle.entriesL, handle.valuesL, handle.row_mapU, handle.entriesU, handle.valuesU, handle.permutation, - handle.permutation_inv, discarded_fill, factored, selected_row_idx, - factorization_step, update_list, verbosity_level); + handle.permutation_inv, permutation_set, discarded_fill, factored, + selected_row_idx, factorization_step, update_list, verbosity_level); Kokkos::parallel_for("MDF: factorize row", factorizePolicy, factorize_row); } diff --git a/sparse/unit_test/Test_Sparse_mdf.hpp b/sparse/unit_test/Test_Sparse_mdf.hpp index 67aee2cbdc..4b5b65aeb3 100644 --- a/sparse/unit_test/Test_Sparse_mdf.hpp +++ b/sparse/unit_test/Test_Sparse_mdf.hpp @@ -21,192 +21,9 @@ namespace Test { -// void foo(){ - -// // const value_type four = static_cast(4.0); - -// constexpr ordinal_type numRows = 100; -// constexpr ordinal_type numCols = numRows; -// row_map_type row_map(Kokkos::ViewAllocateWithoutInitializing("row map"), -// numRows + 1); Kokkos::deep_copy(row_map,0); - -// constexpr value_type perc_fill = 0.3; -// constexpr size_type targetNonZerosPerRow = numRows*perc_fill; -// constexpr value_type num_fill_scl = 0.6; - -// Kokkos::Random_XorShift64_Pool random(13718 + 3); -// Kokkos::fill_random(row_map, random, -// size_type(targetNonZerosPerRow*num_fill_scl), -// value_type(targetNonZerosPerRow/num_fill_scl)); - -// size_type numNonZeros = 0; -// Kokkos::parallel_scan( -// Kokkos::RangePolicy(0,numRows+1), -// KOKKOS_LAMBDA(ordinal_type i,bool is_final,size_type & runningNZ){ -// if (is_final) { -// const auto curr_val = row_map[i]; -// row_map[i] = runningNZ; -// if (i < numRows) runningNZ += curr_val; -// } -// else { -// runningNZ += row_map[i]; -// } -// }, -// numNonZeros -// ); - -// // constexpr size_type numNonZeros = 64; -// // row_map_type row_map("row map", numRows + 1); -// col_ind_type col_ind("column indices", numNonZeros); -// values_type values("values", numNonZeros); -// Kokkos::fill_random(values, random, value_type(1.0), value_type(10.)); - -// } - -template -KokkosSparse::CrsMatrix -make_adv_diffusion_matrix(const scalar_type beta, const scalar_type vel_mag, - const size_type Nx, const size_type Ny) { - using crs_matrix_type = KokkosSparse::CrsMatrix; - using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; - using row_map_type = typename crs_graph_type::row_map_type::non_const_type; - using col_ind_type = typename crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using value_type = typename crs_matrix_type::value_type; - using execution_space = typename crs_matrix_type::execution_space; - - const ordinal_type numRows = Nx * Ny; - const ordinal_type& numCols = numRows; - row_map_type row_map(Kokkos::ViewAllocateWithoutInitializing("row map"), - numRows + 1); - - ordinal_type numNonZeros = 0; - Kokkos::parallel_scan( - Kokkos::RangePolicy(ordinal_type(0), - ordinal_type(numRows + 1)), - KOKKOS_LAMBDA(ordinal_type i, ordinal_type & runningNZ, bool is_final) { - const auto curr_val = (i == 0) ? 1 : 5; - if (is_final) row_map[i] = runningNZ; - if (i < numRows) runningNZ += curr_val; - }, - numNonZeros); - - col_ind_type col_ind("column indices", numNonZeros); - values_type values("values", numNonZeros); - Kokkos::parallel_for( - Kokkos::MDRangePolicy >({0, 0}, {Nx, Ny}), - KOKKOS_LAMBDA(ordinal_type iX, ordinal_type iY) { - const ordinal_type row_XY = iX + Nx * iY; - auto map_ind = row_map(row_XY); - if (row_XY == 0) { - col_ind(map_ind) = row_XY; - values(map_ind) = 1.; - return; - } - - const ordinal_type nX = (iX + Nx - 1) % Nx; - const ordinal_type pX = (iX + 1) % Nx; - const ordinal_type nY = (iY + Ny - 1) % Ny; - const ordinal_type pY = (iY + 1) % Ny; - - const ordinal_type row_pXY = pX + Nx * iY; - const ordinal_type row_nXY = nX + Nx * iY; - const ordinal_type row_XpY = iX + Nx * pY; - const ordinal_type row_XnY = iX + Nx * nY; - - // Negative y dir - col_ind(map_ind) = row_XnY; - values(map_ind) = beta; - ++map_ind; - // Negative x dir - col_ind(map_ind) = row_nXY; - values(map_ind) = beta - vel_mag; - ++map_ind; - // Middle - col_ind(map_ind) = row_XY; - values(map_ind) = -4.0 * beta + vel_mag; - ++map_ind; - // Positive x dir - col_ind(map_ind) = row_pXY; - values(map_ind) = beta; - ++map_ind; - // Positive y dir - col_ind(map_ind) = row_XpY; - values(map_ind) = beta; - }); - - return crs_matrix_type("A", numRows, numCols, numNonZeros, values, row_map, - col_ind); -} - -template -void run_test_mdf_recr_issue() { // - - // using execution_space = Kokkos::Serial; - using execution_space = typename device::execution_space; - - constexpr int num_teams = 10; - constexpr int num_per_team = 10; - Kokkos::View m_data( - Kokkos::ViewAllocateWithoutInitializing("data"), num_teams, num_per_team); - Kokkos::View m_num_entr( - Kokkos::ViewAllocateWithoutInitializing("data"), num_teams); - - using team_policy_t = Kokkos::TeamPolicy; - using member_t = typename team_policy_t::member_type; - - Kokkos::parallel_for(team_policy_t(num_teams, Kokkos::AUTO, Kokkos::AUTO), - KOKKOS_LAMBDA(member_t team) { - const auto iTeam = team.league_rank(); - - // int num_added; - Kokkos::parallel_scan( - Kokkos::TeamVectorRange(team, num_per_team), - [&](int i, int& partial_num, bool final) { - if (final) m_data(iTeam, i) = partial_num; - partial_num += i; - }); - - // // Do something with num_entr ... - // Kokkos::single(Kokkos::PerTeam(team),[&]{ - // m_num_entr(iTeam) = num_added; - // }); - }); -} - -template -void run_test_mdf() { //_timing - using crs_matrix_type = KokkosSparse::CrsMatrix; - using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; - using row_map_type = typename crs_graph_type::row_map_type::non_const_type; - using col_ind_type = typename crs_graph_type::entries_type::non_const_type; - using values_type = typename crs_matrix_type::values_type::non_const_type; - using value_type = typename crs_matrix_type::value_type; - using execution_space = typename crs_matrix_type::execution_space; - - const scalar_type beta = 1.0; - const scalar_type vel_mag = 0.5; - const size_type Nx = 400; - const size_type Ny = 400; - - crs_matrix_type A = - make_adv_diffusion_matrix( - beta, vel_mag, Nx, Ny); - - KokkosSparse::Experimental::MDF_handle handle(A); - handle.set_verbosity(0); - KokkosSparse::Experimental::mdf_symbolic(A, handle); - KokkosSparse::Experimental::mdf_numeric(A, handle); -} - template -void run_test_mdf_real() { // +void run_test_mdf() { using crs_matrix_type = KokkosSparse::CrsMatrix; using crs_graph_type = typename crs_matrix_type::StaticCrsGraphType; From 0eae1f225967a2ae03ee9c0df202363cac270dad Mon Sep 17 00:00:00 2001 From: Tom Ransegnola Date: Fri, 7 Jul 2023 12:58:31 -0600 Subject: [PATCH 3/3] fix clangformat --- sparse/impl/KokkosSparse_mdf_impl.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/sparse/impl/KokkosSparse_mdf_impl.hpp b/sparse/impl/KokkosSparse_mdf_impl.hpp index 1042c453f9..4383279ad0 100644 --- a/sparse/impl/KokkosSparse_mdf_impl.hpp +++ b/sparse/impl/KokkosSparse_mdf_impl.hpp @@ -441,12 +441,13 @@ struct MDF_factorize_row { // Only one of the values will match selected so can just sum all contribs const auto rowView = A.rowConst(selected_row); value_type diag = Kokkos::ArithTraits::zero(); - Kokkos::parallel_reduce(Kokkos::TeamVectorRange(team, rowView.length), - [&](const size_type ind, value_type& running_diag) { - if (rowView.colidx(ind) == selected_row) - running_diag = rowView.value(ind); - }, - Kokkos::Sum(diag)); + Kokkos::parallel_reduce( + Kokkos::TeamVectorRange(team, rowView.length), + [&](const size_type ind, value_type& running_diag) { + if (rowView.colidx(ind) == selected_row) + running_diag = rowView.value(ind); + }, + Kokkos::Sum(diag)); // Extract alpha and beta vectors // Then insert alpha*beta/diag_val if the corresponding