Skip to content

Commit

Permalink
Merge symbolic LU for near-symmetric matrices
Browse files Browse the repository at this point in the history
This adds a symbolic LU factorization based on the symbolic Cholesky of A + A^T and the numerical LU.

Related PR: #1445
  • Loading branch information
upsj authored Nov 5, 2023
2 parents 0e736e8 + f55798e commit 78e6ea7
Show file tree
Hide file tree
Showing 19 changed files with 723 additions and 48 deletions.
24 changes: 16 additions & 8 deletions benchmark/solver/solver_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,13 +62,13 @@ DEFINE_bool(
rel_residual, false,
"Use relative residual instead of residual reduction stopping criterion");

DEFINE_string(
solvers, "cg",
"A comma-separated list of solvers to run. "
"Supported values are: bicgstab, bicg, cb_gmres_keep, "
"cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, "
"cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, "
"lower_trs, upper_trs, spd_direct, symm_direct, direct, overhead");
DEFINE_string(solvers, "cg",
"A comma-separated list of solvers to run. "
"Supported values are: bicgstab, bicg, cb_gmres_keep, "
"cb_gmres_reduce1, cb_gmres_reduce2, cb_gmres_integer, "
"cb_gmres_ireduce1, cb_gmres_ireduce2, cg, cgs, fcg, gmres, idr, "
"lower_trs, upper_trs, spd_direct, symm_direct, "
"near_symm_direct, direct, overhead");

DEFINE_uint32(
nrhs, 1,
Expand Down Expand Up @@ -246,7 +246,15 @@ std::unique_ptr<gko::LinOpFactory> generate_solver(
return gko::experimental::solver::Direct<etype, itype>::build()
.with_factorization(
gko::experimental::factorization::Lu<etype, itype>::build()
.with_symmetric_sparsity(true))
.with_symbolic_algorithm(gko::experimental::factorization::
symbolic_type::symmetric))
.on(exec);
} else if (description == "near_symm_direct") {
return gko::experimental::solver::Direct<etype, itype>::build()
.with_factorization(
gko::experimental::factorization::Lu<etype, itype>::build()
.with_symbolic_algorithm(gko::experimental::factorization::
symbolic_type::near_symmetric))
.on(exec);
} else if (description == "direct") {
return gko::experimental::solver::Direct<etype, itype>::build()
Expand Down
35 changes: 35 additions & 0 deletions benchmark/sparse_blas/operations.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -642,6 +642,37 @@ class SymbolicLuOperation : public BenchmarkOperation {
};


class SymbolicLuNearSymmOperation : public BenchmarkOperation {
public:
explicit SymbolicLuNearSymmOperation(const Mtx* mtx) : mtx_{mtx}, result_{}
{}

std::pair<bool, double> validate() const override
{
return std::make_pair(
validate_symbolic_factorization(mtx_, result_.get()), 0.0);
}

gko::size_type get_flops() const override { return 0; }

gko::size_type get_memory() const override { return 0; }

void run() override
{
gko::factorization::symbolic_lu_near_symm(mtx_, result_);
}

void write_stats(json& object) override
{
object["factor_nonzeros"] = result_->get_num_stored_elements();
}

private:
const Mtx* mtx_;
std::unique_ptr<Mtx> result_;
};


class SymbolicCholeskyOperation : public BenchmarkOperation {
public:
explicit SymbolicCholeskyOperation(const Mtx* mtx, bool symmetric)
Expand Down Expand Up @@ -817,6 +848,10 @@ const std::map<std::string,
[](const Mtx* mtx) {
return std::make_unique<SymbolicLuOperation>(mtx);
}},
{"symbolic_lu_near_symm",
[](const Mtx* mtx) {
return std::make_unique<SymbolicLuNearSymmOperation>(mtx);
}},
{"symbolic_cholesky",
[](const Mtx* mtx) {
return std::make_unique<SymbolicCholeskyOperation>(mtx, false);
Expand Down
2 changes: 1 addition & 1 deletion benchmark/sparse_blas/sparse_blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ using mat_data = gko::matrix_data<etype, itype>;
const char* operations_string =
"Comma-separated list of operations to be benchmarked. Can be "
"spgemm, spgeam, transpose, sort, is_sorted, generate_lookup, "
"lookup, symbolic_lu, symbolic_cholesky, "
"lookup, symbolic_lu, symbolic_lu_near_symm, symbolic_cholesky, "
"symbolic_cholesky_symmetric, reorder_rcm, "
#if GKO_HAVE_METIS
"reorder_nd, "
Expand Down
154 changes: 153 additions & 1 deletion common/cuda_hip/factorization/lu_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,10 @@ __global__ __launch_bounds__(default_block_size) void factorize(
// for each lower triangular entry: eliminate with corresponding row
for (auto lower_nz = row_begin; lower_nz < row_diag; lower_nz++) {
const auto dep = cols[lower_nz];
auto val = vals[lower_nz];
// we can load the value before synchronizing because the following
// updates only go past the diagonal of the dependency row, i.e. at
// least column dep + 1
const auto val = vals[lower_nz];
const auto diag_idx = diag_idxs[dep];
const auto dep_end = row_ptrs[dep + 1];
scheduler.wait(dep);
Expand All @@ -128,6 +131,88 @@ __global__ __launch_bounds__(default_block_size) void factorize(
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void symbolic_factorize_simple(
const IndexType* __restrict__ mtx_row_ptrs,
const IndexType* __restrict__ mtx_cols,
const IndexType* __restrict__ factor_row_ptrs,
const IndexType* __restrict__ factor_cols,
const IndexType* __restrict__ storage_offsets,
const int32* __restrict__ storage, const int64* __restrict__ row_descs,
IndexType* __restrict__ diag_idxs, ValueType* __restrict__ factor_vals,
IndexType* __restrict__ out_row_nnz, syncfree_storage dep_storage,
size_type num_rows)
{
using scheduler_t =
syncfree_scheduler<default_block_size, config::warp_size, IndexType>;
__shared__ typename scheduler_t::shared_storage sh_dep_storage;
scheduler_t scheduler(dep_storage, sh_dep_storage);
const auto row = scheduler.get_work_id();
if (row >= num_rows) {
return;
}
const auto warp =
group::tiled_partition<config::warp_size>(group::this_thread_block());
const auto lane = warp.thread_rank();
const auto factor_begin = factor_row_ptrs[row];
const auto factor_end = factor_row_ptrs[row + 1];
const auto mtx_begin = mtx_row_ptrs[row];
const auto mtx_end = mtx_row_ptrs[row + 1];
gko::matrix::csr::device_sparsity_lookup<IndexType> lookup{
factor_row_ptrs, factor_cols, storage_offsets,
storage, row_descs, static_cast<size_type>(row)};
const auto row_diag = lookup.lookup_unsafe(row) + factor_begin;
// fill with zeros first
for (auto nz = factor_begin + lane; nz < factor_end;
nz += config::warp_size) {
factor_vals[nz] = zero<float>();
}
warp.sync();
// then fill in the system matrix
for (auto nz = mtx_begin + lane; nz < mtx_end; nz += config::warp_size) {
const auto col = mtx_cols[nz];
factor_vals[lookup.lookup_unsafe(col) + factor_begin] = one<float>();
}
// finally set diagonal and store diagonal index
if (lane == 0) {
diag_idxs[row] = row_diag;
factor_vals[row_diag] = one<float>();
}
warp.sync();
// for each lower triangular entry: eliminate with corresponding row
for (auto lower_nz = factor_begin; lower_nz < row_diag; lower_nz++) {
const auto dep = factor_cols[lower_nz];
const auto dep_end = factor_row_ptrs[dep + 1];
scheduler.wait(dep);
// read the diag entry after we are sure it was written.
const auto diag_idx = diag_idxs[dep];
if (factor_vals[lower_nz] == one<float>()) {
// eliminate with upper triangle/entries past the diagonal
for (auto upper_nz = diag_idx + 1 + lane; upper_nz < dep_end;
upper_nz += config::warp_size) {
const auto upper_col = factor_cols[upper_nz];
const auto upper_val = factor_vals[upper_nz];
const auto output_pos =
lookup.lookup_unsafe(upper_col) + factor_begin;
if (upper_val == one<float>()) {
factor_vals[output_pos] = one<float>();
}
}
}
}
scheduler.mark_ready();
IndexType row_nnz{};
for (auto nz = factor_begin + lane; nz < factor_end;
nz += config::warp_size) {
row_nnz += factor_vals[nz] == one<float>() ? 1 : 0;
}
row_nnz = reduce(warp, row_nnz, thrust::plus<IndexType>{});
if (lane == 0) {
out_row_nnz[row] = row_nnz;
}
}


} // namespace kernel


Expand Down Expand Up @@ -177,3 +262,70 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);


template <typename IndexType>
void symbolic_factorize_simple(
std::shared_ptr<const DefaultExecutor> exec, const IndexType* row_ptrs,
const IndexType* col_idxs, const IndexType* lookup_offsets,
const int64* lookup_descs, const int32* lookup_storage,
matrix::Csr<float, IndexType>* factors, IndexType* out_row_nnz)
{
const auto num_rows = factors->get_size()[0];
const auto factor_row_ptrs = factors->get_const_row_ptrs();
const auto factor_cols = factors->get_const_col_idxs();
const auto factor_vals = factors->get_values();
array<IndexType> diag_idx_array{exec, num_rows};
array<int> tmp_storage{exec};
const auto diag_idxs = diag_idx_array.get_data();
if (num_rows > 0) {
syncfree_storage dep_storage(exec, tmp_storage, num_rows);
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
kernel::symbolic_factorize_simple<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
row_ptrs, col_idxs, factor_row_ptrs, factor_cols, lookup_offsets,
lookup_storage, lookup_descs, diag_idxs, factor_vals, out_row_nnz,
dep_storage, num_rows);
}
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);


struct first_eq_one_functor {
template <typename Pair>
__device__ __forceinline__ bool operator()(Pair pair) const
{
return thrust::get<0>(pair) == one<float>();
}
};


struct return_second_functor {
template <typename Pair>
__device__ __forceinline__ auto operator()(Pair pair) const
{
return thrust::get<1>(pair);
}
};


template <typename IndexType>
void symbolic_factorize_simple_finalize(
std::shared_ptr<const DefaultExecutor> exec,
const matrix::Csr<float, IndexType>* factors, IndexType* out_col_idxs)
{
const auto col_idxs = factors->get_const_col_idxs();
const auto vals = factors->get_const_values();
const auto input_it =
thrust::make_zip_iterator(thrust::make_tuple(vals, col_idxs));
const auto output_it = thrust::make_transform_output_iterator(
out_col_idxs, return_second_functor{});
thrust::copy_if(thrust_policy(exec), input_it,
input_it + factors->get_num_stored_elements(), output_it,
first_eq_one_functor{});
}

GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(
GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);
2 changes: 2 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -858,6 +858,8 @@ namespace lu_factorization {

GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE);
GKO_STUB_INDEX_TYPE(GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE);


} // namespace lu_factorization
Expand Down
17 changes: 14 additions & 3 deletions core/factorization/lu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ GKO_REGISTER_OPERATION(factorize, lu_factorization::factorize);
GKO_REGISTER_HOST_OPERATION(symbolic_cholesky,
gko::factorization::symbolic_cholesky);
GKO_REGISTER_HOST_OPERATION(symbolic_lu, gko::factorization::symbolic_lu);
GKO_REGISTER_HOST_OPERATION(symbolic_lu_near_symm,
gko::factorization::symbolic_lu_near_symm);


} // namespace
Expand Down Expand Up @@ -95,12 +97,21 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
const auto num_rows = mtx->get_size()[0];
std::unique_ptr<matrix_type> factors;
if (!parameters_.symbolic_factorization) {
if (parameters_.symmetric_sparsity) {
switch (parameters_.symbolic_algorithm) {
case symbolic_type::general:
exec->run(make_symbolic_lu(mtx.get(), factors));
break;
case symbolic_type::near_symmetric:
exec->run(make_symbolic_lu_near_symm(mtx.get(), factors));
break;
case symbolic_type::symmetric: {
std::unique_ptr<gko::factorization::elimination_forest<IndexType>>
forest;
exec->run(make_symbolic_cholesky(mtx.get(), true, factors, forest));
} else {
exec->run(make_symbolic_lu(mtx.get(), factors));
break;
}
default:
GKO_INVALID_STATE("Invalid symbolic factorization algorithm");
}
} else {
const auto& symbolic = parameters_.symbolic_factorization;
Expand Down
29 changes: 24 additions & 5 deletions core/factorization/lu_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,30 @@ namespace kernels {
array<int>& tmp_storage)


#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_LU_INITIALIZE(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_LU_FACTORIZE(ValueType, IndexType)
#define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType) \
void symbolic_factorize_simple( \
std::shared_ptr<const DefaultExecutor> exec, \
const IndexType* row_ptrs, const IndexType* col_idxs, \
const IndexType* factor_lookup_offsets, \
const int64* factor_lookup_descs, const int32* factor_lookup_storage, \
matrix::Csr<float, IndexType>* factors, IndexType* out_row_nnz)


#define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE(IndexType) \
void symbolic_factorize_simple_finalize( \
std::shared_ptr<const DefaultExecutor> exec, \
const matrix::Csr<float, IndexType>* factors, IndexType* col_idxs)


#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_LU_INITIALIZE(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_LU_FACTORIZE(ValueType, IndexType); \
template <typename IndexType> \
GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType); \
template <typename IndexType> \
GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE_FINALIZE(IndexType)


GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(lu_factorization,
Expand Down
Loading

0 comments on commit 78e6ea7

Please sign in to comment.