Skip to content

Commit

Permalink
Merge Fix to ParILU by adding explicit zeros to diagonal
Browse files Browse the repository at this point in the history
This PR adds the kernel `add_diagonal_elements` to `ParIlu` which adds
explicit zeros to the main diagonal if such element was not already
present.
This ensures that both `L` and `U` are always properly updated during
the generation.


Related issue: #455
Related PR: #456
  • Loading branch information
Thomas Grützmacher authored Feb 6, 2020
2 parents c60d71e + 1c3ea86 commit 9b0c5f1
Show file tree
Hide file tree
Showing 12 changed files with 1,176 additions and 128 deletions.
236 changes: 208 additions & 28 deletions common/factorization/par_ilu_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,207 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
namespace kernel {


namespace detail {


// Default implementation for the unsorted case
template <bool IsSorted>
struct find_helper {
template <typename Group, typename IndexType>
static __forceinline__ __device__ bool find(Group subwarp_grp,
const IndexType *first,
const IndexType *last,
IndexType value)
{
auto subwarp_idx = subwarp_grp.thread_rank();
bool found{false};
for (auto curr_start = first; curr_start < last;
curr_start += subwarp_grp.size()) {
const auto curr = curr_start + subwarp_idx;
found = (curr < last && *curr == value);
found = subwarp_grp.any(found);
if (found) {
break;
}
}
return found;
}
};


// Improved version in case the CSR matrix is sorted
template <>
struct find_helper<true> {
template <typename Group, typename IndexType>
static __forceinline__ __device__ bool find(Group subwarp_grp,
const IndexType *first,
const IndexType *last,
IndexType value)
{
const auto length = static_cast<IndexType>(last - first);
const auto pos =
group_wide_search(IndexType{}, length, subwarp_grp,
[&](IndexType i) { return first[i] >= value; });
return pos < length && first[pos] == value;
}
};


} // namespace detail


// SubwarpSize needs to be a power of 2
// Each subwarp works on one row
template <bool IsSorted, int SubwarpSize, typename IndexType>
__global__
__launch_bounds__(default_block_size) void find_missing_diagonal_elements(
IndexType num_rows, IndexType num_cols,
const IndexType *__restrict__ col_idxs,
const IndexType *__restrict__ row_ptrs,
IndexType *__restrict__ elements_to_add_per_row,
bool *__restrict__ changes_required)
{
const auto total_thread_count =
static_cast<size_type>(blockDim.x) * gridDim.x / SubwarpSize;
const auto tidx =
threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;
const auto begin_row = static_cast<IndexType>(tidx / SubwarpSize);

auto thread_block = group::this_thread_block();
auto subwarp_grp = group::tiled_partition<SubwarpSize>(thread_block);
const auto subwarp_idx = subwarp_grp.thread_rank();

bool local_change{false};
for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
if (row >= num_cols) {
if (subwarp_idx == 0) {
elements_to_add_per_row[row] = 0;
}
continue;
}
const auto *start_cols = col_idxs + row_ptrs[row];
const auto *end_cols = col_idxs + row_ptrs[row + 1];
if (detail::find_helper<IsSorted>::find(subwarp_grp, start_cols,
end_cols, row)) {
if (subwarp_idx == 0) {
elements_to_add_per_row[row] = 0;
}
} else {
if (subwarp_idx == 0) {
elements_to_add_per_row[row] = 1;
}
local_change = true;
}
}
// Could also be reduced (not sure if that leads to a performance benefit)
if (local_change && subwarp_idx == 0) {
*changes_required = true;
}
}


// SubwarpSize needs to be a power of 2
// Each subwarp works on one row
template <int SubwarpSize, typename ValueType, typename IndexType>
__global__
__launch_bounds__(default_block_size) void add_missing_diagonal_elements(
IndexType num_rows, const ValueType *__restrict__ old_values,
const IndexType *__restrict__ old_col_idxs,
const IndexType *__restrict__ old_row_ptrs,
ValueType *__restrict__ new_values,
IndexType *__restrict__ new_col_idxs,
const IndexType *__restrict__ row_ptrs_addition)
{
// Precaution in case not enough threads were created
const auto total_thread_count =
static_cast<size_type>(blockDim.x) * gridDim.x / SubwarpSize;
const auto tidx =
threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;
const auto begin_row = static_cast<IndexType>(tidx / SubwarpSize);

auto thread_block = group::this_thread_block();
auto subwarp_grp = group::tiled_partition<SubwarpSize>(thread_block);
const auto subwarp_idx = subwarp_grp.thread_rank();

for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
const IndexType old_row_start{old_row_ptrs[row]};
const IndexType old_row_end{old_row_ptrs[row + 1]};
const IndexType new_row_start{old_row_start + row_ptrs_addition[row]};
const IndexType new_row_end{old_row_end + row_ptrs_addition[row + 1]};

// if no element needs to be added, do a simple copy of the whole row
if (new_row_end - new_row_start == old_row_end - old_row_start) {
for (IndexType i = subwarp_idx; i < new_row_end - new_row_start;
i += SubwarpSize) {
const IndexType new_idx = new_row_start + i;
const IndexType old_idx = old_row_start + i;
new_values[new_idx] = old_values[old_idx];
new_col_idxs[new_idx] = old_col_idxs[old_idx];
}
} else {
IndexType new_idx = new_row_start + subwarp_idx;
bool diagonal_added{false};
for (IndexType old_idx_start = old_row_start;
old_idx_start < old_row_end;
old_idx_start += SubwarpSize, new_idx += SubwarpSize) {
const auto old_idx = old_idx_start + subwarp_idx;
bool thread_is_active = old_idx < old_row_end;
const auto col_idx =
thread_is_active ? old_col_idxs[old_idx] : IndexType{};
// automatically false if thread is not active
bool diagonal_add_required = !diagonal_added && row < col_idx;
auto ballot = subwarp_grp.ballot(diagonal_add_required);

if (ballot) {
auto first_subwarp_idx = ffs(ballot) - 1;
if (first_subwarp_idx == subwarp_idx) {
new_values[new_idx] = zero<ValueType>();
new_col_idxs[new_idx] = row;
}
if (thread_is_active) {
// if diagonal was inserted in a thread below this one,
// add it to the new_idx.
bool is_thread_after_diagonal =
(first_subwarp_idx <= subwarp_idx);
new_idx += is_thread_after_diagonal;
new_values[new_idx] = old_values[old_idx];
new_col_idxs[new_idx] = col_idx;
// if diagonal is inserted in a thread after this one,
// it needs to be considered after writing the values
new_idx += !is_thread_after_diagonal;
}
diagonal_added = true;
} else if (thread_is_active) {
new_values[new_idx] = old_values[old_idx];
new_col_idxs[new_idx] = col_idx;
}
}
if (!diagonal_added && subwarp_idx == 0) {
new_idx = new_row_end - 1;
new_values[new_idx] = zero<ValueType>();
new_col_idxs[new_idx] = row;
}
}
}
}


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void update_row_ptrs(
IndexType num_rows, IndexType *__restrict__ row_ptrs,
IndexType *__restrict__ row_ptr_addition)
{
const auto total_thread_count =
static_cast<size_type>(blockDim.x) * gridDim.x;
const auto begin_row =
threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;

for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
row_ptrs[row] += row_ptr_addition[row];
}
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
size_type num_rows, const IndexType *__restrict__ row_ptrs,
Expand All @@ -44,16 +245,13 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
if (row < num_rows) {
IndexType l_row_nnz{};
IndexType u_row_nnz{};
bool has_diagonal{};
for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; ++idx) {
auto col = col_idxs[idx];
l_row_nnz += (col <= row);
u_row_nnz += (row <= col);
has_diagonal |= col == row;
}
// if we didn't find it, add the diagonal entry
l_nnz_row[row] = l_row_nnz + !has_diagonal;
u_nnz_row[row] = u_row_nnz + !has_diagonal;
l_nnz_row[row] = l_row_nnz;
u_nnz_row[row] = u_row_nnz;
}
}

Expand All @@ -71,39 +269,21 @@ __global__ __launch_bounds__(default_block_size) void initialize_l_u(
const auto row = blockDim.x * blockIdx.x + threadIdx.x;
if (row < num_rows) {
auto l_idx = l_row_ptrs[row];
auto u_idx = u_row_ptrs[row] + 1; // we treat the diagonal separately
bool has_diagonal{};
ValueType diag_val{};
auto u_idx = u_row_ptrs[row];
for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
const auto col = col_idxs[i];
const auto val = values[i];
// save diagonal entry for later
if (col == row) {
has_diagonal = true;
diag_val = val;
}
if (col < row) {
if (col <= row) {
l_col_idxs[l_idx] = col;
l_values[l_idx] = val;
l_values[l_idx] = (col == row ? one<ValueType>() : val);
++l_idx;
}
if (row < col) {
if (row <= col) {
u_col_idxs[u_idx] = col;
u_values[u_idx] = val;
++u_idx;
}
}
// if there was no diagonal entry, set it to one
if (!has_diagonal) {
diag_val = one<ValueType>();
}
// store diagonal entries
auto l_diag_idx = l_row_ptrs[row + 1] - 1;
auto u_diag_idx = u_row_ptrs[row];
l_col_idxs[l_diag_idx] = row;
u_col_idxs[u_diag_idx] = row;
l_values[l_diag_idx] = one<ValueType>();
u_values[u_diag_idx] = diag_val;
}
}

Expand Down Expand Up @@ -154,4 +334,4 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors(
}


} // namespace kernel
} // namespace kernel
6 changes: 6 additions & 0 deletions core/device_hooks/common_kernels.inc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,12 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
namespace par_ilu_factorization {


template <typename ValueType, typename IndexType>
GKO_DECLARE_PAR_ILU_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_PAR_ILU_ADD_DIAGONAL_ELEMENTS_KERNEL);

template <typename ValueType, typename IndexType>
GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType)
GKO_NOT_COMPILED(GKO_HOOK_MODULE);
Expand Down
17 changes: 13 additions & 4 deletions core/factorization/par_ilu.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ namespace factorization {
namespace par_ilu_factorization {


GKO_REGISTER_OPERATION(add_diagonal_elements,
par_ilu_factorization::add_diagonal_elements);
GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
par_ilu_factorization::initialize_row_ptrs_l_u);
GKO_REGISTER_OPERATION(initialize_l_u, par_ilu_factorization::initialize_l_u);
Expand Down Expand Up @@ -82,15 +84,18 @@ ParIlu<ValueType, IndexType>::generate_l_u(
// Only copies the matrix if it is not on the same executor or was not in
// the right format. Throws an exception if it is not convertable.
std::unique_ptr<CsrMatrix> csr_system_matrix_unique_ptr{};
auto csr_system_matrix =
auto csr_system_matrix_const =
dynamic_cast<const CsrMatrix *>(system_matrix.get());
if (csr_system_matrix == nullptr ||
csr_system_matrix->get_executor() != exec) {
CsrMatrix *csr_system_matrix{};
if (csr_system_matrix_const == nullptr ||
csr_system_matrix_const->get_executor() != exec) {
csr_system_matrix_unique_ptr = CsrMatrix::create(exec);
as<ConvertibleTo<CsrMatrix>>(system_matrix.get())
->convert_to(csr_system_matrix_unique_ptr.get());
csr_system_matrix = csr_system_matrix_unique_ptr.get();
} else {
csr_system_matrix_unique_ptr = csr_system_matrix_const->clone();
}
csr_system_matrix = csr_system_matrix_unique_ptr.get();
// If it needs to be sorted, copy it if necessary and sort it
if (!skip_sorting) {
if (csr_system_matrix_unique_ptr == nullptr) {
Expand All @@ -101,6 +106,10 @@ ParIlu<ValueType, IndexType>::generate_l_u(
csr_system_matrix = csr_system_matrix_unique_ptr.get();
}

// Add explicit diagonal zero elements if they are missing
exec->run(par_ilu_factorization::make_add_diagonal_elements(
csr_system_matrix, true));

const auto matrix_size = csr_system_matrix->get_size();
const auto number_rows = matrix_size[0];
Array<IndexType> l_row_ptrs{exec, number_rows + 1};
Expand Down
6 changes: 6 additions & 0 deletions core/factorization/par_ilu_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ namespace gko {
namespace kernels {


#define GKO_DECLARE_PAR_ILU_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, IndexType) \
void add_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec, \
matrix::Csr<ValueType, IndexType> *mtx, \
bool is_sorted)
#define GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, \
IndexType) \
void initialize_row_ptrs_l_u( \
Expand All @@ -70,6 +74,8 @@ namespace kernels {


#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_PAR_ILU_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType); \
template <typename ValueType, typename IndexType> \
Expand Down
Loading

0 comments on commit 9b0c5f1

Please sign in to comment.