Merge Fix to ParILU by adding explicit zeros to diagonal

This PR adds the kernel `add_diagonal_elements` to `ParIlu` which adds explicit zeros to the main diagonal if such element was not already present. This ensures that both `L` and `U` are always properly updated during the generation. Related issue: #455 Related PR: #456
ginkgo-project · Feb 6, 2020 · 9b0c5f1 · 9b0c5f1
2 parents c60d71e + 1c3ea86
commit 9b0c5f1
Show file tree

Hide file tree

Showing 12 changed files with 1,176 additions and 128 deletions.
diff --git a/common/factorization/par_ilu_kernels.hpp.inc b/common/factorization/par_ilu_kernels.hpp.inc
@@ -33,6 +33,207 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 namespace kernel {
 
 
+namespace detail {
+
+
+// Default implementation for the unsorted case
+template <bool IsSorted>
+struct find_helper {
+    template <typename Group, typename IndexType>
+    static __forceinline__ __device__ bool find(Group subwarp_grp,
+                                                const IndexType *first,
+                                                const IndexType *last,
+                                                IndexType value)
+    {
+        auto subwarp_idx = subwarp_grp.thread_rank();
+        bool found{false};
+        for (auto curr_start = first; curr_start < last;
+             curr_start += subwarp_grp.size()) {
+            const auto curr = curr_start + subwarp_idx;
+            found = (curr < last && *curr == value);
+            found = subwarp_grp.any(found);
+            if (found) {
+                break;
+            }
+        }
+        return found;
+    }
+};
+
+
+// Improved version in case the CSR matrix is sorted
+template <>
+struct find_helper<true> {
+    template <typename Group, typename IndexType>
+    static __forceinline__ __device__ bool find(Group subwarp_grp,
+                                                const IndexType *first,
+                                                const IndexType *last,
+                                                IndexType value)
+    {
+        const auto length = static_cast<IndexType>(last - first);
+        const auto pos =
+            group_wide_search(IndexType{}, length, subwarp_grp,
+                              [&](IndexType i) { return first[i] >= value; });
+        return pos < length && first[pos] == value;
+    }
+};
+
+
+}  // namespace detail
+
+
+// SubwarpSize needs to be a power of 2
+// Each subwarp works on one row
+template <bool IsSorted, int SubwarpSize, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void find_missing_diagonal_elements(
+        IndexType num_rows, IndexType num_cols,
+        const IndexType *__restrict__ col_idxs,
+        const IndexType *__restrict__ row_ptrs,
+        IndexType *__restrict__ elements_to_add_per_row,
+        bool *__restrict__ changes_required)
+{
+    const auto total_thread_count =
+        static_cast<size_type>(blockDim.x) * gridDim.x / SubwarpSize;
+    const auto tidx =
+        threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;
+    const auto begin_row = static_cast<IndexType>(tidx / SubwarpSize);
+
+    auto thread_block = group::this_thread_block();
+    auto subwarp_grp = group::tiled_partition<SubwarpSize>(thread_block);
+    const auto subwarp_idx = subwarp_grp.thread_rank();
+
+    bool local_change{false};
+    for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
+        if (row >= num_cols) {
+            if (subwarp_idx == 0) {
+                elements_to_add_per_row[row] = 0;
+            }
+            continue;
+        }
+        const auto *start_cols = col_idxs + row_ptrs[row];
+        const auto *end_cols = col_idxs + row_ptrs[row + 1];
+        if (detail::find_helper<IsSorted>::find(subwarp_grp, start_cols,
+                                                end_cols, row)) {
+            if (subwarp_idx == 0) {
+                elements_to_add_per_row[row] = 0;
+            }
+        } else {
+            if (subwarp_idx == 0) {
+                elements_to_add_per_row[row] = 1;
+            }
+            local_change = true;
+        }
+    }
+    // Could also be reduced (not sure if that leads to a performance benefit)
+    if (local_change && subwarp_idx == 0) {
+        *changes_required = true;
+    }
+}
+
+
+// SubwarpSize needs to be a power of 2
+// Each subwarp works on one row
+template <int SubwarpSize, typename ValueType, typename IndexType>
+__global__
+    __launch_bounds__(default_block_size) void add_missing_diagonal_elements(
+        IndexType num_rows, const ValueType *__restrict__ old_values,
+        const IndexType *__restrict__ old_col_idxs,
+        const IndexType *__restrict__ old_row_ptrs,
+        ValueType *__restrict__ new_values,
+        IndexType *__restrict__ new_col_idxs,
+        const IndexType *__restrict__ row_ptrs_addition)
+{
+    // Precaution in case not enough threads were created
+    const auto total_thread_count =
+        static_cast<size_type>(blockDim.x) * gridDim.x / SubwarpSize;
+    const auto tidx =
+        threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;
+    const auto begin_row = static_cast<IndexType>(tidx / SubwarpSize);
+
+    auto thread_block = group::this_thread_block();
+    auto subwarp_grp = group::tiled_partition<SubwarpSize>(thread_block);
+    const auto subwarp_idx = subwarp_grp.thread_rank();
+
+    for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
+        const IndexType old_row_start{old_row_ptrs[row]};
+        const IndexType old_row_end{old_row_ptrs[row + 1]};
+        const IndexType new_row_start{old_row_start + row_ptrs_addition[row]};
+        const IndexType new_row_end{old_row_end + row_ptrs_addition[row + 1]};
+
+        // if no element needs to be added, do a simple copy of the whole row
+        if (new_row_end - new_row_start == old_row_end - old_row_start) {
+            for (IndexType i = subwarp_idx; i < new_row_end - new_row_start;
+                 i += SubwarpSize) {
+                const IndexType new_idx = new_row_start + i;
+                const IndexType old_idx = old_row_start + i;
+                new_values[new_idx] = old_values[old_idx];
+                new_col_idxs[new_idx] = old_col_idxs[old_idx];
+            }
+        } else {
+            IndexType new_idx = new_row_start + subwarp_idx;
+            bool diagonal_added{false};
+            for (IndexType old_idx_start = old_row_start;
+                 old_idx_start < old_row_end;
+                 old_idx_start += SubwarpSize, new_idx += SubwarpSize) {
+                const auto old_idx = old_idx_start + subwarp_idx;
+                bool thread_is_active = old_idx < old_row_end;
+                const auto col_idx =
+                    thread_is_active ? old_col_idxs[old_idx] : IndexType{};
+                // automatically false if thread is not active
+                bool diagonal_add_required = !diagonal_added && row < col_idx;
+                auto ballot = subwarp_grp.ballot(diagonal_add_required);
+
+                if (ballot) {
+                    auto first_subwarp_idx = ffs(ballot) - 1;
+                    if (first_subwarp_idx == subwarp_idx) {
+                        new_values[new_idx] = zero<ValueType>();
+                        new_col_idxs[new_idx] = row;
+                    }
+                    if (thread_is_active) {
+                        // if diagonal was inserted in a thread below this one,
+                        // add it to the new_idx.
+                        bool is_thread_after_diagonal =
+                            (first_subwarp_idx <= subwarp_idx);
+                        new_idx += is_thread_after_diagonal;
+                        new_values[new_idx] = old_values[old_idx];
+                        new_col_idxs[new_idx] = col_idx;
+                        // if diagonal is inserted in a thread after this one,
+                        // it needs to be considered after writing the values
+                        new_idx += !is_thread_after_diagonal;
+                    }
+                    diagonal_added = true;
+                } else if (thread_is_active) {
+                    new_values[new_idx] = old_values[old_idx];
+                    new_col_idxs[new_idx] = col_idx;
+                }
+            }
+            if (!diagonal_added && subwarp_idx == 0) {
+                new_idx = new_row_end - 1;
+                new_values[new_idx] = zero<ValueType>();
+                new_col_idxs[new_idx] = row;
+            }
+        }
+    }
+}
+
+
+template <typename IndexType>
+__global__ __launch_bounds__(default_block_size) void update_row_ptrs(
+    IndexType num_rows, IndexType *__restrict__ row_ptrs,
+    IndexType *__restrict__ row_ptr_addition)
+{
+    const auto total_thread_count =
+        static_cast<size_type>(blockDim.x) * gridDim.x;
+    const auto begin_row =
+        threadIdx.x + static_cast<size_type>(blockIdx.x) * blockDim.x;
+
+    for (IndexType row = begin_row; row < num_rows; row += total_thread_count) {
+        row_ptrs[row] += row_ptr_addition[row];
+    }
+}
+
+
 template <typename ValueType, typename IndexType>
 __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
     size_type num_rows, const IndexType *__restrict__ row_ptrs,
@@ -44,16 +245,13 @@ __global__ __launch_bounds__(default_block_size) void count_nnz_per_l_u_row(
     if (row < num_rows) {
         IndexType l_row_nnz{};
         IndexType u_row_nnz{};
-        bool has_diagonal{};
         for (auto idx = row_ptrs[row]; idx < row_ptrs[row + 1]; ++idx) {
             auto col = col_idxs[idx];
             l_row_nnz += (col <= row);
             u_row_nnz += (row <= col);
-            has_diagonal |= col == row;
         }
-        // if we didn't find it, add the diagonal entry
-        l_nnz_row[row] = l_row_nnz + !has_diagonal;
-        u_nnz_row[row] = u_row_nnz + !has_diagonal;
+        l_nnz_row[row] = l_row_nnz;
+        u_nnz_row[row] = u_row_nnz;
     }
 }
 
@@ -71,39 +269,21 @@ __global__ __launch_bounds__(default_block_size) void initialize_l_u(
     const auto row = blockDim.x * blockIdx.x + threadIdx.x;
     if (row < num_rows) {
         auto l_idx = l_row_ptrs[row];
-        auto u_idx = u_row_ptrs[row] + 1;  // we treat the diagonal separately
-        bool has_diagonal{};
-        ValueType diag_val{};
+        auto u_idx = u_row_ptrs[row];
         for (size_type i = row_ptrs[row]; i < row_ptrs[row + 1]; ++i) {
             const auto col = col_idxs[i];
             const auto val = values[i];
-            // save diagonal entry for later
-            if (col == row) {
-                has_diagonal = true;
-                diag_val = val;
-            }
-            if (col < row) {
+            if (col <= row) {
                 l_col_idxs[l_idx] = col;
-                l_values[l_idx] = val;
+                l_values[l_idx] = (col == row ? one<ValueType>() : val);
                 ++l_idx;
             }
-            if (row < col) {
+            if (row <= col) {
                 u_col_idxs[u_idx] = col;
                 u_values[u_idx] = val;
                 ++u_idx;
             }
         }
-        // if there was no diagonal entry, set it to one
-        if (!has_diagonal) {
-            diag_val = one<ValueType>();
-        }
-        // store diagonal entries
-        auto l_diag_idx = l_row_ptrs[row + 1] - 1;
-        auto u_diag_idx = u_row_ptrs[row];
-        l_col_idxs[l_diag_idx] = row;
-        u_col_idxs[u_diag_idx] = row;
-        l_values[l_diag_idx] = one<ValueType>();
-        u_values[u_diag_idx] = diag_val;
     }
 }
 
@@ -154,4 +334,4 @@ __global__ __launch_bounds__(default_block_size) void compute_l_u_factors(
 }
 
 
-}  // namespace kernel
+}  // namespace kernel
diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp
@@ -764,6 +764,12 @@ GKO_NOT_COMPILED(GKO_HOOK_MODULE);
 namespace par_ilu_factorization {
 
 
+template <typename ValueType, typename IndexType>
+GKO_DECLARE_PAR_ILU_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, IndexType)
+GKO_NOT_COMPILED(GKO_HOOK_MODULE);
+GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
+    GKO_DECLARE_PAR_ILU_ADD_DIAGONAL_ELEMENTS_KERNEL);
+
 template <typename ValueType, typename IndexType>
 GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType)
 GKO_NOT_COMPILED(GKO_HOOK_MODULE);

diff --git a/core/factorization/par_ilu.cpp b/core/factorization/par_ilu.cpp
@@ -53,6 +53,8 @@ namespace factorization {
 namespace par_ilu_factorization {
 
 
+GKO_REGISTER_OPERATION(add_diagonal_elements,
+                       par_ilu_factorization::add_diagonal_elements);
 GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
                        par_ilu_factorization::initialize_row_ptrs_l_u);
 GKO_REGISTER_OPERATION(initialize_l_u, par_ilu_factorization::initialize_l_u);
@@ -82,15 +84,18 @@ ParIlu<ValueType, IndexType>::generate_l_u(
     // Only copies the matrix if it is not on the same executor or was not in
     // the right format. Throws an exception if it is not convertable.
     std::unique_ptr<CsrMatrix> csr_system_matrix_unique_ptr{};
-    auto csr_system_matrix =
+    auto csr_system_matrix_const =
         dynamic_cast<const CsrMatrix *>(system_matrix.get());
-    if (csr_system_matrix == nullptr ||
-        csr_system_matrix->get_executor() != exec) {
+    CsrMatrix *csr_system_matrix{};
+    if (csr_system_matrix_const == nullptr ||
+        csr_system_matrix_const->get_executor() != exec) {
         csr_system_matrix_unique_ptr = CsrMatrix::create(exec);
         as<ConvertibleTo<CsrMatrix>>(system_matrix.get())
             ->convert_to(csr_system_matrix_unique_ptr.get());
-        csr_system_matrix = csr_system_matrix_unique_ptr.get();
+    } else {
+        csr_system_matrix_unique_ptr = csr_system_matrix_const->clone();
     }
+    csr_system_matrix = csr_system_matrix_unique_ptr.get();
     // If it needs to be sorted, copy it if necessary and sort it
     if (!skip_sorting) {
         if (csr_system_matrix_unique_ptr == nullptr) {
@@ -101,6 +106,10 @@ ParIlu<ValueType, IndexType>::generate_l_u(
         csr_system_matrix = csr_system_matrix_unique_ptr.get();
     }
 
+    // Add explicit diagonal zero elements if they are missing
+    exec->run(par_ilu_factorization::make_add_diagonal_elements(
+        csr_system_matrix, true));
+
     const auto matrix_size = csr_system_matrix->get_size();
     const auto number_rows = matrix_size[0];
     Array<IndexType> l_row_ptrs{exec, number_rows + 1};

diff --git a/core/factorization/par_ilu_kernels.hpp b/core/factorization/par_ilu_kernels.hpp
@@ -49,6 +49,10 @@ namespace gko {
 namespace kernels {
 
 
+#define GKO_DECLARE_PAR_ILU_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, IndexType) \
+    void add_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec,    \
+                               matrix::Csr<ValueType, IndexType> *mtx,         \
+                               bool is_sorted)
 #define GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, \
                                                            IndexType) \
     void initialize_row_ptrs_l_u(                                     \
@@ -70,6 +74,8 @@ namespace kernels {
 
 
 #define GKO_DECLARE_ALL_AS_TEMPLATES                                          \
+    template <typename ValueType, typename IndexType>                         \
+    GKO_DECLARE_PAR_ILU_ADD_DIAGONAL_ELEMENTS_KERNEL(ValueType, IndexType);   \
     template <typename ValueType, typename IndexType>                         \
     GKO_DECLARE_PAR_ILU_INITIALIZE_ROW_PTRS_L_U_KERNEL(ValueType, IndexType); \
     template <typename ValueType, typename IndexType>                         \