Update of the GMRES PR

vqd8a · Dec 2, 2021 · 8c3d535
1 parent df2b9a5
commit 8c3d535
Show file tree

Hide file tree

Showing 44 changed files with 366 additions and 774 deletions.
diff --git a/src/batched/dense/impl/KokkosBatched_Dot_Internal.hpp b/src/batched/dense/impl/KokkosBatched_Dot_Internal.hpp
@@ -34,11 +34,13 @@ struct SerialDotInternal {
   // j \in [0,n), i \in [0,m)
   // C(j) = conj(A(:,j))*B(:,j)
   template <typename ValueType, typename MagnitudeType>
-  KOKKOS_FORCEINLINE_FUNCTION static int invoke(
-      const int m, const int n, const ValueType *__restrict__ A, const int as0,
-      const int as1, const ValueType *__restrict__ B, const int bs0,
-      const int bs1,
-      /* */ MagnitudeType *__restrict__ C, const int cs) {
+  KOKKOS_INLINE_FUNCTION static int invoke(const int m, const int n,
+                                           const ValueType *__restrict__ A,
+                                           const int as0, const int as1,
+                                           const ValueType *__restrict__ B,
+                                           const int bs0, const int bs1,
+                                           /* */ MagnitudeType *__restrict__ C,
+                                           const int cs) {
     for (int j = 0; j < n; ++j)
       invoke(m, A + j * as1, as0, B + j * bs1, bs0, C + j * cs);
     return 0;

diff --git a/src/batched/sparse/KokkosBatched_CG.hpp b/src/batched/sparse/KokkosBatched_CG.hpp
@@ -49,14 +49,14 @@
 
 /// \brief Batched CG: Selective Interface
 ///
-/// \tparam OperatorType: The type of the opertator of the system
+/// \tparam OperatorType: The type of the operator of the system
 /// \tparam VectorViewType: Input type for the right-hand side and the solution,
 /// needs to be a 2D view
 ///
 /// \param member [in]: TeamPolicy member
 /// \param A [in]: batched operator (can be a batched matrix or a (left or right
 /// or both) preconditioned batched matrix) \param B [in]: right-hand side, a
-/// rank 2 view \param X [in/out]: initial guess and solutin, a rank 2 view
+/// rank 2 view \param X [in/out]: initial guess and solution, a rank 2 view
 /// \param handle [in]: a handle which provides different information such as
 /// the tolerance or the maximal number of iterations of the solver.
 

diff --git a/src/batched/sparse/KokkosBatched_CrsMatrix.hpp b/src/batched/sparse/KokkosBatched_CrsMatrix.hpp
@@ -64,18 +64,24 @@ class CrsMatrix {
   IntViewType row_ptr;
   IntViewType colIndices;
   int n_operators;
+  int n_rows;
+  int n_colums;
 
  public:
   KOKKOS_INLINE_FUNCTION
   CrsMatrix(const ValuesViewType &_values, const IntViewType &_row_ptr,
             const IntViewType &_colIndices)
       : values(_values), row_ptr(_row_ptr), colIndices(_colIndices) {
     n_operators = _values.extent(0);
+    n_rows      = _row_ptr.extent(0) - 1;
+    n_colums    = n_rows;
   }
+
   KOKKOS_INLINE_FUNCTION
   ~CrsMatrix() {}
 
-  /// \brief apply
+  /// \brief apply version that uses constant coefficients alpha and beta
+  ///
   ///   y_l <- alpha * A_l * x_l + beta * y_l for all l = 1, ..., N
   /// where:
   ///   * N is the number of matrices,
@@ -115,7 +121,7 @@ class CrsMatrix {
           member, alpha, values, row_ptr, colIndices, X, beta, Y);
   }
 
-  /// \brief apply
+  /// \brief apply version that uses variable coefficient alpha and no beta
   ///   y_l <- alpha_l * A_l * x_l  for all l = 1, ..., N
   /// where:
   ///   * N is the number of matrices,
@@ -147,7 +153,7 @@ class CrsMatrix {
                          Y);
   }
 
-  /// \brief apply
+  /// \brief apply version that uses variable coefficients alpha and beta
   ///   y_l <- alpha_l * A_l * x_l + beta_l * y_l for all l = 1, ..., N
   /// where:
   ///   * N is the number of matrices,

diff --git a/src/batched/sparse/KokkosBatched_GMRES.hpp b/src/batched/sparse/KokkosBatched_GMRES.hpp
@@ -49,14 +49,14 @@
 
 /// \brief Batched GMRES: Selective Interface
 ///
-/// \tparam OperatorType: The type of the opertator of the system
+/// \tparam OperatorType: The type of the operator of the system
 /// \tparam VectorViewType: Input type for the right-hand side and the solution,
 /// needs to be a 2D view
 ///
 /// \param member [in]: TeamPolicy member
 /// \param A [in]: batched operator (can be a batched matrix or a (left or right
 /// or both) preconditioned batched matrix) \param B [in]: right-hand side, a
-/// rank 2 view \param X [in/out]: initial guess and solutin, a rank 2 view
+/// rank 2 view \param X [in/out]: initial guess and solution, a rank 2 view
 /// \param handle [in]: a handle which provides different information such as
 /// the tolerance or the maximal number of iterations of the solver.
 

diff --git a/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_TeamVector_Impl.hpp
@@ -70,9 +70,6 @@ struct TeamVectorCG {
     typedef int OrdinalType;
     typedef typename Kokkos::Details::ArithTraits<
         typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
-    typedef Kokkos::View<MagnitudeType*, Kokkos::LayoutLeft,
-                         typename VectorViewType::device_type>
-        NormViewType;
 
     const size_t maximum_iteration = handle->get_max_iteration();
     const MagnitudeType tolerance  = handle->get_tolerance();
@@ -104,9 +101,6 @@ struct TeamVectorCG {
     // Deep copy of b into r_0:
     TeamVectorCopy<MemberType>::invoke(member, _B, R);
 
-    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
-                         [&](const OrdinalType& i) { mask(i) = 1.; });
-
     // r_0 := b - A x_0
     member.team_barrier();
     A.template apply<MemberType, ScratchPadVectorViewType,
@@ -120,6 +114,12 @@ struct TeamVectorCG {
     TeamVectorDot<MemberType>::invoke(member, R, R, sqr_norm_0);
     member.team_barrier();
 
+    Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) {
+                           mask(i) =
+                               sqr_norm_0(i) > tolerance * tolerance ? 1. : 0;
+                         });
+
     TeamVectorCopy1D::invoke(member, sqr_norm_0, sqr_norm_j);
 
     int status               = 1;

diff --git a/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_CG_Team_Impl.hpp
@@ -69,11 +69,8 @@ struct TeamCG {
     typedef int OrdinalType;
     typedef typename Kokkos::Details::ArithTraits<
         typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
-    typedef Kokkos::View<MagnitudeType*, Kokkos::LayoutLeft,
-                         typename VectorViewType::device_type>
-        NormViewType;
 
-    int maximum_iteration         = handle->get_max_iteration();
+    size_t maximum_iteration      = handle->get_max_iteration();
     const MagnitudeType tolerance = handle->get_tolerance();
 
     using ScratchPadNormViewType = Kokkos::View<
@@ -103,9 +100,6 @@ struct TeamCG {
     // Deep copy of b into r_0:
     TeamCopy<MemberType>::invoke(member, _B, R);
 
-    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
-                         [&](const OrdinalType& i) { mask(i) = 1.; });
-
     // r_0 := b - A x_0
     member.team_barrier();
     A.template apply<MemberType, ScratchPadVectorViewType,
@@ -119,6 +113,12 @@ struct TeamCG {
     TeamDot<MemberType>::invoke(member, R, R, sqr_norm_0);
     member.team_barrier();
 
+    Kokkos::parallel_for(Kokkos::TeamThreadRange(member, 0, numMatrices),
+                         [&](const OrdinalType& i) {
+                           mask(i) =
+                               sqr_norm_0(i) > tolerance * tolerance ? 1. : 0;
+                         });
+
     TeamCopy1D::invoke(member, sqr_norm_0, sqr_norm_j);
 
     int status               = 1;

diff --git a/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp b/src/batched/sparse/impl/KokkosBatched_GMRES_TeamVector_Impl.hpp
@@ -73,12 +73,6 @@ struct TeamVectorGMRES {
     typedef typename Kokkos::Details::ArithTraits<
         typename VectorViewType::non_const_value_type>::mag_type MagnitudeType;
     typedef Kokkos::Details::ArithTraits<MagnitudeType> ATM;
-    typedef Kokkos::View<MagnitudeType*, Kokkos::LayoutLeft,
-                         typename VectorViewType::device_type>
-        NormViewType;
-
-    int maximum_iteration         = handle->get_max_iteration();
-    const MagnitudeType tolerance = handle->get_tolerance();
 
     using ScratchPadNormViewType = Kokkos::View<
         MagnitudeType*,
@@ -96,14 +90,20 @@ struct TeamVectorGMRES {
     const OrdinalType numMatrices = _X.extent(0);
     const OrdinalType numRows     = _X.extent(1);
 
+    size_t maximum_iteration = handle->get_max_iteration() < numRows
+                                   ? handle->get_max_iteration()
+                                   : numRows;
+    const MagnitudeType tolerance     = handle->get_tolerance();
+    const MagnitudeType max_tolerance = 0.;
+
     ScratchPadMultiVectorViewType V(member.team_scratch(1), numMatrices,
                                     maximum_iteration + 1, numRows);
     ScratchPadMultiVectorViewType H(member.team_scratch(1), numMatrices,
                                     maximum_iteration + 1, maximum_iteration);
     ScratchPadMultiVectorViewType Givens(member.team_scratch(1), numMatrices,
                                          maximum_iteration, 2);
     ScratchPadVectorViewType G(member.team_scratch(1), numMatrices,
-                               maximum_iteration);
+                               maximum_iteration + 1);
 
     ScratchPadVectorViewType W(member.team_scratch(0), numMatrices, numRows);
     ScratchPadVectorViewType Q(member.team_scratch(0), numMatrices, numRows);
@@ -134,8 +134,8 @@ struct TeamVectorGMRES {
     Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
                          [&](const OrdinalType& i) {
                            beta(i) = ATM::sqrt(beta(i));
-                           G(i, 0) = beta(i);
-                           tmp(i)  = 1. / beta(i);
+                           G(i, 0) = beta(i) > max_tolerance ? beta(i) : 0.;
+                           tmp(i) = beta(i) > max_tolerance ? 1. / beta(i) : 0.;
                          });
 
     Kokkos::parallel_for(
@@ -166,21 +166,22 @@ struct TeamVectorGMRES {
         TeamVectorCopy1D::invoke(member, tmp,
                                  Kokkos::subview(H, Kokkos::ALL, i, j));
 
-        Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
-                             [&](const OrdinalType& i) { tmp(i) = -tmp(i); });
+        Kokkos::parallel_for(
+            Kokkos::TeamVectorRange(member, 0, numMatrices),
+            [&](const OrdinalType& ii) { tmp(ii) = -tmp(ii); });
 
         TeamVectorAxpy<MemberType>::invoke(member, tmp, V_i, W);
       }
 
       TeamVectorDot<MemberType>::invoke(member, W, W, tmp);
+      member.team_barrier();
       Kokkos::parallel_for(
           Kokkos::TeamVectorRange(member, 0, numMatrices),
-          [&](const OrdinalType& i) { tmp(i) = ATM::sqrt(tmp(i)); });
+          [&](const OrdinalType& i) {
+            H(i, j + 1, j) = ATM::sqrt(tmp(i));
+            tmp(i) = H(i, j + 1, j) > max_tolerance ? 1. / H(i, j + 1, j) : 0.;
+          });
       member.team_barrier();
-      TeamVectorCopy1D::invoke(member, tmp,
-                               Kokkos::subview(H, Kokkos::ALL, j + 1, j));
-      Kokkos::parallel_for(Kokkos::TeamVectorRange(member, 0, numMatrices),
-                           [&](const OrdinalType& i) { tmp(i) = 1. / tmp(i); });
       Kokkos::parallel_for(
           Kokkos::TeamVectorRange(member, 0, numMatrices * numRows),
           [&](const OrdinalType& iTemp) {
@@ -196,38 +197,39 @@ struct TeamVectorGMRES {
             // Apply the previous Givens rotations:
             auto H_j = Kokkos::subview(H, l, Kokkos::ALL, j);
 
-            for (size_t i = 0; i < j; ++i) {
+            if (mask(l) == 1.) {
+              for (size_t i = 0; i < j; ++i) {
+                auto tmp1 =
+                    Givens(l, i, 0) * H_j(i) + Givens(l, i, 1) * H_j(i + 1);
+                auto tmp2 =
+                    -Givens(l, i, 1) * H_j(i) + Givens(l, i, 0) * H_j(i + 1);
+                H_j(i)     = tmp1;
+                H_j(i + 1) = tmp2;
+              }
+
+              // Compute the new Givens rotation:
+              Kokkos::pair<typename VectorViewType::non_const_value_type,
+                           typename VectorViewType::non_const_value_type>
+                  G_new;
+              typename VectorViewType::non_const_value_type alpha;
+              SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha);
+
+              Givens(l, j, 0) = G_new.first;
+              Givens(l, j, 1) = G_new.second;
+
+              // Apply the new Givens rotation:
               auto tmp1 =
-                  Givens(l, i, 0) * H_j(i) + Givens(l, i, 1) * H_j(i + 1);
+                  Givens(l, j, 0) * H_j(j) + Givens(l, j, 1) * H_j(j + 1);
               auto tmp2 =
-                  -Givens(l, i, 1) * H_j(i) + Givens(l, i, 0) * H_j(i + 1);
-              H_j(i)     = tmp1;
-              H_j(i + 1) = tmp2;
-            }
-
-            // Compute the new Givens rotation:
-            Kokkos::pair<typename VectorViewType::non_const_value_type,
-                         typename VectorViewType::non_const_value_type>
-                G_new;
-            typename VectorViewType::non_const_value_type alpha;
-            SerialGivensInternal::invoke(H_j(j), H_j(j + 1), &G_new, &alpha);
-
-            Givens(l, j, 0) = G_new.first;
-            Givens(l, j, 1) = G_new.second;
-
-            // Apply the new Givens rotation:
-            auto tmp1 = Givens(l, j, 0) * H_j(j) + Givens(l, j, 1) * H_j(j + 1);
-            auto tmp2 =
-                -Givens(l, j, 1) * H_j(j) + Givens(l, j, 0) * H_j(j + 1);
-            H_j(j)     = tmp1;
-            H_j(j + 1) = tmp2;
-
-            G(l, j + 1) = -Givens(l, j, 1) * G(l, j);
-            G(l, j) *= Givens(l, j, 0);
-
-            if (mask(l) == 0.) {
-              H_j(j)  = 1.;
-              G(l, j) = 0.;
+                  -Givens(l, j, 1) * H_j(j) + Givens(l, j, 0) * H_j(j + 1);
+              H_j(j)     = tmp1;
+              H_j(j + 1) = tmp2;
+
+              G(l, j + 1) = -Givens(l, j, 1) * G(l, j);
+              G(l, j) *= Givens(l, j, 0);
+            } else {
+              H_j(j)      = 1.;
+              G(l, j + 1) = 0.;
             }
 
             if (mask(l) == 1. && std::abs(G(l, j + 1)) / beta(l) < tolerance) {