Accelerate multi-qubit gates (#490)

* M pennylane_lightning/core/src/bindings/Bindings.hpp; hack `JacobianData` to work with devices. M pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp; `applyMatrix` bugfix: use intermediate hostview to copy matrix data; same bugfix for `getDataVector`. M pennylane_lightning/core/src/simulators/lightning_kokkos/algorithms/AdjointJacobianKokkos.hpp; use copy constructor. M pennylane_lightning/core/src/simulators/lightning_kokkos/measurements/MeasurementsKokkos.hpp; use copy constructor. M pennylane_lightning/core/src/simulators/lightning_kokkos/observables/ObservablesKokkos.hpp; use copy constructor. M requirements-dev.txt; add clang-format-14. * Auto update version * Update changelog. * Auto update version * Auto update version * Add an argument to adjointJacobian to avoid syncing and copying state vector data in adjoint-diff. * Reformat * trigger CI * [skip ci] Update changelog. * Introduce std::unordered_map<std::string, ExpValFunc> expval_funcs_. * Introduce applyExpectationValueFunctor. * Add binding to LKokkos expval(matrix, wires). Combine expval functor calls into two templated methods. Call specialized expval methods when possible. Remove obsolete 'Apply directly' tests. * Update changelog. * Add test for arbitrary expval(Hermitian). * Add getExpectationValueMultiQubitOpFunctor. * Add typename hint for macos. * Add typename macos. * Use Kokkos::ThreadVectorRange policy for innerloop in getExpectationValueMultiQubitOpFunctor. * Auto update version * Auto update version * Couple fix for HIP. * WIP * Add specialized 3-5 qubit expval functors. * Add implementation notes. * [skip ci] Polish getExpValMatrix and limit getExpValNQubitOpFunctor to N == 4 (seems optimal on OPENMP/HIP backends although CUDA can go further). * Fix whitespace warning. * Auto update version * WIP * Add new NQubit functors + bugfix * Use simplified bitshifts in 1- & 2-qubit expval kernels. 
* Do not template NQubit methods over inverse and just take conj-trans before calling the functor. * Update tests. * Update changelog. * Bump pennylane version. * Bump pennylane version. * Auto update version * Reimplement expval functors with macros. * Auto update version * Reimplement N-wires unitary gate functors with macros. * Rename macros. * Update CHANGELOG.md * Auto update version * trigger CI * trigger CI * Bump Kokkos to 4.1.00 in CI. * Revert kokkos ver. * Add tests for macroed expval functors. * Remove redundant black lines. * Add tests for macroed gate functors. * Use matrix interface to get expval of HermitianObs in LKokkos. * Fix few 1U bit shifts. * Cover kokkos_args error. * Define DoubleLoopRank in KokkosSV. * Update changelog. * Auto update version * Fix codefactor error. * Update pennylane_lightning/core/src/simulators/lightning_kokkos/gates/tests/Test_StateVectorKokkos_Param.cpp Co-authored-by: Lee James O'Riordan <mlxd@users.noreply.github.com> * Auto update version * Define i000 vars more explicitly outside the macros. * trigger CI * Auto update version * trigger CI * trigger CI --------- Co-authored-by: Dev version update bot <github-actions[bot]@users.noreply.github.com> Co-authored-by: Amintor Dusko <87949283+AmintorDusko@users.noreply.github.com> Co-authored-by: Lee James O'Riordan <mlxd@users.noreply.github.com>
PennyLaneAI · Sep 8, 2023 · 3fbd4ce · 3fbd4ce
1 parent 7461186
commit 3fbd4ce
Show file tree

Hide file tree

Showing 9 changed files with 834 additions and 431 deletions.
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -15,6 +15,9 @@
 
 ### Improvements
 
+* Refactor LKokkos `StateVectorKokkos` class to use Kokkos `RangePolicy` together with special functors in `applyMultiQubitOp` to apply 1- to 4-wire generic unitary gates. For more than 4 wires, the general implementation using Kokkos `TeamPolicy` is employed to yield the best all-around performance.
+  [(#490)](https://github.com/PennyLaneAI/pennylane-lightning/pull/490)
+
 * Refactor LKokkos `Measurements` class to use Kokkos `RangePolicy` together with special functors to obtain the expectation value of 1- to 4-wire generic unitary gates. For more than 4 wires, the general implementation using Kokkos `TeamPolicy` is employed to yield the best all-around performance. 
   [(#489)] (https://github.com/PennyLaneAI/pennylane-lightning/pull/489)
 

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.33.0-dev6"
+__version__ = "0.33.0-dev7"
diff --git a/pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp b/pennylane_lightning/core/src/simulators/lightning_kokkos/StateVectorKokkos.hpp
@@ -61,8 +61,9 @@ class StateVectorKokkos final
   public:
     using PrecisionT = fp_t;
     using ComplexT = Kokkos::complex<fp_t>;
-    using KokkosExecSpace = Kokkos::DefaultExecutionSpace;
+    using DoubleLoopRank = Kokkos::Rank<2>;
     using HostExecSpace = Kokkos::DefaultHostExecutionSpace;
+    using KokkosExecSpace = Kokkos::DefaultExecutionSpace;
     using KokkosVector = Kokkos::View<ComplexT *>;
     using KokkosSizeTVector = Kokkos::View<size_t *>;
     using UnmanagedComplexHostView =
@@ -257,96 +258,61 @@ class StateVectorKokkos final
     }
 
     /**
-     * @brief Apply a single qubit operator to the state vector using a matrix
+     * @brief Apply a multi qubit operator to the state vector using a matrix
      *
      * @param matrix Kokkos gate matrix in the device space
      * @param wires Wires to apply gate to.
      * @param inverse Indicates whether to use adjoint of gate.
      */
-    void applySingleQubitOp(const KokkosVector &matrix,
-                            const std::vector<size_t> &wires,
-                            bool inverse = false) {
+    void applyMultiQubitOp(const KokkosVector &matrix,
+                           const std::vector<std::size_t> &wires,
+                           bool inverse = false) {
         auto &&num_qubits = this->getNumQubits();
-        if (!inverse) {
+        std::size_t two2N = std::exp2(num_qubits - wires.size());
+        std::size_t dim = std::exp2(wires.size());
+        KokkosVector matrix_trans("matrix_trans", matrix.size());
+
+        if (inverse) {
+            Kokkos::MDRangePolicy<DoubleLoopRank> policy_2d({0, 0}, {dim, dim});
             Kokkos::parallel_for(
-                Kokkos::RangePolicy<KokkosExecSpace>(0, exp2(num_qubits - 1)),
-                singleQubitOpFunctor<fp_t, false>(*data_, num_qubits, matrix,
-                                                  wires));
+                policy_2d,
+                KOKKOS_LAMBDA(const std::size_t i, const std::size_t j) {
+                    matrix_trans(i + j * dim) = conj(matrix(i * dim + j));
+                });
         } else {
-            Kokkos::parallel_for(
-                Kokkos::RangePolicy<KokkosExecSpace>(0, exp2(num_qubits - 1)),
-                singleQubitOpFunctor<fp_t, true>(*data_, num_qubits, matrix,
-                                                 wires));
+            matrix_trans = matrix;
         }
-    }
-
-    /**
-     * @brief Apply a two qubit operator to the state vector using a matrix
-     *
-     * @param matrix Kokkos gate matrix in the device space
-     * @param wires Wires to apply gate to.
-     * @param inverse Indicates whether to use adjoint of gate.
-     */
-    void applyTwoQubitOp(const KokkosVector &matrix,
-                         const std::vector<size_t> &wires,
-                         bool inverse = false) {
-        auto &&num_qubits = this->getNumQubits();
-        if (!inverse) {
+        switch (wires.size()) {
+        case 1:
             Kokkos::parallel_for(
-                Kokkos::RangePolicy<KokkosExecSpace>(0, exp2(num_qubits - 2)),
-                twoQubitOpFunctor<fp_t, false>(*data_, num_qubits, matrix,
-                                               wires));
-        } else {
+                two2N, apply1QubitOpFunctor<fp_t>(*data_, num_qubits,
+                                                  matrix_trans, wires));
+            break;
+        case 2:
             Kokkos::parallel_for(
-                Kokkos::RangePolicy<KokkosExecSpace>(0, exp2(num_qubits - 2)),
-                twoQubitOpFunctor<fp_t, true>(*data_, num_qubits, matrix,
-                                              wires));
-        }
-    }
-
-    /**
-     * @brief Apply a multi qubit operator to the state vector using a matrix
-     *
-     * @param matrix Kokkos gate matrix in the device space
-     * @param wires Wires to apply gate to.
-     * @param inverse Indicates whether to use adjoint of gate.
-     */
-    void applyMultiQubitOp(const KokkosVector &matrix,
-                           const std::vector<size_t> &wires,
-                           bool inverse = false) {
-        auto &&num_qubits = this->getNumQubits();
-        if (wires.size() == 1) {
-            applySingleQubitOp(matrix, wires, inverse);
-        } else if (wires.size() == 2) {
-            applyTwoQubitOp(matrix, wires, inverse);
-        } else {
-            Kokkos::View<const size_t *, Kokkos::HostSpace,
-                         Kokkos::MemoryTraits<Kokkos::Unmanaged>>
-                wires_host(wires.data(), wires.size());
-
-            Kokkos::View<size_t *> wires_view("wires_view", wires.size());
-            Kokkos::deep_copy(wires_view, wires_host);
-
-            std::size_t two2N = std::exp2(num_qubits_ - wires.size());
-            std::size_t dim = std::exp2(wires.size());
+                two2N, apply2QubitOpFunctor<fp_t>(*data_, num_qubits,
+                                                  matrix_trans, wires));
+            break;
+        case 3:
+            Kokkos::parallel_for(
+                two2N, apply3QubitOpFunctor<fp_t>(*data_, num_qubits,
+                                                  matrix_trans, wires));
+            break;
+        case 4:
+            Kokkos::parallel_for(
+                two2N, apply4QubitOpFunctor<fp_t>(*data_, num_qubits,
+                                                  matrix_trans, wires));
+            break;
+        default:
             std::size_t scratch_size = ScratchViewComplex::shmem_size(dim) +
                                        ScratchViewSizeT::shmem_size(dim);
-
-            if (!inverse) {
-                Kokkos::parallel_for(
-                    "multiQubitOpFunctor",
-                    TeamPolicy(two2N, Kokkos::AUTO, dim)
-                        .set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
-                    multiQubitOpFunctor<PrecisionT, false>(*data_, num_qubits,
-                                                           matrix, wires_view));
-            } else {
-                Kokkos::parallel_for(
-                    "multiQubitOpFunctor",
-                    TeamPolicy(two2N, Kokkos::AUTO, dim)
-                        .set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
-                    multiQubitOpFunctor<PrecisionT, true>(*data_, num_qubits,
-                                                          matrix, wires_view));
-            }
+            Kokkos::parallel_for(
+                "multiQubitOpFunctor",
+                TeamPolicy(two2N, Kokkos::AUTO, dim)
+                    .set_scratch_size(0, Kokkos::PerTeam(scratch_size)),
+                multiQubitOpFunctor<PrecisionT>(*data_, num_qubits,
+                                                matrix_trans, wires));
+            break;
         }
     }
 
@@ -361,7 +327,7 @@ class StateVectorKokkos final
     inline void applyMatrix(ComplexT *matrix, const std::vector<size_t> &wires,
                             bool inverse = false) {
         PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0");
-        size_t n = 1U << wires.size();
+        size_t n = static_cast<std::size_t>(1U) << wires.size();
         KokkosVector matrix_(matrix, n * n);
         applyMultiQubitOp(matrix_, wires, inverse);
     }
@@ -395,15 +361,10 @@ class StateVectorKokkos final
                             const std::vector<size_t> &wires,
                             bool inverse = false) {
         PL_ABORT_IF(wires.empty(), "Number of wires must be larger than 0");
-        size_t n = 1U << wires.size();
+        size_t n = static_cast<std::size_t>(1U) << wires.size();
         size_t n2 = n * n;
         KokkosVector matrix_("matrix_", n2);
-        typename KokkosVector::HostMirror matrix_h =
-            Kokkos::create_mirror_view(matrix_);
-        Kokkos::parallel_for(
-            Kokkos::RangePolicy<HostExecSpace>(0, n2),
-            KOKKOS_LAMBDA(const size_t i) { matrix_h(i) = matrix[i]; });
-        Kokkos::deep_copy(matrix_, matrix_h);
+        Kokkos::deep_copy(matrix_, UnmanagedConstComplexHostView(matrix, n2));
         applyMultiQubitOp(matrix_, wires, inverse);
     }