Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add inverse support for gate operation in lightning.tensor #753

Merged
merged 21 commits into from
Jun 6, 2024
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .github/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
# Release 0.37.0-dev

### New features since last release
* Add `inverse` support for gate operations in `lightning.tensor` in the C++ layer.
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
[(#753)](https://github.com/PennyLaneAI/pennylane-lightning/pull/753)

* Add `observable` and `expval` support to `cutensornet` backed `lightning.tensor` C++ layer.
[(#728)](https://github.com/PennyLaneAI/pennylane-lightning/pull/728)

Expand Down Expand Up @@ -29,6 +32,9 @@

### Improvements

* Set `state_tensor` as `const` for the `MeasurementTNCuda` class.
[(#753)](https://github.com/PennyLaneAI/pennylane-lightning/pull/753)

* Updated Kokkos version and support to 4.3.01.
[(#725)](https://github.com/PennyLaneAI/pennylane-lightning/pull/725)

Expand Down
3 changes: 2 additions & 1 deletion pennylane_lightning/core/_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,5 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.37.0-dev28"

multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
__version__ = "0.37.0-dev29"
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
public:
MPSTNCuda() = delete;

// TODO: Add method to the constructor to allow the user to select methods at
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
// runtime in the C++ layer
explicit MPSTNCuda(const std::size_t numQubits,
const std::size_t maxBondDim)
: BaseType(numQubits), maxBondDim_(maxBondDim),
Expand All @@ -90,6 +92,8 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
initTensors_();
}

// TODO: Add method to the constructor to allow the user to select methods at
// runtime in the C++ layer
explicit MPSTNCuda(const std::size_t numQubits,
const std::size_t maxBondDim, DevTag<int> dev_tag)
: BaseType(numQubits, dev_tag), maxBondDim_(maxBondDim),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
public:
TNCudaBase() = delete;

// TODO: Add method to the constructor to allow the user to select methods at
// runtime in the C++ layer
explicit TNCudaBase(const std::size_t numQubits, int device_id = 0,
cudaStream_t stream_id = 0)
: BaseType(numQubits), handle_(make_shared_tncuda_handle()),
Expand All @@ -98,6 +100,8 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
/* cutensornetState_t * */ &quantumState_));
}

// TODO: Add method to the constructor to allow the user to select methods at
// runtime in the C++ layer
explicit TNCudaBase(const std::size_t numQubits, DevTag<int> dev_tag)
: BaseType(numQubits), handle_(make_shared_tncuda_handle()),
dev_tag_(dev_tag),
Expand Down Expand Up @@ -236,6 +240,13 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
bool adjoint = false,
const std::vector<PrecisionT> &params = {0.0},
const std::vector<ComplexT> &gate_matrix = {}) {
// TODO: Need to revisit this line of code for the exact TN backend.
// We should be able to turn on or skip this check based on the backend,
// if(getMethod() == "mps") { ... }
PL_ABORT_IF(
wires.size() > 2,
"Unsupported gate: MPS method only supports 1, 2-wires gates");

auto &&par = (params.empty()) ? std::vector<PrecisionT>{0.0} : params;
DataBuffer<PrecisionT, int> dummy_device_data(
Pennylane::Util::exp2(wires.size()), getDevTag());
Expand All @@ -259,17 +270,18 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
/* void * */ static_cast<void *>(dummy_device_data.getData()),
/* const int64_t *tensorModeStrides */ nullptr,
/* const int32_t immutable */ 1,
/* const int32_t adjoint */ adjoint,
/* const int32_t adjoint */ 0,
/* const int32_t unitary */ 1,
/* int64_t * */ &id));
if (!gate_matrix.empty()) {
auto gate_key = std::make_pair(opName, par);
std::vector<CFP_t> matrix_cu =
cuUtil::complexToCu<ComplexT>(gate_matrix);
gate_cache_->add_gate(static_cast<std::size_t>(id), gate_key,
matrix_cu);
matrix_cu, adjoint);
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
} else {
gate_cache_->add_gate(static_cast<std::size_t>(id), opName, par);
gate_cache_->add_gate(static_cast<std::size_t>(id), opName, par,
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
adjoint);
}
PL_CUTENSORNET_IS_SUCCESS(cutensornetStateUpdateTensorOperator(
/* const cutensornetHandle_t */ getTNCudaHandle(),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,18 @@ template <class PrecisionT> class TNCudaGateCache {
* @param gate_name String representing the name of the given gate.
* @param gate_param Vector of parameter values. `{}` if non-parametric
* gate.
* @param adjoint Boolean value indicating if the gate requires adjoint.
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
*/
void add_gate(const std::size_t gate_id, const std::string &gate_name,
[[maybe_unused]] std::vector<PrecisionT> gate_param = {}) {
[[maybe_unused]] std::vector<PrecisionT> gate_param = {},
bool adjoint = false) {
auto gate_key = std::make_pair(gate_name, gate_param);

auto &gateMap =
cuGates::DynamicGateDataAccess<PrecisionT>::getInstance();

add_gate(gate_id, gate_key, gateMap.getGateData(gate_name, gate_param));
add_gate(gate_id, gate_key, gateMap.getGateData(gate_name, gate_param),
adjoint);
}
/**
* @brief Add gate numerical value to the cache, indexed by the id of gate
Expand All @@ -93,10 +96,12 @@ template <class PrecisionT> class TNCudaGateCache {
* its associated parameter value.
* @param gate_data_host Vector of complex floating point values
* representing the gate data on host.
* @param adjoint Boolean value indicating if the gate requires adjoint.
*/

multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved
void add_gate(const std::size_t gate_id, gate_key_info gate_key,
const std::vector<CFP_t> &gate_data_host) {
const std::vector<CFP_t> &gate_data_host,
bool adjoint = false) {
const std::size_t rank = Pennylane::Util::log2(gate_data_host.size());
auto modes = std::vector<std::size_t>(rank, 0);
auto extents = std::vector<std::size_t>(rank, 2);
Expand All @@ -108,8 +113,29 @@ template <class PrecisionT> class TNCudaGateCache {
std::piecewise_construct, std::forward_as_tuple(gate_id),
std::forward_as_tuple(gate_key, std::move(tensor)));

device_gates_.at(gate_id).second.getDataBuffer().CopyHostDataToGpu(
gate_data_host.data(), gate_data_host.size());
if (adjoint) {
// TODO: This is a temporary solution for gates data transpose.
// There should be a better way to handle this, but there is not
// a big performance issue for now since the size of gates is small.
std::vector<CFP_t> data_host_transpose(gate_data_host.size());

std::size_t col_size = 1 << (rank / 2);
std::size_t row_size = 1 << (rank / 2);
multiphaseCFD marked this conversation as resolved.
Show resolved Hide resolved

for (std::size_t idx = 0; idx < gate_data_host.size(); idx++) {
std::size_t col = idx / row_size;
std::size_t row = idx % row_size;

data_host_transpose.at(row * col_size + col) = {
gate_data_host.at(idx).x, -gate_data_host.at(idx).y};
}

device_gates_.at(gate_id).second.getDataBuffer().CopyHostDataToGpu(
data_host_transpose.data(), data_host_transpose.size());
} else {
device_gates_.at(gate_id).second.getDataBuffer().CopyHostDataToGpu(
gate_data_host.data(), gate_data_host.size());
}

total_alloc_bytes_ += (sizeof(CFP_t) * gate_data_host.size());
}
Expand Down
Loading
Loading