Add probs() support to lightning.tensor (#830)
### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test. If you've fixed a bug or added code that should be tested, add a test to the [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and documented. If you do make documentation changes, make sure that the docs build and render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing the change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`. 

When all the above are checked, delete everything above the dashed line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[SC-65784]

This PR adds `probs()` support to `lightning.tensor`. Additionally, this PR introduces a temporary approach that allows users and developers to call `appendMPSFinalize` multiple times. This capability is essential for `probs(obs)` as well as shot measurements.
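
For illustration, a minimal sketch of the user-facing workflow this enables, assuming PennyLane with the `lightning.tensor` device installed (the circuit and the `max_bond_dim` option shown are illustrative, not taken from this PR):

```python
import pennylane as qml

# Sketch only: exercises the new analytic probs() support on lightning.tensor.
# `max_bond_dim` is an illustrative device option.
dev = qml.device("lightning.tensor", wires=2, max_bond_dim=128)

@qml.qnode(dev)
def circuit():
    # Prepare a Bell state, then request analytic probabilities.
    qml.Hadamard(wires=0)
    qml.CNOT(wires=[0, 1])
    return qml.probs(wires=[0, 1])

# Expected output for the Bell state: [0.5, 0.0, 0.0, 0.5]
print(circuit())
```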

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
Co-authored-by: Lee James O'Riordan <mlxd@users.noreply.github.com>
Co-authored-by: Josh Izaac <josh146@gmail.com>
Co-authored-by: Lee J. O'Riordan <lee@xanadu.au>
Co-authored-by: Pietropaolo Frisoni <pietropaolo.frisoni@xanadu.ai>
Co-authored-by: erick-xanadu <110487834+erick-xanadu@users.noreply.github.com>
Co-authored-by: Shiro-Raven <exclass9.24@gmail.com>
Co-authored-by: albi3ro <chrissie.c.l@gmail.com>
11 people committed Sep 8, 2024
1 parent 2209a6b commit 4c04474
Showing 23 changed files with 647 additions and 76 deletions.
6 changes: 6 additions & 0 deletions .github/CHANGELOG.md
@@ -2,6 +2,9 @@

### New features since last release

* Add analytic `qml.probs()` measurement support to `lightning.tensor`.
[(#830)](https://github.com/PennyLaneAI/pennylane-lightning/pull/830)

* Add `qml.state()` measurement support to `lightning.tensor`.
[(#827)](https://github.com/PennyLaneAI/pennylane-lightning/pull/827)

@@ -30,6 +33,9 @@

### Improvements

* Multiple calls to the `append_mps_final_state()` API are now allowed in `lightning.tensor`.
[(#830)](https://github.com/PennyLaneAI/pennylane-lightning/pull/830)

* Update `generate_samples` in `LightningKokkos` and `LightningGPU` to support `qml.measurements.Shots` type instances.
[(#839)](https://github.com/PennyLaneAI/pennylane-lightning/pull/839)

2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.38.0-dev39"
__version__ = "0.38.0-dev40"
7 changes: 7 additions & 0 deletions pennylane_lightning/core/src/bindings/Bindings.hpp
@@ -749,6 +749,13 @@ void registerLightningTensorBackendAgnosticMeasurements(PyClass &pyclass) {
return M.expval(*ob);
},
"Expected value of an observable object.")
.def(
"probs",
[](MeasurementsT &M, const std::vector<std::size_t> &wires) {
return py::array_t<typename TensorNetT::PrecisionT>(
py::cast(M.probs(wires)));
},
"Probabilities of a set of wires.")
.def(
"var",
[](MeasurementsT &M, const std::shared_ptr<ObservableT> &ob) {
@@ -62,7 +62,6 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
using BaseType = TNCudaBase<Precision, MPSTNCuda>;

MPSStatus MPSInitialized_ = MPSStatus::MPSInitNotSet;
MPSStatus MPSFinalized_ = MPSStatus::MPSFinalizedNotSet;

const std::size_t maxBondDim_;

@@ -215,21 +214,18 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
*/
void append_mps_final_state(double cutoff = 0,
std::string cutoff_mode = "abs") {
if (MPSFinalized_ == MPSStatus::MPSFinalizedNotSet) {
MPSFinalized_ = MPSStatus::MPSFinalizedSet;
PL_CUTENSORNET_IS_SUCCESS(cutensornetStateFinalizeMPS(
/* const cutensornetHandle_t */ BaseType::getTNCudaHandle(),
/* cutensornetState_t */ BaseType::getQuantumState(),
/* cutensornetBoundaryCondition_t */
CUTENSORNET_BOUNDARY_CONDITION_OPEN,
/* const int64_t *const extentsOut[] */
getSitesExtentsPtr().data(),
/*strides=*/nullptr));
}
PL_CUTENSORNET_IS_SUCCESS(cutensornetStateFinalizeMPS(
/* const cutensornetHandle_t */ BaseType::getTNCudaHandle(),
/* cutensornetState_t */ BaseType::getQuantumState(),
/* cutensornetBoundaryCondition_t */
CUTENSORNET_BOUNDARY_CONDITION_OPEN,
/* const int64_t *const extentsOut[] */
getSitesExtentsPtr().data(),
/*strides=*/nullptr));

// Optional: SVD
cutensornetTensorSVDAlgo_t algo =
CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; // default
CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; // default option

PL_CUTENSORNET_IS_SUCCESS(cutensornetStateConfigure(
/* const cutensornetHandle_t */ BaseType::getTNCudaHandle(),
@@ -257,6 +253,21 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
BaseType::computeState(
const_cast<int64_t **>(getSitesExtentsPtr().data()),
reinterpret_cast<void **>(getTensorsOutDataPtr().data()));

// TODO: This is a dummy tensor update to allow multiple calls to the
// `append_mps_final_state` method as well as appending additional
// operations to the graph. This is a temporary solution and this line
// can be removed in the future when the `cutensornet` backend allows
// multiple calls to the `cutensornetStateFinalizeMPS` method. For more
// details, please see the `cutensornet` high-level API workflow logic
// [here]
// (https://docs.nvidia.com/cuda/cuquantum/latest/cutensornet/api/functions.html#high-level-tensor-network-api).
// In order to proceed with the following gate operations or
// measurements after calling the `cutensornetStateCompute()` API, we
// have to call the `cutensornetStateUpdateTensor()` API, which is
// wrapped inside the `dummy_tensor_update()` method.
//
BaseType::dummy_tensor_update();
}

/**
@@ -276,7 +287,7 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {

PL_ABORT_IF(log2(avail_gpu_memory) < BaseType::getNumQubits(),
"State tensor size exceeds the available GPU memory!");
this->get_state_tensor(res);
BaseType::get_state_tensor(res);
}

/**
@@ -60,6 +60,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
using ComplexT = std::complex<PrecisionT>;
using BaseType = TensornetBase<PrecisionT, Derived>;
SharedTNCudaHandle handle_;
SharedCublasCaller cublascaller_;
cudaDataType_t typeData_;
DevTag<int> dev_tag_;
cutensornetComputeType_t typeCompute_;
@@ -78,6 +79,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
explicit TNCudaBase(const std::size_t numQubits, int device_id = 0,
cudaStream_t stream_id = 0)
: BaseType(numQubits), handle_(make_shared_tncuda_handle()),
cublascaller_(make_shared_cublas_caller()),
dev_tag_({device_id, stream_id}),
gate_cache_(std::make_shared<TNCudaGateCache<PrecisionT>>(dev_tag_)) {
// TODO this code block could be moved to base class and need to revisit
@@ -108,7 +110,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
// runtime in the C++ layer
explicit TNCudaBase(const std::size_t numQubits, DevTag<int> dev_tag)
: BaseType(numQubits), handle_(make_shared_tncuda_handle()),
dev_tag_(dev_tag),
cublascaller_(make_shared_cublas_caller()), dev_tag_(dev_tag),
gate_cache_(std::make_shared<TNCudaGateCache<PrecisionT>>(dev_tag_)) {
// TODO this code block could be moved to the base class and needs to be
// revisited when working on the copy ctor
@@ -155,6 +157,15 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
return handle_.get();
}

/**
* @brief Access the CublasCaller the object is using.
*
* @return a reference to the object's CublasCaller object.
*/
auto getCublasCaller() const -> const CublasCaller & {
return *cublascaller_;
}

/**
* @brief Get the quantum state pointer.
*
@@ -299,7 +310,6 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
/* int32_t unitary*/ 1));
}

protected:
/**
* @brief Get the state vector representation of a tensor network.
*
@@ -323,7 +333,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
}

/**
* @brief Get a slice of the state tensor
* @brief Get a slice of the full state tensor
*
* @param tensor_data Pointer to the device memory for state tensor data.
* @param tensor_data_size Size of the state tensor data.
@@ -334,17 +344,61 @@
void get_state_tensor(CFP_t *tensor_data,
const std::size_t tensor_data_size,
const std::vector<std::size_t> &wires,
const int32_t numHyperSamples = 1) {
// NOTE: this is a solution to get the full state tensor
// TODO: projected_modes and projectedModeValues are to be updated for
// prob() support.
const int32_t numHyperSamples = 1) const {
auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
wires, BaseType::getNumQubits());

std::vector<int32_t> projected_modes{};

std::vector<int64_t> projectedModeValues{};
for (int32_t idx = 0;
idx < static_cast<int32_t>(BaseType::getNumQubits()); idx++) {
auto it = std::find(stateModes.begin(), stateModes.end(), idx);
if (it == stateModes.end()) {
projected_modes.emplace_back(idx);
}
}

std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);

if (projected_modes.empty()) {
get_accessor_(tensor_data, tensor_data_size, projected_modes,
projectedModeValues, numHyperSamples);
} else {
DataBuffer<CFP_t, int> tmp(tensor_data_size, getDevTag(), true);

const std::size_t projected_modes_size = size_t(1)
<< projected_modes.size();
for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
for (std::size_t j = 0; j < projected_modes.size(); j++) {
projectedModeValues[j] = (idx >> j) & 1;
}

get_accessor_(tmp.getData(), tensor_data_size, projected_modes,
projectedModeValues, numHyperSamples);
// Copy the data to the output tensor
scaleAndAddC_CUDA(std::complex<PrecisionT>{1.0, 0.0},
tmp.getData(), tensor_data, tmp.getLength(),
getDevTag().getDeviceID(),
getDevTag().getStreamID(), getCublasCaller());
}
}
}

private:
/**
* @brief Get an accessor to a state tensor.
*
* @param tensor_data Pointer to the device memory for state tensor data.
* @param tensor_data_size Size of the tensor data.
* @param projected_modes Projected modes to get the state tensor for.
* @param projectedModeValues Values of the projected modes.
* @param numHyperSamples Number of hyper samples to use in the calculation;
* set to 1 by default.
*/
void get_accessor_(CFP_t *tensor_data, const std::size_t tensor_data_size,
const std::vector<int32_t> &projected_modes,
const std::vector<int64_t> &projectedModeValues,
const int32_t numHyperSamples = 1) const {
cutensornetStateAccessor_t accessor;
PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateAccessor(
/* const cutensornetHandle_t */ getTNCudaHandle(),
Expand Down Expand Up @@ -415,17 +469,40 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {

CFP_t scale_scalar_cu{scale_scalar.real(), scale_scalar.imag()};

SharedCublasCaller cublascaller = make_shared_cublas_caller();

scaleC_CUDA<CFP_t, CFP_t>(scale_scalar_cu, tensor_data,
tensor_data_size, getDevTag().getDeviceID(),
getDevTag().getStreamID(), *cublascaller);
getDevTag().getStreamID(), getCublasCaller());

PL_CUTENSORNET_IS_SUCCESS(
cutensornetDestroyWorkspaceDescriptor(workDesc));
PL_CUTENSORNET_IS_SUCCESS(cutensornetDestroyAccessor(accessor));
}

protected:
/**
* @brief Dummy tensor operator update to allow multiple calls to
* appendMPSFinalize. This is a workaround for the `cutensornet` library
* not allowing multiple calls to appendMPSFinalize.
*
* This function appends a new `Identity` gate to the graph when the gate
* cache is empty; otherwise it updates the existing head gate operator in
* place with its own data.
*/
void dummy_tensor_update() {
if (gate_cache_->is_empty()) {
applyOperation("Identity", {0}, false);
}

const std::size_t id = gate_cache_->get_cache_head_idx();

PL_CUTENSORNET_IS_SUCCESS(cutensornetStateUpdateTensorOperator(
/* const cutensornetHandle_t */ getTNCudaHandle(),
/* cutensornetState_t */ getQuantumState(),
/* int64_t tensorId*/ static_cast<int64_t>(id),
/* void* */
static_cast<void *>(gate_cache_->get_gate_device_ptr(id)),
/* int32_t unitary*/ 1));
}

/**
* @brief Save quantumState information to data provided by a user
*
@@ -157,6 +157,22 @@ template <class PrecisionT> class TNCudaGateCache {
return device_gates_.at(gate_id).second.getDataBuffer().getData();
}

/**
* @brief Returns the key (index of the gate) of the first element in
* `device_gates_`.
*
* @return std::size_t Key of the first element in `device_gates_`.
*/
auto get_cache_head_idx() const -> std::size_t {
auto it = device_gates_.begin();
return it->first;
}

/**
* @brief Returns whether `device_gates_` is empty.
*/
auto is_empty() const -> bool { return device_gates_.empty(); }

private:
const DevTag<int> device_tag_;
std::size_t total_alloc_bytes_;
@@ -74,7 +74,17 @@ TEMPLATE_TEST_CASE("MPSTNCuda::Gates::Hadamard", "[MPSTNCuda_Nonparam]", float,
const std::size_t index = GENERATE(0, 1, 2);
MPSTNCuda<TestType> mps_state{num_qubits, maxExtent, dev_tag};

mps_state.append_mps_final_state();

mps_state.applyOperation("Hadamard", {index}, inverse);

mps_state.append_mps_final_state();

mps_state.applyOperation("Identity", {index}, inverse);

// Test that the final state can be appended multiple times
mps_state.append_mps_final_state();

cp_t expected(1.0 / std::sqrt(2), 0);

auto results = mps_state.getDataVector();
@@ -1,12 +1,19 @@
cmake_minimum_required(VERSION 3.20)

project(${PL_BACKEND}_measurements LANGUAGES CXX)
project(${PL_BACKEND}_measurements LANGUAGES CXX C CUDA)

add_library(${PL_BACKEND}_measurements INTERFACE)
if(NOT DEFINED CMAKE_CUDA20_STANDARD_COMPILE_OPTION)
set(CMAKE_CUDA20_STANDARD_COMPILE_OPTION "")
set(CMAKE_CUDA20_EXTENSION_COMPILE_OPTION "")
endif()

set(LTENSOR_MPS_FILES cuda_kernels_measures.cu CACHE INTERNAL "" FORCE)

add_library(${PL_BACKEND}_measurements STATIC ${LTENSOR_MPS_FILES})

target_include_directories(${PL_BACKEND}_measurements INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})

target_link_libraries(${PL_BACKEND}_measurements INTERFACE lightning_compile_options
target_link_libraries(${PL_BACKEND}_measurements PUBLIC lightning_compile_options
lightning_external_libs
${PL_BACKEND}
${PL_BACKEND}_utils
