From 03aafa23f80bf6c050292ee4b6d244eabe71bc2a Mon Sep 17 00:00:00 2001
From: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
Date: Tue, 20 Aug 2024 23:33:14 -0400
Subject: [PATCH 1/2] Update generate_samples in LK and LGPU to support
 qml.measurements.Shots (#839)

**Context:**
PR https://github.com/PennyLaneAI/pennylane/pull/6046 wraps the legacy
device API automatically in various device creation, qnode, and execute
functions. As LK and LGPU plugins still rely on the legacy device API,
the shots tests and the `generate_samples` logic in
`lightning_kokkos.py` and `lightning_gpu.py` should be updated to adhere
to the new convention.

**Related Shortcut Stories:**
[sc-65998]

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Co-authored-by: Shiro-Raven <exclass9.24@gmail.com>
Co-authored-by: albi3ro <chrissie.c.l@gmail.com>
---
 .github/CHANGELOG.md                          |   5 +-
 pennylane_lightning/core/_version.py          |   2 +-
 .../lightning_gpu/lightning_gpu.py            |   7 +-
 .../lightning_kokkos/lightning_kokkos.py      |   3 +
 .../test_measurements_samples_MCMC.py         |  21 +---
 .../lightning_tensor/test_tensornet_class.py  |   4 +-
 tests/test_adjoint_jacobian.py                |  17 ++-
 tests/test_apply.py                           | 101 ++++--------------
 tests/test_expval.py                          |  90 +++-------------
 tests/test_gates.py                           |   2 +-
 tests/test_measurements.py                    |  26 ++---
 tests/test_native_mcm.py                      |   3 +-
 tests/test_templates.py                       |   4 +-
 tests/test_var.py                             |  35 ++----
 14 files changed, 82 insertions(+), 238 deletions(-)

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 7df9853113..298469052a 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -27,6 +27,9 @@
 
 ### Improvements
 
+* Update `generate_samples` in `LightningKokkos` and `LightningGPU` to support `qml.measurements.Shots` type instances.
+  [(#839)](https://github.com/PennyLaneAI/pennylane-lightning/pull/839)
+
 * LightningQubit gains native support for the `PauliRot` gate.
   [(#834)](https://github.com/PennyLaneAI/pennylane-lightning/pull/834)
   
@@ -136,7 +139,7 @@
 
 This release contains contributions from (in alphabetical order):
 
-Ali Asadi, Astral Cai, Amintor Dusko, Vincent Michaud-Rioux, Erick Ochoa Lopez, Lee J. O'Riordan, Mudit Pandey, Shuli Shu, Raul Torres, Paul Haochen Wang
+Ali Asadi, Astral Cai, Ahmed Darwish, Amintor Dusko, Vincent Michaud-Rioux, Erick Ochoa Lopez, Lee J. O'Riordan, Mudit Pandey, Shuli Shu, Raul Torres, Paul Haochen Wang
 
 ---
 
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index 358f5c7cc7..b4c1ef4399 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.38.0-dev38"
+__version__ = "0.38.0-dev39"
diff --git a/pennylane_lightning/lightning_gpu/lightning_gpu.py b/pennylane_lightning/lightning_gpu/lightning_gpu.py
index af05d868fc..117e9840de 100644
--- a/pennylane_lightning/lightning_gpu/lightning_gpu.py
+++ b/pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -38,7 +38,6 @@
 from pennylane_lightning.core.lightning_base import LightningBase
 
 try:
-
     from pennylane_lightning.lightning_gpu_ops import (
         DevPool,
         MeasurementsC64,
@@ -818,9 +817,9 @@ def generate_samples(self):
             array[int]: array of samples in binary representation with shape
             ``(dev.shots, dev.num_wires)``
         """
-        return self.measurements.generate_samples(len(self.wires), self.shots).astype(
-            int, copy=False
-        )
+        shots = self.shots if isinstance(self.shots, int) else self.shots.total_shots
+
+        return self.measurements.generate_samples(len(self.wires), shots).astype(int, copy=False)
 
     # pylint: disable=protected-access
     def expval(self, observable, shot_range=None, bin_size=None):
diff --git a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
index 6fea7c1628..5ea499702a 100644
--- a/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
+++ b/pennylane_lightning/lightning_kokkos/lightning_kokkos.py
@@ -614,6 +614,9 @@ def generate_samples(self, shots=None):
             ``(dev.shots, dev.num_wires)``
         """
         shots = self.shots if shots is None else shots
+
+        shots = shots.total_shots if isinstance(shots, qml.measurements.Shots) else shots
+
         measure = (
             MeasurementsC64(self._kokkos_state)
             if self.use_csingle
diff --git a/tests/lightning_qubit/test_measurements_samples_MCMC.py b/tests/lightning_qubit/test_measurements_samples_MCMC.py
index 562c705dbe..fd49f84d8f 100644
--- a/tests/lightning_qubit/test_measurements_samples_MCMC.py
+++ b/tests/lightning_qubit/test_measurements_samples_MCMC.py
@@ -46,15 +46,8 @@ def test_mcmc_sample_dimensions(self, dev, num_shots, measured_wires, operation,
         the correct dimensions
         """
         ops = [qml.RX(1.5708, wires=[0]), qml.RX(1.5708, wires=[1])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.sample(op=operation)], shots=num_shots)
-            s1 = dev.execute(tape)
-        else:
-            dev.apply(ops)
-            dev.shots = num_shots
-            dev._wires_measured = measured_wires
-            dev._samples = dev.generate_samples()
-            s1 = dev.sample(operation)
+        tape = qml.tape.QuantumScript(ops, [qml.sample(op=operation)], shots=num_shots)
+        s1 = dev.execute(tape)
 
         assert np.array_equal(s1.shape, (shape,))
 
@@ -67,14 +60,8 @@ def test_sample_values(self, tol, kernel):
             device_name, wires=2, shots=1000, mcmc=True, kernel_name=kernel, num_burnin=100
         )
         ops = [qml.RX(1.5708, wires=[0])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.sample(op=qml.PauliZ(0))], shots=1000)
-            s1 = dev.execute(tape)
-        else:
-            dev.apply([qml.RX(1.5708, wires=[0])])
-            dev._wires_measured = {0}
-            dev._samples = dev.generate_samples()
-            s1 = dev.sample(qml.PauliZ(0))
+        tape = qml.tape.QuantumScript(ops, [qml.sample(op=qml.PauliZ(0))], shots=1000)
+        s1 = dev.execute(tape)
 
         # s1 should only contain 1 and -1, which is guaranteed if
         # they square to 1
diff --git a/tests/lightning_tensor/test_tensornet_class.py b/tests/lightning_tensor/test_tensornet_class.py
index e25e5336b1..c5af8a4af2 100644
--- a/tests/lightning_tensor/test_tensornet_class.py
+++ b/tests/lightning_tensor/test_tensornet_class.py
@@ -21,7 +21,6 @@
 import pennylane as qml
 import pytest
 from conftest import LightningDevice, device_name  # tested device
-from pennylane import DeviceError
 from pennylane.wires import Wires
 
 if device_name != "lightning.tensor":
@@ -88,6 +87,7 @@ def test_errors_apply_operation_state_preparation(operation, par):
     tensornet = LightningTensorNet(wires, bondDims)
 
     with pytest.raises(
-        DeviceError, match="lightning.tensor does not support initialization with a state vector."
+        qml.DeviceError,
+        match="lightning.tensor does not support initialization with a state vector.",
     ):
         tensornet.apply_operations([operation(np.array(par), Wires(range(wires)))])
diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py
index b2cd0da685..5d5a9115b1 100644
--- a/tests/test_adjoint_jacobian.py
+++ b/tests/test_adjoint_jacobian.py
@@ -700,13 +700,13 @@ def dev(self, request):
         return qml.device(device_name, wires=2, c_dtype=request.param)
 
     @pytest.mark.skipif(ld._new_API, reason="Old API required")
-    def test_finite_shots_warning(self):
-        """Tests that a warning is raised when computing the adjoint diff on a device with finite shots"""
+    def test_finite_shots_error(self):
+        """Tests that an error is raised when computing the adjoint diff on a device with finite shots"""
 
         dev = qml.device(device_name, wires=1, shots=1)
 
-        with pytest.warns(
-            UserWarning, match="Requested adjoint differentiation to be computed with finite shots."
+        with pytest.raises(
+            qml.QuantumFunctionError, match="does not support adjoint with requested circuit."
         ):
 
             @qml.qnode(dev, diff_method="adjoint")
@@ -714,9 +714,6 @@ def circ(x):
                 qml.RX(x, wires=0)
                 return qml.expval(qml.PauliZ(0))
 
-        with pytest.warns(
-            UserWarning, match="Requested adjoint differentiation to be computed with finite shots."
-        ):
             qml.grad(circ)(0.1)
 
     def test_qnode(self, mocker, dev):
@@ -741,7 +738,7 @@ def circuit(x, y, z):
         spy = (
             mocker.spy(dev, "execute_and_compute_derivatives")
             if ld._new_API
-            else mocker.spy(dev, "adjoint_jacobian")
+            else mocker.spy(dev.target_device, "adjoint_jacobian")
         )
         tol, h = get_tolerance_and_stepsize(dev, step_size=True)
 
@@ -926,7 +923,7 @@ def cost(p1, p2):
         if ld._new_API:
             spy = mocker.spy(dev, "execute_and_compute_derivatives")
         else:
-            spy = mocker.spy(dev, "adjoint_jacobian")
+            spy = mocker.spy(dev.target_device, "adjoint_jacobian")
 
         # analytic gradient
         grad_fn = qml.grad(cost)
@@ -968,7 +965,7 @@ def circuit(params):
         spy_analytic = (
             mocker.spy(dev, "execute_and_compute_derivatives")
             if ld._new_API
-            else mocker.spy(dev, "adjoint_jacobian")
+            else mocker.spy(dev.target_device, "adjoint_jacobian")
         )
         tol, h = get_tolerance_and_stepsize(dev, step_size=True)
 
diff --git a/tests/test_apply.py b/tests/test_apply.py
index f12de8635a..cf329fce72 100644
--- a/tests/test_apply.py
+++ b/tests/test_apply.py
@@ -566,13 +566,8 @@ def test_expval_single_wire_no_parameters(
         dev = qubit_device(wires=1)
         obs = operation(wires=[0])
         ops = [stateprep(np.array(input), wires=[0])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(op=obs)])
-            res = dev.execute(tape)
-        else:
-            dev.reset()
-            dev.apply(ops, obs.diagonalizing_gates())
-            res = dev.expval(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.expval(op=obs)])
+        res = dev.execute(tape)
 
         assert np.isclose(res, expected_output, atol=tol, rtol=0)
 
@@ -630,13 +625,8 @@ def test_var_single_wire_no_parameters(
         dev = qubit_device(wires=1)
         obs = operation(wires=[0])
         ops = [stateprep(np.array(input), wires=[0])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.var(op=obs)])
-            res = dev.execute(tape)
-        else:
-            dev.reset()
-            dev.apply(ops, obs.diagonalizing_gates())
-            res = dev.var(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.var(op=obs)])
+        res = dev.execute(tape)
 
         assert np.isclose(res, expected_output, atol=tol, rtol=0)
 
@@ -680,42 +670,22 @@ def test_sample_dimensions(self, qubit_device):
 
         shots = 10
         obs = qml.PauliZ(wires=[0])
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
-            s1 = dev.execute(tape)
-        else:
-            dev.reset()
-            dev.apply(ops)
-            dev.shots = shots
-            dev._wires_measured = {0}
-            dev._samples = dev.generate_samples()
-            s1 = dev.sample(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
+        s1 = dev.execute(tape)
+
         assert np.array_equal(s1.shape, (shots,))
 
         shots = 12
         obs = qml.PauliZ(wires=[1])
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
-            s2 = dev.execute(tape)
-        else:
-            dev.reset()
-            dev.shots = shots
-            dev._wires_measured = {1}
-            dev._samples = dev.generate_samples()
-            s2 = dev.sample(qml.PauliZ(wires=[1]))
+        tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
+        s2 = dev.execute(tape)
         assert np.array_equal(s2.shape, (shots,))
 
         shots = 17
         obs = qml.PauliX(0) @ qml.PauliZ(1)
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
-            s3 = dev.execute(tape)
-        else:
-            dev.reset()
-            dev.shots = shots
-            dev._wires_measured = {0, 1}
-            dev._samples = dev.generate_samples()
-            s3 = dev.sample(qml.PauliZ(wires=[1]))
+        tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
+        s3 = dev.execute(tape)
+
         assert np.array_equal(s3.shape, (shots,))
 
     def test_sample_values(self, qubit_device, tol):
@@ -730,18 +700,10 @@ def test_sample_values(self, qubit_device, tol):
 
         ops = [qml.RX(1.5708, wires=[0])]
 
-        shots = 1000
+        shots = qml.measurements.Shots(1000)
         obs = qml.PauliZ(0)
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
-            s1 = dev.execute(tape)
-        else:
-            dev.reset()
-            dev.apply(ops)
-            dev.shots = shots
-            dev._wires_measured = {0}
-            dev._samples = dev.generate_samples()
-            s1 = dev.sample(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
+        s1 = dev.execute(tape)
 
         # s1 should only contain 1 and -1, which is guaranteed if
         # they square to 1
@@ -756,13 +718,8 @@ def test_load_default_qubit_device(self):
         """Test that the default plugin loads correctly"""
 
         dev = qml.device(device_name, wires=2)
-        if dev._new_API:
-            assert not dev.shots
-            assert len(dev.wires) == 2
-        else:
-            assert dev.shots is None
-            assert dev.num_wires == 2
-            assert dev.short_name == device_name
+        assert not dev.shots
+        assert len(dev.wires) == 2
 
     @pytest.mark.xfail(ld._new_API, reason="Old device API required.")
     def test_no_backprop(self):
@@ -1276,14 +1233,10 @@ def test_multi_samples_return_correlated_results(self, qubit_device):
         def circuit():
             qml.Hadamard(0)
             qml.CNOT(wires=[0, 1])
-            if ld._new_API:
-                return qml.sample(wires=[0, 1])
-            else:
-                return qml.sample(qml.PauliZ(0)), qml.sample(qml.PauliZ(1))
+            return qml.sample(wires=[0, 1])
 
         outcomes = circuit()
-        if ld._new_API:
-            outcomes = outcomes.T
+        outcomes = outcomes.T
 
         assert np.array_equal(outcomes[0], outcomes[1])
 
@@ -1305,14 +1258,10 @@ def test_multi_samples_return_correlated_results_more_wires_than_size_of_observa
         def circuit():
             qml.Hadamard(0)
             qml.CNOT(wires=[0, 1])
-            if ld._new_API:
-                return qml.sample(wires=[0, 1])
-            else:
-                return qml.sample(qml.PauliZ(0)), qml.sample(qml.PauliZ(1))
+            return qml.sample(wires=[0, 1])
 
         outcomes = circuit()
-        if ld._new_API:
-            outcomes = outcomes.T
+        outcomes = outcomes.T
 
         assert np.array_equal(outcomes[0], outcomes[1])
 
@@ -1350,14 +1299,10 @@ def circuit():
             qml.Snapshot()
             qml.adjoint(qml.Snapshot())
             qml.CNOT(wires=[0, 1])
-            if ld._new_API:
-                return qml.sample(wires=[0, 1])
-            else:
-                return qml.sample(qml.PauliZ(0)), qml.sample(qml.PauliZ(1))
+            return qml.sample(wires=[0, 1])
 
         outcomes = circuit()
-        if ld._new_API:
-            outcomes = outcomes.T
+        outcomes = outcomes.T
 
         assert np.array_equal(outcomes[0], outcomes[1])
 
diff --git a/tests/test_expval.py b/tests/test_expval.py
index 0c48a76a67..5fb6e7fd2a 100644
--- a/tests/test_expval.py
+++ b/tests/test_expval.py
@@ -38,15 +38,8 @@ def test_identity_expectation(self, theta, phi, qubit_device, tol):
         O1 = qml.Identity(wires=[0])
         O2 = qml.Identity(wires=[1])
         ops = [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
-            res = dev.execute(tape)
-        else:
-            dev.apply(
-                ops,
-                rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-            )
-            res = np.array([dev.expval(O1), dev.expval(O2)])
+        tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
+        res = dev.execute(tape)
         assert np.allclose(res, np.array([1, 1]), tol)
 
     def test_pauliz_expectation(self, theta, phi, qubit_device, tol):
@@ -56,16 +49,8 @@ def test_pauliz_expectation(self, theta, phi, qubit_device, tol):
         O1 = qml.PauliZ(wires=[0])
         O2 = qml.PauliZ(wires=[1])
         ops = [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
-            res = dev.execute(tape)
-        else:
-            dev.apply(
-                ops,
-                rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-            )
-
-            res = np.array([dev.expval(O1), dev.expval(O2)])
+        tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
+        res = dev.execute(tape)
         assert np.allclose(res, np.array([np.cos(theta), np.cos(theta) * np.cos(phi)]), tol)
 
     def test_paulix_expectation(self, theta, phi, qubit_device, tol):
@@ -75,17 +60,9 @@ def test_paulix_expectation(self, theta, phi, qubit_device, tol):
         O1 = qml.PauliX(wires=[0])
         O2 = qml.PauliX(wires=[1])
         ops = [qml.RY(theta, wires=[0]), qml.RY(phi, wires=[1]), qml.CNOT(wires=[0, 1])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
-            res = dev.execute(tape)
+        tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
+        res = dev.execute(tape)
 
-        else:
-            dev.apply(
-                ops,
-                rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-            )
-
-            res = np.array([dev.expval(O1), dev.expval(O2)], dtype=dev.C_DTYPE)
         assert np.allclose(
             res,
             np.array([np.sin(theta) * np.sin(phi), np.sin(phi)], dtype=dev.dtype),
@@ -99,17 +76,9 @@ def test_pauliy_expectation(self, theta, phi, qubit_device, tol):
         O1 = qml.PauliY(wires=[0])
         O2 = qml.PauliY(wires=[1])
         ops = [qml.RX(theta, wires=[0]), qml.RX(phi, wires=[1]), qml.CNOT(wires=[0, 1])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
-            res = dev.execute(tape)
-
-        else:
-            dev.apply(
-                ops,
-                rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-            )
+        tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
+        res = dev.execute(tape)
 
-            res = np.array([dev.expval(O1), dev.expval(O2)])
         assert np.allclose(res, np.array([0, -np.cos(theta) * np.sin(phi)]), tol)
 
     def test_hadamard_expectation(self, theta, phi, qubit_device, tol):
@@ -119,17 +88,9 @@ def test_hadamard_expectation(self, theta, phi, qubit_device, tol):
         O1 = qml.Hadamard(wires=[0])
         O2 = qml.Hadamard(wires=[1])
         ops = [qml.RY(theta, wires=[0]), qml.RY(phi, wires=[1]), qml.CNOT(wires=[0, 1])]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
-            res = dev.execute(tape)
+        tape = qml.tape.QuantumScript(ops, [qml.expval(O1), qml.expval(O2)])
+        res = dev.execute(tape)
 
-        else:
-            dev.apply(
-                ops,
-                rotations=[*O1.diagonalizing_gates(), *O2.diagonalizing_gates()],
-            )
-
-            res = np.array([dev.expval(O1), dev.expval(O2)])
         expected = np.array(
             [np.sin(theta) * np.sin(phi) + np.cos(theta), np.cos(theta) * np.cos(phi) + np.sin(phi)]
         ) / np.sqrt(2)
@@ -322,12 +283,8 @@ def test_paulix_pauliy(self, theta, phi, varphi, qubit_device, tol):
             qml.CNOT(wires=[0, 1]),
             qml.CNOT(wires=[1, 2]),
         ]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(op=obs)])
-            res = dev.execute(tape)
-        else:
-            dev.apply(ops, rotations=obs.diagonalizing_gates())
-            res = dev.expval(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.expval(op=obs)])
+        res = dev.execute(tape)
 
         expected = np.sin(theta) * np.sin(phi) * np.sin(varphi)
 
@@ -345,16 +302,8 @@ def test_pauliz_identity(self, theta, phi, varphi, qubit_device, tol):
             qml.CNOT(wires=[0, 1]),
             qml.CNOT(wires=[1, 2]),
         ]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(op=obs)])
-            res = dev.execute(tape)
-        else:
-            dev.apply(
-                ops,
-                rotations=obs.diagonalizing_gates(),
-            )
-
-            res = dev.expval(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.expval(op=obs)])
+        res = dev.execute(tape)
 
         expected = np.cos(varphi) * np.cos(phi)
 
@@ -372,15 +321,8 @@ def test_pauliz_hadamard_pauliy(self, theta, phi, varphi, qubit_device, tol):
             qml.CNOT(wires=[0, 1]),
             qml.CNOT(wires=[1, 2]),
         ]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.expval(op=obs)])
-            res = dev.execute(tape)
-        else:
-            dev.apply(
-                ops,
-                rotations=obs.diagonalizing_gates(),
-            )
-            res = dev.expval(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.expval(op=obs)])
+        res = dev.execute(tape)
         expected = -(np.cos(varphi) * np.sin(phi) + np.sin(varphi) * np.cos(theta)) / np.sqrt(2)
 
         assert np.allclose(res, expected, tol)
diff --git a/tests/test_gates.py b/tests/test_gates.py
index 36893a056c..fa120b820c 100644
--- a/tests/test_gates.py
+++ b/tests/test_gates.py
@@ -380,7 +380,7 @@ def test_state_prep(n_targets, tol):
             [qml.state()],
         )
         ref = dq.execute([tape])[0]
-        res = dev.execute([tape])[0] if ld._new_API else dev.execute(tape)
+        res = dev.execute([tape])[0]
         assert np.allclose(res.ravel(), ref.ravel(), tol)
 
 
diff --git a/tests/test_measurements.py b/tests/test_measurements.py
index 68f0e279de..c2bf5d8307 100644
--- a/tests/test_measurements.py
+++ b/tests/test_measurements.py
@@ -393,7 +393,7 @@ def circuit():
             qml.RX(0.52, wires=0)
             return qml.expval(qml.RX(0.742, wires=[0]))
 
-        with pytest.raises(qml._device.DeviceError, match="Observable RX.*not supported"):
+        with pytest.raises(qml.DeviceError, match="Observable RX.*not supported"):
             circuit()
 
     def test_observable_return_type_is_expectation(self, dev):
@@ -494,7 +494,7 @@ def circuit():
             qml.RX(0.52, wires=0)
             return qml.var(qml.RX(0.742, wires=[0]))
 
-        with pytest.raises(qml._device.DeviceError, match="Observable RX.*not supported"):
+        with pytest.raises(qml.DeviceError, match="Observable RX.*not supported"):
             circuit()
 
     def test_observable_return_type_is_variance(self, dev):
@@ -523,7 +523,7 @@ def circuit():
             qml.RX(0.52, wires=0)
             return qml.var(qml.RX(0.742, wires=[0]))
 
-        with pytest.raises(qml._device.DeviceError, match="Observable RX.*not supported"):
+        with pytest.raises(qml.DeviceError, match="Observable RX.*not supported"):
             circuit()
 
 
@@ -651,14 +651,8 @@ def test_sample_dimensions(self, qubit_device, shots, wires):
         dev = qubit_device(wires=2, shots=shots)
         ops = [qml.RX(1.5708, wires=[0]), qml.RX(1.5708, wires=[1])]
         obs = qml.PauliZ(wires=[0])
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
-            s1 = dev.execute(tape)
-        else:
-            dev.apply(ops)
-            dev._wires_measured = wires
-            dev._samples = dev.generate_samples()
-            s1 = dev.sample(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
+        s1 = dev.execute(tape)
         assert np.array_equal(s1.shape, (shots,))
 
     def test_sample_values(self, qubit_device, tol):
@@ -669,14 +663,8 @@ def test_sample_values(self, qubit_device, tol):
         dev = qubit_device(wires=2, shots=shots)
         ops = [qml.RX(1.5708, wires=[0])]
         obs = qml.PauliZ(0)
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
-            s1 = dev.execute(tape)
-        else:
-            dev.apply(ops)
-            dev._wires_measured = {0}
-            dev._samples = dev.generate_samples()
-            s1 = dev.sample(qml.PauliZ(0))
+        tape = qml.tape.QuantumScript(ops, [qml.sample(op=obs)], shots=shots)
+        s1 = dev.execute(tape)
 
         # s1 should only contain 1 and -1, which is guaranteed if
         # they square to 1
diff --git a/tests/test_native_mcm.py b/tests/test_native_mcm.py
index f7b9c030fc..4ca3b66b74 100644
--- a/tests/test_native_mcm.py
+++ b/tests/test_native_mcm.py
@@ -20,7 +20,6 @@
 import pytest
 from conftest import LightningDevice, device_name, validate_measurements
 from flaky import flaky
-from pennylane._device import DeviceError
 
 if device_name not in ("lightning.qubit", "lightning.kokkos"):
     pytest.skip("Native MCM not supported. Skipping.", allow_module_level=True)
@@ -86,7 +85,7 @@ def func(x, y):
 
     if device_name == "lightning.qubit":
         with pytest.raises(
-            DeviceError,
+            qml.DeviceError,
             match=f"not accepted with finite shots on lightning.qubit",
         ):
             func(*params)
diff --git a/tests/test_templates.py b/tests/test_templates.py
index d97d1a29a3..4774a12496 100644
--- a/tests/test_templates.py
+++ b/tests/test_templates.py
@@ -176,7 +176,7 @@ def circuit(feature_vector):
 
         X = np.arange(1, n_qubits + 1)
 
-        with pytest.raises(qml._device.DeviceError, match="not supported"):
+        with pytest.raises(qml.DeviceError, match="not supported"):
             _ = qml.QNode(circuit, dev, diff_method=None)(X)
 
 
@@ -242,7 +242,7 @@ def circuit(weights):
         shapes = qml.CVNeuralNetLayers.shape(n_layers=2, n_wires=n_qubits)
         weights = [np.random.random(shape) for shape in shapes]
 
-        with pytest.raises(qml._device.DeviceError, match="not supported"):
+        with pytest.raises(qml.DeviceError, match="not supported"):
             _ = qml.QNode(circuit, dev, diff_method=None)(weights)
 
 
diff --git a/tests/test_var.py b/tests/test_var.py
index 16b7ee1f3e..f2675fa2dc 100644
--- a/tests/test_var.py
+++ b/tests/test_var.py
@@ -42,15 +42,10 @@ def test_var(self, theta, phi, qubit_device, tol):
             qml.RX(phi, wires=[0]),
             qml.RY(theta, wires=[0]),
         ]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.var(op=obs)])
-            var = dev.execute(tape)
-        else:
-            dev.apply(
-                ops,
-                rotations=[*obs.diagonalizing_gates()],
-            )
-            var = dev.var(obs)
+
+        tape = qml.tape.QuantumScript(ops, [qml.var(op=obs)])
+        var = dev.execute(tape)
+
         expected = 0.25 * (3 - np.cos(2 * theta) - 2 * np.cos(theta) ** 2 * np.cos(2 * phi))
 
         assert np.allclose(var, expected, tol)
@@ -99,15 +94,8 @@ def test_paulix_pauliy(self, theta, phi, varphi, qubit_device, tol):
             qml.CNOT(wires=[0, 1]),
             qml.CNOT(wires=[1, 2]),
         ]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.var(op=obs)])
-            res = dev.execute(tape)
-        else:
-            dev.apply(
-                ops,
-                rotations=obs.diagonalizing_gates(),
-            )
-            res = dev.var(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.var(op=obs)])
+        res = dev.execute(tape)
 
         expected = (
             8 * np.sin(theta) ** 2 * np.cos(2 * varphi) * np.sin(phi) ** 2
@@ -131,15 +119,8 @@ def test_pauliz_hadamard_pauliy(self, theta, phi, varphi, qubit_device, tol):
             qml.CNOT(wires=[0, 1]),
             qml.CNOT(wires=[1, 2]),
         ]
-        if ld._new_API:
-            tape = qml.tape.QuantumScript(ops, [qml.var(op=obs)])
-            res = dev.execute(tape)
-        else:
-            dev.apply(
-                ops,
-                rotations=obs.diagonalizing_gates(),
-            )
-            res = dev.var(obs)
+        tape = qml.tape.QuantumScript(ops, [qml.var(op=obs)])
+        res = dev.execute(tape)
 
         expected = (
             3

From b387cfc9aff88b6d4fafc2629950954f1e35b605 Mon Sep 17 00:00:00 2001
From: Shuli Shu <31480676+multiphaseCFD@users.noreply.github.com>
Date: Wed, 21 Aug 2024 10:13:39 -0400
Subject: [PATCH 2/2] Add `probs()` support to `lightning.tensor` (#830)

### Before submitting

Please complete the following checklist when submitting a PR:

- [ ] All new features must include a unit test.
If you've fixed a bug or added code that should be tested, add a test to
the
      [`tests`](../tests) directory!

- [ ] All new functions and code must be clearly commented and
documented.
If you do make documentation changes, make sure that the docs build and
      render correctly by running `make docs`.

- [ ] Ensure that the test suite passes, by running `make test`.

- [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing
the
      change, and including a link back to the PR.

- [x] Ensure that code is properly formatted by running `make format`.

When all the above are checked, delete everything above the dashed
line and fill in the pull request template.


------------------------------------------------------------------------------------------------------------

**Context:**

[SC-65784]

This PR adds `probs()` support to `lightning.tensor`. Additionally, this
PR introduces a temporary approach that allows users/developers to call
`appendMPSFinalize` multiple times. This feature is essential for
`probs(obs)` as well as `shot measurement`.

**Description of the Change:**

**Benefits:**

**Possible Drawbacks:**

**Related GitHub Issues:**

---------

Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai>
Co-authored-by: Vincent Michaud-Rioux <vincentm@nanoacademic.com>
Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
Co-authored-by: Lee James O'Riordan <mlxd@users.noreply.github.com>
Co-authored-by: Josh Izaac <josh146@gmail.com>
Co-authored-by: Lee J. O'Riordan <lee@xanadu.au>
Co-authored-by: Pietropaolo Frisoni <pietropaolo.frisoni@xanadu.ai>
Co-authored-by: erick-xanadu <110487834+erick-xanadu@users.noreply.github.com>
Co-authored-by: Shiro-Raven <exclass9.24@gmail.com>
Co-authored-by: albi3ro <chrissie.c.l@gmail.com>
---
 .github/CHANGELOG.md                          |   6 +
 pennylane_lightning/core/_version.py          |   2 +-
 .../core/src/bindings/Bindings.hpp            |   7 +
 .../lightning_tensor/tncuda/MPSTNCuda.hpp     |  39 +++--
 .../lightning_tensor/tncuda/TNCudaBase.hpp    |  99 +++++++++--
 .../tncuda/gates/TNCudaGateCache.hpp          |  16 ++
 .../gates/tests/Test_MPSTNCuda_NonParam.cpp   |  10 ++
 .../tncuda/measurements/CMakeLists.txt        |  13 +-
 .../measurements/MeasurementsTNCuda.hpp       | 108 +++++++++++-
 .../measurements/cuda_kernels_measures.cu     | 161 ++++++++++++++++++
 .../tncuda/measurements/tests/CMakeLists.txt  |   3 +-
 .../tests/Test_MPSTNCuda_Measure.cpp          |  94 ++++++++++
 .../observables/ObservablesTNCudaOperator.hpp |  12 +-
 .../core/src/utils/cuda_utils/LinearAlg.hpp   |  25 +++
 .../cuda_utils/tests/Test_LinearAlgebra.cpp   |  23 ++-
 .../lightning_tensor/_measurements.py         |  36 +++-
 .../lightning_tensor/_tensornet.py            |  10 +-
 tests/new_api/test_device.py                  |  10 +-
 tests/test_apply.py                           |   4 +-
 tests/test_comparison.py                      |  10 +-
 tests/test_expval.py                          |  15 +-
 tests/test_measurements.py                    |   8 +-
 tests/test_templates.py                       |  12 +-
 23 files changed, 647 insertions(+), 76 deletions(-)
 create mode 100644 pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/cuda_kernels_measures.cu
 create mode 100644 pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp

diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
index 298469052a..bb485719ca 100644
--- a/.github/CHANGELOG.md
+++ b/.github/CHANGELOG.md
@@ -2,6 +2,9 @@
 
 ### New features since last release
 
+* Add the analytic `qml.probs()` measurement support to `lightning.tensor`.
+  [(#830)](https://github.com/PennyLaneAI/pennylane-lightning/pull/830)
+
 * Add `qml.state()` measurement support to `lightning.tensor`.
   [(#827)](https://github.com/PennyLaneAI/pennylane-lightning/pull/827)
 
@@ -27,6 +30,9 @@
 
 ### Improvements
 
+* Multiple calls to the `append_mps_final_state()` API are allowed in `lightning.tensor`.
+  [(#830)](https://github.com/PennyLaneAI/pennylane-lightning/pull/830)
+  
 * Update `generate_samples` in `LightningKokkos` and `LightningGPU` to support `qml.measurements.Shots` type instances.
   [(#839)](https://github.com/PennyLaneAI/pennylane-lightning/pull/839)
 
diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
index b4c1ef4399..d6ffb292ae 100644
--- a/pennylane_lightning/core/_version.py
+++ b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.38.0-dev39"
+__version__ = "0.38.0-dev40"
diff --git a/pennylane_lightning/core/src/bindings/Bindings.hpp b/pennylane_lightning/core/src/bindings/Bindings.hpp
index f1b7acf167..08907f5126 100644
--- a/pennylane_lightning/core/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/core/src/bindings/Bindings.hpp
@@ -734,6 +734,13 @@ void registerLightningTensorBackendAgnosticMeasurements(PyClass &pyclass) {
                 return M.expval(*ob);
             },
             "Expected value of an observable object.")
+        .def(
+            "probs",
+            [](MeasurementsT &M, const std::vector<std::size_t> &wires) {
+                return py::array_t<typename TensorNetT::PrecisionT>(
+                    py::cast(M.probs(wires)));
+            },
+            "Probabilities of a set of wires.")
         .def(
             "var",
             [](MeasurementsT &M, const std::shared_ptr<ObservableT> &ob) {
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPSTNCuda.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPSTNCuda.hpp
index a39fe015ad..dad38bfdd6 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPSTNCuda.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/MPSTNCuda.hpp
@@ -62,7 +62,6 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
     using BaseType = TNCudaBase<Precision, MPSTNCuda>;
 
     MPSStatus MPSInitialized_ = MPSStatus::MPSInitNotSet;
-    MPSStatus MPSFinalized_ = MPSStatus::MPSFinalizedNotSet;
 
     const std::size_t maxBondDim_;
 
@@ -215,21 +214,18 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
      */
     void append_mps_final_state(double cutoff = 0,
                                 std::string cutoff_mode = "abs") {
-        if (MPSFinalized_ == MPSStatus::MPSFinalizedNotSet) {
-            MPSFinalized_ = MPSStatus::MPSFinalizedSet;
-            PL_CUTENSORNET_IS_SUCCESS(cutensornetStateFinalizeMPS(
-                /* const cutensornetHandle_t */ BaseType::getTNCudaHandle(),
-                /* cutensornetState_t */ BaseType::getQuantumState(),
-                /* cutensornetBoundaryCondition_t */
-                CUTENSORNET_BOUNDARY_CONDITION_OPEN,
-                /* const int64_t *const extentsOut[] */
-                getSitesExtentsPtr().data(),
-                /*strides=*/nullptr));
-        }
+        PL_CUTENSORNET_IS_SUCCESS(cutensornetStateFinalizeMPS(
+            /* const cutensornetHandle_t */ BaseType::getTNCudaHandle(),
+            /* cutensornetState_t */ BaseType::getQuantumState(),
+            /* cutensornetBoundaryCondition_t */
+            CUTENSORNET_BOUNDARY_CONDITION_OPEN,
+            /* const int64_t *const extentsOut[] */
+            getSitesExtentsPtr().data(),
+            /*strides=*/nullptr));
 
         // Optional: SVD
         cutensornetTensorSVDAlgo_t algo =
-            CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; // default
+            CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ; // default option
 
         PL_CUTENSORNET_IS_SUCCESS(cutensornetStateConfigure(
             /* const cutensornetHandle_t */ BaseType::getTNCudaHandle(),
@@ -257,6 +253,21 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
         BaseType::computeState(
             const_cast<int64_t **>(getSitesExtentsPtr().data()),
             reinterpret_cast<void **>(getTensorsOutDataPtr().data()));
+
+        // TODO: This is a dummy tensor update to allow multiple calls to the
+        // `append_mps_final_state` method as well as appending additional
+        // operations to the graph. This is a temporary solution and this line
+        // can be removed in the future when the `cutensornet` backend allows
+        // multiple calls to the `cutensornetStateFinalizeMPS` method. For more
+        // details, please see the `cutensornet` high-level API workflow logic
+        // [here]
+        // (https://docs.nvidia.com/cuda/cuquantum/latest/cutensornet/api/functions.html#high-level-tensor-network-api).
+        // To proceed with further gate operations or measurements after
+        // calling the `cutensornetStateCompute()` API, we have to call the
+        // `cutensornetStateUpdateTensorOperator()` API, which is wrapped
+        // inside the `dummy_tensor_update()` method.
+        //
+        BaseType::dummy_tensor_update();
     }
 
     /**
@@ -276,7 +287,7 @@ class MPSTNCuda final : public TNCudaBase<Precision, MPSTNCuda<Precision>> {
 
         PL_ABORT_IF(log2(avail_gpu_memory) < BaseType::getNumQubits(),
                     "State tensor size exceeds the available GPU memory!");
-        this->get_state_tensor(res);
+        BaseType::get_state_tensor(res);
     }
 
     /**
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp
index 83f9aa2774..35f296930c 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/TNCudaBase.hpp
@@ -60,6 +60,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     using ComplexT = std::complex<PrecisionT>;
     using BaseType = TensornetBase<PrecisionT, Derived>;
     SharedTNCudaHandle handle_;
+    SharedCublasCaller cublascaller_;
     cudaDataType_t typeData_;
     DevTag<int> dev_tag_;
     cutensornetComputeType_t typeCompute_;
@@ -78,6 +79,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     explicit TNCudaBase(const std::size_t numQubits, int device_id = 0,
                         cudaStream_t stream_id = 0)
         : BaseType(numQubits), handle_(make_shared_tncuda_handle()),
+          cublascaller_(make_shared_cublas_caller()),
           dev_tag_({device_id, stream_id}),
           gate_cache_(std::make_shared<TNCudaGateCache<PrecisionT>>(dev_tag_)) {
         // TODO this code block could be moved to base class and need to revisit
@@ -108,7 +110,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     // runtime in the C++ layer
     explicit TNCudaBase(const std::size_t numQubits, DevTag<int> dev_tag)
         : BaseType(numQubits), handle_(make_shared_tncuda_handle()),
-          dev_tag_(dev_tag),
+          cublascaller_(make_shared_cublas_caller()), dev_tag_(dev_tag),
           gate_cache_(std::make_shared<TNCudaGateCache<PrecisionT>>(dev_tag_)) {
         // TODO this code block could be moved to base class and need to revisit
         // when working on copy ctor
@@ -155,6 +157,15 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
         return handle_.get();
     }
 
+    /**
+     * @brief Access the CublasCaller the object is using.
+     *
+     * @return a reference to the object's CublasCaller object.
+     */
+    auto getCublasCaller() const -> const CublasCaller & {
+        return *cublascaller_;
+    }
+
     /**
      * @brief Get the quantum state pointer.
      *
@@ -299,7 +310,6 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
             /* int32_t unitary*/ 1));
     }
 
-  protected:
     /**
      * @brief Get the state vector representation of a tensor network.
      *
@@ -323,7 +333,7 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     }
 
     /**
-     * @brief Get a slice of the state tensor
+     * @brief Get a slice of the full state tensor
      *
      * @param tensor_data Pointer to the device memory for state tensor data.
      * @param tensor_data_size Size of the state tensor data.
@@ -334,17 +344,61 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
     void get_state_tensor(CFP_t *tensor_data,
                           const std::size_t tensor_data_size,
                           const std::vector<std::size_t> &wires,
-                          const int32_t numHyperSamples = 1) {
-        // NOTE: this is a solution to get the full state tensor
-        // TODO: projected_modes and projectedModeValues are to be updated for
-        // prob() support.
+                          const int32_t numHyperSamples = 1) const {
         auto stateModes = cuUtil::NormalizeCastIndices<std::size_t, int32_t>(
             wires, BaseType::getNumQubits());
 
         std::vector<int32_t> projected_modes{};
 
-        std::vector<int64_t> projectedModeValues{};
+        for (int32_t idx = 0;
+             idx < static_cast<int32_t>(BaseType::getNumQubits()); idx++) {
+            auto it = std::find(stateModes.begin(), stateModes.end(), idx);
+            if (it == stateModes.end()) {
+                projected_modes.emplace_back(idx);
+            }
+        }
+
+        std::vector<int64_t> projectedModeValues(projected_modes.size(), 0);
 
+        if (projected_modes.empty()) {
+            get_accessor_(tensor_data, tensor_data_size, projected_modes,
+                          projectedModeValues, numHyperSamples);
+        } else {
+            DataBuffer<CFP_t, int> tmp(tensor_data_size, getDevTag(), true);
+
+            const std::size_t projected_modes_size = size_t(1)
+                                                     << projected_modes.size();
+            for (std::size_t idx = 0; idx < projected_modes_size; idx++) {
+                for (std::size_t j = 0; j < projected_modes.size(); j++) {
+                    projectedModeValues[j] = (idx >> j) & 1;
+                }
+
+                get_accessor_(tmp.getData(), tensor_data_size, projected_modes,
+                              projectedModeValues, numHyperSamples);
+                // Accumulate this slice into the output tensor
+                scaleAndAddC_CUDA(std::complex<PrecisionT>{1.0, 0.0},
+                                  tmp.getData(), tensor_data, tmp.getLength(),
+                                  getDevTag().getDeviceID(),
+                                  getDevTag().getStreamID(), getCublasCaller());
+            }
+        }
+    }
+
+  private:
+    /**
+     * @brief Get accessor of a state tensor
+     *
+     * @param tensor_data Pointer to the device memory for state tensor data.
+     * @param tensor_data_size Size of the tensor data.
+     * @param projected_modes Projected modes to get the state tensor for.
+     * @param projectedModeValues Values of the projected modes.
+     * @param numHyperSamples Number of hyper samples to use in the calculation
+     * and is set to 1 by default.
+     */
+    void get_accessor_(CFP_t *tensor_data, const std::size_t tensor_data_size,
+                       const std::vector<int32_t> &projected_modes,
+                       const std::vector<int64_t> &projectedModeValues,
+                       const int32_t numHyperSamples = 1) const {
         cutensornetStateAccessor_t accessor;
         PL_CUTENSORNET_IS_SUCCESS(cutensornetCreateAccessor(
             /* const cutensornetHandle_t */ getTNCudaHandle(),
@@ -415,17 +469,40 @@ class TNCudaBase : public TensornetBase<PrecisionT, Derived> {
 
         CFP_t scale_scalar_cu{scale_scalar.real(), scale_scalar.imag()};
 
-        SharedCublasCaller cublascaller = make_shared_cublas_caller();
-
         scaleC_CUDA<CFP_t, CFP_t>(scale_scalar_cu, tensor_data,
                                   tensor_data_size, getDevTag().getDeviceID(),
-                                  getDevTag().getStreamID(), *cublascaller);
+                                  getDevTag().getStreamID(), getCublasCaller());
 
         PL_CUTENSORNET_IS_SUCCESS(
             cutensornetDestroyWorkspaceDescriptor(workDesc));
         PL_CUTENSORNET_IS_SUCCESS(cutensornetDestroyAccessor(accessor));
     }
 
+  protected:
+    /**
+     * @brief Dummy tensor operator update to allow multiple calls of
+     * `append_mps_final_state`. This is a workaround for the cutensornet
+     * library not allowing multiple calls of `cutensornetStateFinalizeMPS`.
+     *
+     * This function appends a new `Identity` gate to the graph when the gate
+     * cache is empty, then updates the cache-head gate operator with itself.
+     */
+    void dummy_tensor_update() {
+        if (gate_cache_->is_empty()) {
+            applyOperation("Identity", {0}, false);
+        }
+
+        const std::size_t id = gate_cache_->get_cache_head_idx();
+
+        PL_CUTENSORNET_IS_SUCCESS(cutensornetStateUpdateTensorOperator(
+            /* const cutensornetHandle_t */ getTNCudaHandle(),
+            /* cutensornetState_t */ getQuantumState(),
+            /* int64_t tensorId*/ static_cast<int64_t>(id),
+            /* void* */
+            static_cast<void *>(gate_cache_->get_gate_device_ptr(id)),
+            /* int32_t unitary*/ 1));
+    }
+
     /**
      * @brief Save quantumState information to data provided by a user
      *
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/TNCudaGateCache.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/TNCudaGateCache.hpp
index ba15c458cf..377ccc66ce 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/TNCudaGateCache.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/TNCudaGateCache.hpp
@@ -157,6 +157,22 @@ template <class PrecisionT> class TNCudaGateCache {
         return device_gates_.at(gate_id).second.getDataBuffer().getData();
     }
 
+    /**
+     * @brief Returns the key (index of the gate) of the first element in
+     * `device_gates_`. The cache must not be empty (see `is_empty()`).
+     *
+     * @return size_t Key of the first element in `device_gates_`.
+     */
+    auto get_cache_head_idx() const -> std::size_t {
+        auto it = device_gates_.begin();
+        return it->first;
+    }
+
+    /**
+     * @brief Returns if the `device_gates_` is empty.
+     */
+    auto is_empty() const -> bool { return device_gates_.empty(); }
+
   private:
     const DevTag<int> device_tag_;
     std::size_t total_alloc_bytes_;
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
index 5044e5b8db..8718cb1934 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/gates/tests/Test_MPSTNCuda_NonParam.cpp
@@ -74,7 +74,17 @@ TEMPLATE_TEST_CASE("MPSTNCuda::Gates::Hadamard", "[MPSTNCuda_Nonparam]", float,
             const std::size_t index = GENERATE(0, 1, 2);
             MPSTNCuda<TestType> mps_state{num_qubits, maxExtent, dev_tag};
 
+            mps_state.append_mps_final_state();
+
             mps_state.applyOperation("Hadamard", {index}, inverse);
+
+            mps_state.append_mps_final_state();
+
+            mps_state.applyOperation("Identity", {index}, inverse);
+
+            // Test that the final state can be appended multiple times
+            mps_state.append_mps_final_state();
+
             cp_t expected(1.0 / std::sqrt(2), 0);
 
             auto results = mps_state.getDataVector();
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/CMakeLists.txt
index 12ea988efd..f5c3a80860 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/CMakeLists.txt
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/CMakeLists.txt
@@ -1,12 +1,19 @@
 cmake_minimum_required(VERSION 3.20)
 
-project(${PL_BACKEND}_measurements LANGUAGES CXX)
+project(${PL_BACKEND}_measurements LANGUAGES CXX C CUDA)
 
-add_library(${PL_BACKEND}_measurements INTERFACE)
+if(NOT DEFINED CMAKE_CUDA20_STANDARD_COMPILE_OPTION)
+  set(CMAKE_CUDA20_STANDARD_COMPILE_OPTION "")
+  set(CMAKE_CUDA20_EXTENSION_COMPILE_OPTION "")
+endif()
+
+set(LTENSOR_MPS_FILES cuda_kernels_measures.cu CACHE INTERNAL "" FORCE)
+
+add_library(${PL_BACKEND}_measurements STATIC ${LTENSOR_MPS_FILES})
 
 target_include_directories(${PL_BACKEND}_measurements INTERFACE ${CMAKE_CURRENT_SOURCE_DIR})
 
-target_link_libraries(${PL_BACKEND}_measurements INTERFACE  lightning_compile_options
+target_link_libraries(${PL_BACKEND}_measurements PUBLIC  lightning_compile_options
                                                             lightning_external_libs
                                                             ${PL_BACKEND}
                                                             ${PL_BACKEND}_utils
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp
index fed9f489b0..e7234367bc 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/MeasurementsTNCuda.hpp
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 /**
- * @file
+ * @file MeasurementsTNCuda.hpp
  * Defines a class for the measurement of observables in quantum states
  * represented by a Lightning Tensor class.
  */
@@ -21,9 +21,11 @@
 #pragma once
 
 #include <complex>
+#include <cuComplex.h>
 #include <cutensornet.h>
 #include <vector>
 
+#include "LinearAlg.hpp"
 #include "MPSTNCuda.hpp"
 #include "ObservablesTNCuda.hpp"
 #include "ObservablesTNCudaOperator.hpp"
@@ -39,6 +41,21 @@ using namespace Pennylane::LightningTensor::TNCuda::Util;
 /// @endcond
 
 namespace Pennylane::LightningTensor::TNCuda::Measures {
+extern void getProbs_CUDA(cuComplex *state, float *probs, const int data_size,
+                          const std::size_t thread_per_block,
+                          cudaStream_t stream_id);
+extern void getProbs_CUDA(cuDoubleComplex *state, double *probs,
+                          const int data_size,
+                          const std::size_t thread_per_block,
+                          cudaStream_t stream_id);
+extern void normalizeProbs_CUDA(float *probs, const int data_size,
+                                const float sum,
+                                const std::size_t thread_per_block,
+                                cudaStream_t stream_id);
+extern void normalizeProbs_CUDA(double *probs, const int data_size,
+                                const double sum,
+                                const std::size_t thread_per_block,
+                                cudaStream_t stream_id);
 /**
  * @brief ObservablesTNCuda's Measurement Class.
  *
@@ -51,6 +68,7 @@ template <class TensorNetT> class MeasurementsTNCuda {
   private:
     using PrecisionT = typename TensorNetT::PrecisionT;
     using ComplexT = typename TensorNetT::ComplexT;
+    using CFP_t = typename TensorNetT::CFP_t;
 
     const TensorNetT &tensor_network_;
 
@@ -58,6 +76,94 @@ template <class TensorNetT> class MeasurementsTNCuda {
     explicit MeasurementsTNCuda(const TensorNetT &tensor_network)
         : tensor_network_(tensor_network){};
 
+    /**
+     * @brief Probabilities for a subset of the full system.
+     *
+     * @tparam thread_per_block Number of threads per block in the CUDA kernel;
+     * defaults to `256`, chosen as a balance of warp size and occupancy. Note
+     * that this number is not optimal for all cases and may need to be
+     * adjusted for the specific use case, especially when the number of
+     * elements in the subset is small.
+     *
+     * @param wires Wires will restrict probabilities to a subset
+     * of the full system.
+     * @param numHyperSamples Number of hyper samples to be used in the
+     * calculation; defaults to 1.
+     *
+     * @return Floating point std::vector with probabilities.
+     */
+    template <std::size_t thread_per_block = 256>
+    auto probs(const std::vector<std::size_t> &wires,
+               const int32_t numHyperSamples = 1) -> std::vector<PrecisionT> {
+        PL_ABORT_IF_NOT(std::is_sorted(wires.begin(), wires.end()),
+                        "Invalid wire indices order. Please ensure that the "
+                        "wire indices are in the ascending order.");
+
+        const std::size_t length = std::size_t{1} << wires.size();
+
+        std::vector<PrecisionT> h_res(length, 0.0);
+
+        DataBuffer<CFP_t, int> d_output_tensor(
+            length, tensor_network_.getDevTag(), true);
+
+        d_output_tensor.zeroInit();
+
+        tensor_network_.get_state_tensor(d_output_tensor.getData(),
+                                         d_output_tensor.getLength(), wires,
+                                         numHyperSamples);
+
+        // `10` here means `1024` elements to be calculated
+        // LCOV_EXCL_START
+        if (wires.size() > 10) {
+            DataBuffer<PrecisionT, int> d_output_probs(
+                length, tensor_network_.getDevTag(), true);
+
+            getProbs_CUDA(d_output_tensor.getData(), d_output_probs.getData(),
+                          length, static_cast<int>(thread_per_block),
+                          tensor_network_.getDevTag().getStreamID());
+
+            PrecisionT sum;
+
+            asum_CUDA_device<PrecisionT>(
+                d_output_probs.getData(), length,
+                tensor_network_.getDevTag().getDeviceID(),
+                tensor_network_.getDevTag().getStreamID(),
+                tensor_network_.getCublasCaller(), &sum);
+
+            PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");
+
+            normalizeProbs_CUDA(d_output_probs.getData(), length, sum,
+                                static_cast<int>(thread_per_block),
+                                tensor_network_.getDevTag().getStreamID());
+
+            d_output_probs.CopyGpuDataToHost(h_res.data(), h_res.size());
+        } else {
+            // LCOV_EXCL_STOP
+            // This branch dispatches the calculation to the CPU for a small
+            // number of wires. The CPU calculation is faster than the GPU
+            // calculation for a small number of wires due to the overhead of
+            // the GPU kernel launch.
+            std::vector<ComplexT> h_state_vector(length);
+            d_output_tensor.CopyGpuDataToHost(h_state_vector.data(),
+                                              h_state_vector.size());
+            // TODO: OMP support
+            for (std::size_t i = 0; i < length; i++) {
+                h_res[i] = std::norm(h_state_vector[i]);
+            }
+
+            // TODO: OMP support
+            PrecisionT sum = std::accumulate(h_res.begin(), h_res.end(), 0.0);
+
+            PL_ABORT_IF(sum == 0.0, "Sum of probabilities is zero.");
+            // TODO: OMP support
+            for (std::size_t i = 0; i < length; i++) {
+                h_res[i] /= sum;
+            }
+        }
+
+        return h_res;
+    }
+
     /**
      * @brief Calculate var value for a general ObservableTNCuda Observable.
      *
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/cuda_kernels_measures.cu b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/cuda_kernels_measures.cu
new file mode 100644
index 0000000000..520ca9fd2d
--- /dev/null
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/cuda_kernels_measures.cu
@@ -0,0 +1,161 @@
+// Copyright 2024 Xanadu Quantum Technologies Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//     http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/**
+ * @file cuda_kernels_measures.cu
+ */
+#include <cuComplex.h>
+
+#include "cuError.hpp"
+#include "cuda_helpers.hpp"
+
+namespace Pennylane::LightningTensor::TNCuda::Measures {
+
+/**
+ * @brief The CUDA kernel that calculates the probability from a given state
+ * tensor data on GPU device.
+ *
+ * @tparam GPUDataT cuComplex data type (cuComplex or cuDoubleComplex).
+ * @tparam PrecisionT Floating data type.
+ *
+ * @param state Complex data pointer of state tensor on device.
+ * @param probs The probability result on device.
+ * @param data_size The length of state tensor on device.
+ */
+template <class GPUDataT, class PrecisionT>
+__global__ void getProbsKernel(GPUDataT *state, PrecisionT *probs,
+                               const int data_size) {
+    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < data_size) {
+        PrecisionT real = state[i].x;
+        PrecisionT imag = state[i].y;
+        probs[i] = real * real + imag * imag;
+    }
+}
+
+/**
+ * @brief The CUDA kernel that normalizes the given probabilities on GPU
+ * device by dividing each entry by `sum`.
+ *
+ * @tparam PrecisionT Floating data type.
+ *
+ * @param probs The probability to be normalized.
+ * @param data_size The length of state tensor on device.
+ * @param sum The sum of all probabilities.
+ */
+template <class PrecisionT>
+__global__ void normalizeProbsKernel(PrecisionT *probs, const int data_size,
+                                     const PrecisionT sum) {
+    const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (i < data_size) {
+        probs[i] /= sum;
+    }
+}
+
+/**
+ * @brief The CUDA kernel call wrapper for `getProbsKernel`.
+ *
+ * @tparam GPUDataT cuComplex data type (cuComplex or cuDoubleComplex).
+ * @tparam PrecisionT Floating data type.
+ *
+ * @param state Complex data pointer of state tensor on device.
+ * @param probs The probability result on device.
+ * @param data_size The length of state tensor on device.
+ * @param thread_per_block Number of threads set per block.
+ * @param stream_id Stream id of CUDA calls
+ */
+template <class GPUDataT, class PrecisionT>
+void getProbs_CUDA_call(GPUDataT *state, PrecisionT *probs, const int data_size,
+                        std::size_t thread_per_block, cudaStream_t stream_id) {
+    auto dv = std::div(data_size, thread_per_block);
+    const std::size_t num_blocks = dv.quot + (dv.rem == 0 ? 0 : 1);
+    const std::size_t block_per_grid = (num_blocks == 0 ? 1 : num_blocks);
+    dim3 blockSize(thread_per_block, 1, 1);
+    dim3 gridSize(block_per_grid, 1);
+
+    getProbsKernel<GPUDataT, PrecisionT>
+        <<<gridSize, blockSize, 0, stream_id>>>(state, probs, data_size);
+    PL_CUDA_IS_SUCCESS(cudaGetLastError());
+}
+
+/**
+ * @brief The CUDA kernel call wrapper for `normalizeProbsKernel`.
+ *
+ * @tparam PrecisionT Floating data type.
+ *
+ * @param probs The probability to be normalized.
+ * @param data_size The length of state tensor on device.
+ * @param thread_per_block Number of threads set per block.
+ * @param stream_id Stream id of CUDA calls
+ */
+template <class PrecisionT>
+void normalizeProbs_CUDA_call(PrecisionT *probs, const int data_size,
+                              const PrecisionT sum,
+                              std::size_t thread_per_block,
+                              cudaStream_t stream_id) {
+    auto dv = std::div(data_size, thread_per_block);
+    const std::size_t num_blocks = dv.quot + (dv.rem == 0 ? 0 : 1);
+    const std::size_t block_per_grid = (num_blocks == 0 ? 1 : num_blocks);
+    dim3 blockSize(thread_per_block, 1, 1);
+    dim3 gridSize(block_per_grid, 1);
+
+    normalizeProbsKernel<PrecisionT>
+        <<<gridSize, blockSize, 0, stream_id>>>(probs, data_size, sum);
+    PL_CUDA_IS_SUCCESS(cudaGetLastError());
+}
+
+// Definitions
+/**
+ * @brief Explicitly get the probability of given state tensor data on GPU
+ * device.
+ *
+ * @param state Complex data pointer of state tensor on device.
+ * @param probs The probability result on device.
+ * @param data_size The length of state tensor on device.
+ * @param thread_per_block Number of threads set per block.
+ * @param stream_id Stream id of CUDA calls
+ */
+void getProbs_CUDA(cuComplex *state, float *probs, const int data_size,
+                   const std::size_t thread_per_block, cudaStream_t stream_id) {
+    getProbs_CUDA_call<cuComplex, float>(state, probs, data_size,
+                                         thread_per_block, stream_id);
+}
+
+void getProbs_CUDA(cuDoubleComplex *state, double *probs, const int data_size,
+                   const std::size_t thread_per_block, cudaStream_t stream_id) {
+    getProbs_CUDA_call<cuDoubleComplex, double>(state, probs, data_size,
+                                                thread_per_block, stream_id);
+}
+
+/**
+ * @brief Explicitly normalize the given probabilities on GPU device by
+ * dividing each entry by `sum`.
+ *
+ * @param probs The probability to be normalized.
+ * @param data_size The length of state tensor on device.
+ * @param thread_per_block Number of threads set per block.
+ * @param stream_id Stream id of CUDA calls
+ */
+void normalizeProbs_CUDA(float *probs, const int data_size, const float sum,
+                         const std::size_t thread_per_block,
+                         cudaStream_t stream_id) {
+    normalizeProbs_CUDA_call<float>(probs, data_size, sum, thread_per_block,
+                                    stream_id);
+}
+
+void normalizeProbs_CUDA(double *probs, const int data_size, const double sum,
+                         const std::size_t thread_per_block,
+                         cudaStream_t stream_id) {
+    normalizeProbs_CUDA_call<double>(probs, data_size, sum, thread_per_block,
+                                     stream_id);
+}
+} // namespace Pennylane::LightningTensor::TNCuda::Measures
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/CMakeLists.txt b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/CMakeLists.txt
index 9ed838c230..e18e336449 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/CMakeLists.txt
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/CMakeLists.txt
@@ -31,7 +31,8 @@ target_sources(${PL_BACKEND}_measurements_tests INTERFACE runner_${PL_BACKEND}_m
 # Define targets
 ################################################################################
 set(TEST_SOURCES    Test_MPSTNCuda_Expval.cpp
-                    Test_MPSTNCuda_Var.cpp)
+                    Test_MPSTNCuda_Var.cpp
+                    Test_MPSTNCuda_Measure.cpp)
 
 add_executable(${PL_BACKEND}_measurements_test_runner ${TEST_SOURCES})
 target_link_libraries(${PL_BACKEND}_measurements_test_runner PRIVATE ${PL_BACKEND}_measurements_tests)
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp
new file mode 100644
index 0000000000..5e89426d49
--- /dev/null
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/measurements/tests/Test_MPSTNCuda_Measure.cpp
@@ -0,0 +1,94 @@
+// Copyright 2024 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the License);
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <array>
+#include <complex>
+#include <limits>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+#include <catch2/catch.hpp>
+
+#include "MPSTNCuda.hpp"
+#include "MeasurementsTNCuda.hpp"
+#include "TNCudaGateCache.hpp"
+#include "cuda_helpers.hpp"
+
+/// @cond DEV
+namespace {
+using namespace Pennylane::LightningTensor::TNCuda::Measures;
+using namespace Pennylane::LightningTensor::TNCuda::Observables;
+using namespace Pennylane::LightningTensor::TNCuda;
+} // namespace
+/// @endcond
+
+TEMPLATE_TEST_CASE("Probabilities", "[Measures]", float, double) {
+    using TensorNetT = MPSTNCuda<TestType>;
+
+    SECTION("Looping over different wire configurations:") {
+        // Probabilities calculated with Pennylane default.qubit:
+        std::vector<std::pair<std::vector<std::size_t>, std::vector<TestType>>>
+            input = {
+                {{0, 1, 2},
+                 {0.65473791, 0.08501576, 0.02690407, 0.00349341, 0.19540418,
+                  0.02537265, 0.00802942, 0.0010426}},
+                {{0, 1}, {0.73975367, 0.03039748, 0.22077683, 0.00907202}},
+                {{0, 2}, {0.68164198, 0.08850918, 0.2034336, 0.02641525}},
+                {{1, 2}, {0.85014208, 0.11038841, 0.03493349, 0.00453601}},
+                {{0}, {0.77015115, 0.22984885}},
+                {{1}, {0.9605305, 0.0394695}},
+                {{2}, {0.88507558, 0.11492442}}}; // data from default.qubit
+
+        // Defining the MPS tensor network that will be measured; the
+        // reference values are checked for each generated bond dimension.
+        std::size_t num_qubits = 3;
+        std::size_t maxBondDim = GENERATE(2, 3, 4, 5);
+
+        TensorNetT mps_state{num_qubits, maxBondDim};
+
+        mps_state.applyOperations(
+            {{"RX"}, {"RX"}, {"RY"}, {"RY"}, {"RX"}, {"RY"}},
+            {{0}, {0}, {1}, {1}, {2}, {2}},
+            {{false}, {false}, {false}, {false}, {false}, {false}},
+            {{0.5}, {0.5}, {0.2}, {0.2}, {0.5}, {0.5}});
+        mps_state.append_mps_final_state();
+
+        auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
+
+        for (const auto &term : input) {
+            auto probabilities = measure.probs(term.first);
+            REQUIRE_THAT(term.second,
+                         Catch::Approx(probabilities).margin(1e-6));
+        }
+    }
+
+    SECTION("Test probs() failure for out-of-order wires") {
+        // Defining the MPS tensor network that will be measured.
+        // Wires {2, 1} below are listed in descending order.
+        std::size_t num_qubits = 3;
+        std::size_t maxBondDim = GENERATE(2, 3, 4, 5);
+
+        TensorNetT mps_state{num_qubits, maxBondDim};
+
+        mps_state.applyOperations({{"RX"}, {"RY"}}, {{0}, {0}},
+                                  {{false}, {false}}, {{0.5}, {0.5}});
+        mps_state.append_mps_final_state();
+
+        auto measure = MeasurementsTNCuda<TensorNetT>(mps_state);
+        REQUIRE_THROWS_AS(measure.probs({2, 1}), LightningException);
+    }
+}
diff --git a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/observables/ObservablesTNCudaOperator.hpp b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/observables/ObservablesTNCudaOperator.hpp
index 62aabc360c..45018d41e3 100644
--- a/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/observables/ObservablesTNCudaOperator.hpp
+++ b/pennylane_lightning/core/src/simulators/lightning_tensor/tncuda/observables/ObservablesTNCudaOperator.hpp
@@ -12,6 +12,11 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+/**
+ * @file ObservablesTNCudaOperator.hpp
+ * Class for appending an ObservableTNCuda object to a tensor network object.
+ */
+
 #pragma once
 
 #include <cutensornet.h>
@@ -380,8 +385,6 @@ template <class TensorNetT> class ObservableTNCudaOperator {
      */
     void initHelper_var_(const TensorNetT &tensor_network,
                          ObservableTNCuda<TensorNetT> &obs) {
-        SharedCublasCaller cublascaller = make_shared_cublas_caller();
-
         // convert obs modes to cutensornet compatible format/order
         vector3D<int32_t> modes;
         for (std::size_t term_idx = 0; term_idx < numObsTerms_; term_idx++) {
@@ -430,8 +433,9 @@ template <class TensorNetT> class ObservableTNCudaOperator {
                     if (metaDataArr.size() == 1) {
                         obsKey = std::move(add_meta_data_(metaDataArr[0]));
                     } else if (metaDataArr.size() == 2) {
-                        obsKey = std::move(add_meta_data_(
-                            metaDataArr[0], metaDataArr[1], *cublascaller));
+                        obsKey = std::move(
+                            add_meta_data_(metaDataArr[0], metaDataArr[1],
+                                           tensor_network.getCublasCaller()));
                     } else {
                         PL_ABORT("Only one wire observables are supported "
                                  "for cutensornet v24.03");
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
index a06765849a..d1441b6aa8 100644
--- a/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
+++ b/pennylane_lightning/core/src/utils/cuda_utils/LinearAlg.hpp
@@ -109,6 +109,31 @@ inline void GEMM_CUDA_device(T *A, T *B, T *C, const int m, const int k,
                     n, k, &alpha, A, m, B, n, &beta, C, m);
     }
 }
+
+/**
+ * @brief cuBLAS backed sum of the absolute values of a GPU vector's elements.
+ *
+ * @tparam T  Float data-type. Accepts float and double
+ * @tparam DevTypeID Integer type of device id.
+ *
+ * @param A Device data pointer of vector A.
+ * @param n Length of the vector.
+ * @param dev_id the device on which the function should be executed.
+ * @param stream_id the CUDA stream on which the operation should be executed.
+ * @param cublas the CublasCaller object that manages the cuBLAS handle.
+ * @param res Pointer to store the result (a host variable in the unit test).
+ */
+template <class T = double, class DevTypeID = int>
+inline void asum_CUDA_device(const T *A, const int n, DevTypeID dev_id,
+                             cudaStream_t stream_id, const CublasCaller &cublas,
+                             T *res) {
+    if constexpr (std::is_same_v<T, float>) {
+        cublas.call(cublasSasum, dev_id, stream_id, n, A, 1, res);
+    } else if constexpr (std::is_same_v<T, double>) {
+        cublas.call(cublasDasum, dev_id, stream_id, n, A, 1, res);
+    } // any other T: no cuBLAS call is made
+}
+
 /**
  * @brief cuBLAS backed inner product for GPU data.
  *
diff --git a/pennylane_lightning/core/src/utils/cuda_utils/tests/Test_LinearAlgebra.cpp b/pennylane_lightning/core/src/utils/cuda_utils/tests/Test_LinearAlgebra.cpp
index b5b837f1e2..a4201f22d5 100644
--- a/pennylane_lightning/core/src/utils/cuda_utils/tests/Test_LinearAlgebra.cpp
+++ b/pennylane_lightning/core/src/utils/cuda_utils/tests/Test_LinearAlgebra.cpp
@@ -26,8 +26,7 @@
 
 /**
  * @file
- *  Tests linear algebra functionality defined for the class
- * StateVectorCudaManaged.
+ *  Tests CUDA library based linear algebra functionality.
  */
 
 /// @cond DEV
@@ -97,3 +96,23 @@ TEMPLATE_TEST_CASE("Linear Algebra::SparseMV", "[Linear Algebra]", float,
         }
     }
 }
+
+TEMPLATE_TEST_CASE("Linear Algebra::asum_CUDA_device", "[Linear Algebra]",
+                   float, double) {
+    // Mixed signs: asum sums |x_i|, so the plain sum (-5) would not give 55.
+    std::vector<TestType> vec{1.0,  -2.0, 3.0,  -4.0, 5.0,
+                              -6.0, 7.0,  -8.0, 9.0,  -10.0};
+
+    DataBuffer<TestType> vec_d(vec.size());
+    vec_d.CopyHostDataToGpu(vec.data(), vec.size());
+
+    auto cublasCaller = make_shared_cublas_caller();
+
+    SECTION("Testing asum_CUDA_device") {
+        TestType result{0};
+        asum_CUDA_device(vec_d.getData(), vec_d.getLength(), vec_d.getDevice(),
+                         vec_d.getStream(), *cublasCaller, &result);
+
+        CHECK(result == Approx(55.0));
+    }
+}
diff --git a/pennylane_lightning/lightning_tensor/_measurements.py b/pennylane_lightning/lightning_tensor/_measurements.py
index 5d563866ed..93ab53e506 100644
--- a/pennylane_lightning/lightning_tensor/_measurements.py
+++ b/pennylane_lightning/lightning_tensor/_measurements.py
@@ -25,7 +25,13 @@
 
 import numpy as np
 import pennylane as qml
-from pennylane.measurements import ExpectationMP, MeasurementProcess, StateMeasurement, VarianceMP
+from pennylane.measurements import (
+    ExpectationMP,
+    MeasurementProcess,
+    ProbabilityMP,
+    StateMeasurement,
+    VarianceMP,
+)
 from pennylane.tape import QuantumScript
 from pennylane.typing import Result, TensorLike
 from pennylane.wires import Wires
@@ -74,10 +80,12 @@ def state_diagonalizing_gates(self, measurementprocess: StateMeasurement) -> Ten
         """
         diagonalizing_gates = measurementprocess.diagonalizing_gates()
         self._tensornet.apply_operations(diagonalizing_gates)
+        self._tensornet.appendMPSFinalState()
         state_array = self._tensornet.state
         wires = Wires(range(self._tensornet.num_wires))
         result = measurementprocess.process_state(state_array, wires)
         self._tensornet.apply_operations([qml.adjoint(g) for g in reversed(diagonalizing_gates)])
+        self._tensornet.appendMPSFinalState()
         return result
 
     # pylint: disable=protected-access
@@ -102,6 +110,27 @@ def expval(self, measurementprocess: MeasurementProcess):
         )._ob(measurementprocess.obs)
         return self._measurement_lightning.expval(ob_serialized)
 
+    def probs(self, measurementprocess: MeasurementProcess):
+        """Probabilities of the supplied observable or wires contained in the MeasurementProcess.
+
+        Args:
+            measurementprocess (StateMeasurement): measurement to apply to the state
+
+        Returns:
+            Probabilities of the supplied observable or wires
+        """
+        diagonalizing_gates = measurementprocess.diagonalizing_gates()
+        if diagonalizing_gates:
+            self._tensornet.apply_operations(diagonalizing_gates)
+            self._tensornet.appendMPSFinalState()
+        results = self._measurement_lightning.probs(measurementprocess.wires.tolist())
+        if diagonalizing_gates:
+            self._tensornet.apply_operations(
+                [qml.adjoint(g, lazy=False) for g in reversed(diagonalizing_gates)]
+            )
+            self._tensornet.appendMPSFinalState()
+        return results
+
     def var(self, measurementprocess: MeasurementProcess):
         """Variance of the supplied observable contained in the MeasurementProcess. Note that the variance is
         calculated as <obs**2> - <obs>**2. The current implementation only supports single-wire observables.
@@ -145,7 +174,10 @@ def get_measurement_function(
             if isinstance(measurementprocess, VarianceMP):
                 return self.var
 
-            if measurementprocess.obs is None:
+            if isinstance(measurementprocess, ProbabilityMP):
+                return self.probs
+
+            if measurementprocess.obs is None or measurementprocess.obs.has_diagonalizing_gates:
                 return self.state_diagonalizing_gates
 
         raise NotImplementedError("Unsupported measurement type.")
diff --git a/pennylane_lightning/lightning_tensor/_tensornet.py b/pennylane_lightning/lightning_tensor/_tensornet.py
index 1b3ddb8034..fd107a5bba 100644
--- a/pennylane_lightning/lightning_tensor/_tensornet.py
+++ b/pennylane_lightning/lightning_tensor/_tensornet.py
@@ -194,10 +194,14 @@ def set_tensor_network(self, circuit: QuantumScript):
 
         Args:
             circuit (QuantumScript): The single circuit to simulate
-
-        Returns:
-            LightningTensorNet: Lightning final state class.
         """
         self.apply_operations(circuit.operations)
+        self.appendMPSFinalState()
+
+    def appendMPSFinalState(self):
+        """
+        Append the final state to the tensor network for the MPS backend. This function
+        should be called once apply_operations has been called.
+        """
         if self._method == "mps":
             self._tensornet.appendMPSFinalState(self._cutoff, self._cutoff_mode)
diff --git a/tests/new_api/test_device.py b/tests/new_api/test_device.py
index 91573ebe22..31f66a31be 100644
--- a/tests/new_api/test_device.py
+++ b/tests/new_api/test_device.py
@@ -421,8 +421,6 @@ def test_execute_single_measurement(self, theta, phi, mp, dev):
         if device_name == "lightning.tensor":
             if isinstance(mp.obs, qml.SparseHamiltonian) or isinstance(mp.obs, qml.Projector):
                 pytest.skip("SparseHamiltonian/Projector obs not supported in lightning.tensor")
-            if isinstance(mp, ProbabilityMP):
-                pytest.skip("qml.probs() not supported in lightning.tensor")
 
         if isinstance(mp.obs, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             mp.obs = qml.operation.convert_to_legacy_H(mp.obs)
@@ -466,10 +464,6 @@ def test_execute_single_measurement(self, theta, phi, mp, dev):
     )
     def test_execute_multi_measurement(self, theta, phi, dev, mp1, mp2):
         """Test that execute returns the correct results with multiple measurements."""
-        if device_name == "lightning.tensor":
-            if isinstance(mp1, ProbabilityMP) or isinstance(mp2, ProbabilityMP):
-                pytest.skip("qml.probs() not supported in lightning.tensor")
-
         if isinstance(mp2.obs, qml.ops.LinearCombination) and not qml.operation.active_new_opmath():
             mp2.obs = qml.operation.convert_to_legacy_H(mp2.obs)
 
@@ -511,6 +505,10 @@ def test_custom_wires(self, phi, theta, wires):
         assert np.allclose(result[0], np.cos(phi))
         assert np.allclose(result[1], np.cos(phi) * np.cos(theta))
 
+    @pytest.mark.skipif(
+        device_name == "lightning.tensor",
+        reason="lightning.tensor does not support out of order probs",
+    )
     @pytest.mark.parametrize(
         "wires, wire_order", [(3, (0, 1, 2)), (("a", "b", "c"), ("a", "b", "c"))]
     )
diff --git a/tests/test_apply.py b/tests/test_apply.py
index cf329fce72..7cecbd550d 100644
--- a/tests/test_apply.py
+++ b/tests/test_apply.py
@@ -1308,7 +1308,7 @@ def circuit():
 
     @pytest.mark.skipif(
         device_name == "lightning.tensor",
-        reason="lightning.tensor does not support qml.prob()",
+        reason="lightning.tensor does not support _tensornet.state access",
     )
     def test_apply_qpe(self, qubit_device, tol):
         """Test the application of qml.QuantumPhaseEstimation"""
@@ -1347,7 +1347,7 @@ def circuit():
     # https://docs.pennylane.ai/en/stable/code/api/pennylane.BlockEncode.html
     @pytest.mark.skipif(
         device_name == "lightning.tensor",
-        reason="lightning.tensor does not support qml.state()",
+        reason="lightning.tensor does not support qml.BlockEncode",
     )
     @pytest.mark.parametrize(
         "op, op_wires",
diff --git a/tests/test_comparison.py b/tests/test_comparison.py
index 12beb31543..4967b72bd0 100644
--- a/tests/test_comparison.py
+++ b/tests/test_comparison.py
@@ -65,7 +65,7 @@ class TestComparison:
 
     @pytest.mark.skipif(
         device_name == "lightning.tensor",
-        reason="lightning.tensor device dose not support 1 wire tensor network",
+        reason="lightning.tensor device does not support one-qubit circuits",
     )
     @pytest.mark.parametrize("basis_state", itertools.product(*[(0, 1)] * 1))
     @pytest.mark.parametrize("wires", [1])
@@ -104,7 +104,7 @@ def circuit(measurement):
 
     @pytest.mark.skipif(
         device_name == "lightning.tensor",
-        reason="lightning.tensor device dose not support direct access to the state",
+        reason="lightning.tensor device does not support direct access to the state",
     )
     @pytest.mark.parametrize("basis_state", itertools.product(*[(0, 1)] * 2))
     @pytest.mark.parametrize("wires", [2])
@@ -152,7 +152,7 @@ def circuit(measurement):
 
     @pytest.mark.skipif(
         device_name == "lightning.tensor",
-        reason="lightning.tensor device dose not support the direct access to state",
+        reason="lightning.tensor device does not support the direct access to state",
     )
     @pytest.mark.parametrize("basis_state", itertools.product(*[(0, 1)] * 3))
     @pytest.mark.parametrize("wires", [3])
@@ -208,7 +208,7 @@ def circuit(measurement):
 
     @pytest.mark.skipif(
         device_name == "lightning.tensor",
-        reason="lightning.tensor device dose not support the direct access to state",
+        reason="lightning.tensor device does not support the direct access to state",
     )
     @pytest.mark.parametrize("basis_state", itertools.product(*[(0, 1)] * 4))
     @pytest.mark.parametrize("wires", [4])
@@ -269,7 +269,7 @@ def circuit(measurement):
 
     @pytest.mark.skipif(
         device_name == "lightning.tensor",
-        reason="lightning.tensor device dose not support initialization with a state vector",
+        reason="lightning.tensor device does not support initialization with a state vector",
     )
     @pytest.mark.parametrize(
         "lightning_dev_version", [lightning_backend_dev, lightning_backend_batch_obs_dev]
diff --git a/tests/test_expval.py b/tests/test_expval.py
index 5fb6e7fd2a..87c65bbb0b 100644
--- a/tests/test_expval.py
+++ b/tests/test_expval.py
@@ -158,12 +158,15 @@ def circuit():
 
             circ = qml.QNode(circuit, dev)
             circ_def = qml.QNode(circuit, dev_def)
-            if device_name == "lightning.tensor" and n_wires > 1:
-                with pytest.raises(
-                    ValueError,
-                    match="The number of Hermitian observables target wires should be 1.",
-                ):
-                    assert np.allclose(circ(), circ_def(), tol)
+            if device_name == "lightning.tensor":
+                if n_wires > 1:
+                    with pytest.raises(
+                        ValueError,
+                        match="The number of Hermitian observables target wires should be 1.",
+                    ):
+                        assert np.allclose(circ(), circ_def(), tol)
+                else:
+                    np.allclose(circ(), circ_def(), rtol=1e-6)
             else:
                 assert np.allclose(circ(), circ_def(), tol)
 
diff --git a/tests/test_measurements.py b/tests/test_measurements.py
index c2bf5d8307..0fdb3fafa3 100644
--- a/tests/test_measurements.py
+++ b/tests/test_measurements.py
@@ -50,10 +50,6 @@ def circuit(x):
         circuit(0.65)
 
 
-@pytest.mark.skipif(
-    device_name == "lightning.tensor",
-    reason="lightning.tensor does not support qml.probs()",
-)
 class TestProbs:
     """Test Probs in Lightning devices"""
 
@@ -155,8 +151,8 @@ def circuit():
             _ = circuit()
 
     @pytest.mark.skipif(
-        device_name == "lightning.gpu",
-        reason="lightning.gpu does not support out of order prob.",
+        device_name == "lightning.gpu" or device_name == "lightning.tensor",
+        reason="lightning.gpu/lightning.tensor does not support out of order prob.",
     )
     @pytest.mark.parametrize(
         "cases",
diff --git a/tests/test_templates.py b/tests/test_templates.py
index 4774a12496..3f242d7af5 100644
--- a/tests/test_templates.py
+++ b/tests/test_templates.py
@@ -30,11 +30,9 @@
 class TestGrover:
     """Test Grover's algorithm (multi-controlled gates, decomposition, etc.)"""
 
-    @pytest.mark.skipif(
-        device_name == "lightning.tensor",
-        reason="lightning.tensor does not support multi-controlled gates and probs()",
+    @pytest.mark.parametrize(
+        "n_qubits", range(4, 8) if device_name != "lightning.tensor" else range(4, 6)
     )
-    @pytest.mark.parametrize("n_qubits", range(4, 8))
     def test_grover(self, n_qubits):
         np.random.seed(42)
         omega = np.random.rand(n_qubits) > 0.5
@@ -728,11 +726,7 @@ def circuit():
                 estimation_wires=estimation_wires,
             )
 
-            return (
-                qml.probs(estimation_wires)
-                if device_name != "lightning.tensor"
-                else qml.expval(qml.PauliZ(0))
-            )  # lightning.tensor does not support qml.probs()
+            return qml.probs(estimation_wires)
 
         res = qml.QNode(circuit, dev, diff_method=None)()
         ref = qml.QNode(circuit, dq, diff_method=None)()