diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md index 1d5e2ccb81..78bc16cef0 100644 --- a/.github/CHANGELOG.md +++ b/.github/CHANGELOG.md @@ -8,6 +8,9 @@ * Add a new dispatch mechanism for future kernels. [(#291)](https://github.com/PennyLaneAI/pennylane-lightning/pull/291) +* Support qml.state() in vjp and Hamiltonian in adjoint jacobian. +[(#294)](https://github.com/PennyLaneAI/pennylane-lightning/pull/294) + ### Breaking changes * Codebase is now moving to C++20. The default compiler for Linux is now GCC10. diff --git a/.gitignore b/.gitignore index 0423bef437..81d77afb50 100644 --- a/.gitignore +++ b/.gitignore @@ -3,9 +3,9 @@ doc/_build/ PennyLane_Lightning.egg-info/ build/ Build/ -BuildBench/ BuildGBench/ BuildTests/ +BuildTidy/ dist/ tests/__pycache__/ .idea diff --git a/Makefile b/Makefile index 0ef33bda43..c74efa2fec 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,7 @@ docs: clean-docs: $(MAKE) -C doc clean -.PHONY : test-builtin test-suite test-python coverage test-cpp +.PHONY : test-builtin test-suite test-python coverage test-cpp test-cpp-no-omp test-cpp-blas test-cpp-kokkos test-builtin: $(PYTHON) -I $(TESTRUNNER) @@ -86,9 +86,9 @@ test-cpp-blas: cmake --build ./BuildTests --target runner cmake --build ./BuildTests --target test -test-cpp-omp: +test-cpp-no-omp: rm -rf ./BuildTests - cmake $(LIGHTNING_CPP_DIR) -BBuildTests -DBUILD_TESTS=ON -DENABLE_OPENMP=ON + cmake $(LIGHTNING_CPP_DIR) -BBuildTests -DBUILD_TESTS=ON -DENABLE_OPENMP=OFF cmake --build ./BuildTests --target runner cmake --build ./BuildTests --target test diff --git a/doc/conf.py b/doc/conf.py index cc2249239e..044dbb72ef 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -120,7 +120,7 @@ def __getattr__(cls, name): # TIP: if using the sphinx-bootstrap-theme, you need # "treeViewIsBootstrap": True, "exhaleExecutesDoxygen": True, - "exhaleDoxygenStdin": ("INPUT = " + " ".join(CPP_FILES) + " " "EXCLUDE_SYMBOLS = std::* "), + "exhaleDoxygenStdin": ("INPUT = " + " ".join(CPP_FILES) + "\nEXCLUDE_SYMBOLS = std::* "), "afterTitleDescription": inspect.cleandoc( """ The Pennylane Lightning C++ API is intended to be called from Python through Pybind11. Direct use of the C++ API is currently unsupported and is provided for reference only. diff --git a/pennylane_lightning/_serialize.py b/pennylane_lightning/_serialize.py index f3d464d160..309484ad3d 100644 --- a/pennylane_lightning/_serialize.py +++ b/pennylane_lightning/_serialize.py @@ -35,85 +35,109 @@ try: from .lightning_qubit_ops import ( StateVectorC64, - ObsStructC64, StateVectorC128, - ObsStructC128, + ) + from .lightning_qubit_ops.adjoint_diff import ( + NamedObsC64, + NamedObsC128, + HermitianObsC64, + HermitianObsC128, + TensorProdObsC64, + TensorProdObsC128, + HamiltonianC64, + HamiltonianC128, + OpsStructC64, + OpsStructC128, ) except ImportError: pass -def _obs_has_kernel(obs: Observable) -> bool: +def _obs_has_kernel(ob: Observable) -> bool: """Returns True if the input observable has a supported kernel in the C++ backend. 
Args: - obs (Observable): the input observable + ob (Observable): the input observable Returns: bool: indicating whether ``obs`` has a dedicated kernel in the backend """ - if is_pauli_word(obs): + if is_pauli_word(ob): return True - if isinstance(obs, (Hadamard, Projector)): + if isinstance(ob, (Hadamard, Projector)): return True - if isinstance(obs, Tensor): - return all(_obs_has_kernel(o) for o in obs.obs) + if isinstance(ob, Tensor): + return all(_obs_has_kernel(o) for o in ob.obs) return False -def _serialize_obs(tape: QuantumTape, wires_map: dict, use_csingle: bool = False) -> List: - """Serializes the observables of an input tape. - - Args: - tape (QuantumTape): the input quantum tape - wires_map (dict): a dictionary mapping input wires to the device's backend wires - use_csingle (bool): whether to use np.complex64 instead of np.complex128 - - Returns: - list(ObsStructC128 or ObsStructC64): A list of observable objects compatible with the C++ backend - """ - obs = [] +def _serialize_named_hermitian_ob(o, wires_map: dict, use_csingle: bool): + """Serializes an observable (Named or Hermitian)""" + assert not isinstance(o, Tensor) if use_csingle: ctype = np.complex64 - obs_py = ObsStructC64 + named_obs = NamedObsC64 + hermitian_obs = HermitianObsC64 else: ctype = np.complex128 - obs_py = ObsStructC128 + named_obs = NamedObsC128 + hermitian_obs = HermitianObsC128 - for o in tape.observables: - is_tensor = isinstance(o, Tensor) + wires_list = o.wires.tolist() + wires = [wires_map[w] for w in wires_list] + if _obs_has_kernel(o): + return named_obs(o.name, wires) + return hermitian_obs(qml.matrix(o).ravel().astype(ctype), wires) - wires = [] - if is_tensor: - for o_ in o.obs: - wires_list = o_.wires.tolist() - w = [wires_map[w] for w in wires_list] - wires.append(w) - else: - wires_list = o.wires.tolist() - w = [wires_map[w] for w in wires_list] - wires.append(w) +def _serialize_tensor_ob(ob, wires_map: dict, use_csingle: bool): + """Serialize a tensor observable""" + assert isinstance(ob, Tensor) - name = o.name if is_tensor else [o.name] + if use_csingle: + tensor_obs = TensorProdObsC64 + else: + tensor_obs = TensorProdObsC128 - params = [] + return tensor_obs([_serialize_ob(o, wires_map, use_csingle) for o in ob.obs]) - if not _obs_has_kernel(o): - if is_tensor: - for o_ in o.obs: - if not _obs_has_kernel(o_): - params.append(qml.matrix(o_).ravel().astype(ctype)) - else: - params.append([]) - else: - params.append(qml.matrix(o).ravel().astype(ctype)) - ob = obs_py(name, params, wires) - obs.append(ob) +def _serialize_hamiltonian(ob, wires_map: dict, use_csingle: bool): + if use_csingle: + rtype = np.float32 + hamiltonian_obs = HamiltonianC64 + else: + rtype = np.float64 + hamiltonian_obs = HamiltonianC128 + + coeffs = np.array(ob.coeffs).astype(rtype) + terms = [_serialize_ob(t, wires_map, use_csingle) for t in ob.ops] + return hamiltonian_obs(coeffs, terms) + + +def _serialize_ob(ob, wires_map, use_csingle): + if isinstance(ob, Tensor): + return _serialize_tensor_ob(ob, wires_map, use_csingle) + elif ob.name == "Hamiltonian": + return _serialize_hamiltonian(ob, wires_map, use_csingle) + else: + return _serialize_named_hermitian_ob(ob, wires_map, use_csingle) + + +def _serialize_observables(tape: QuantumTape, wires_map: dict, use_csingle: bool = False) -> List: + """Serializes the observables of an input tape. 
+
+    Args:
+        tape (QuantumTape): the input quantum tape
+        wires_map (dict): a dictionary mapping input wires to the device's backend wires
+        use_csingle (bool): whether to use np.complex64 instead of np.complex128
+
+    Returns:
+        list: a list of observable objects compatible with the C++ backend
+    """
-    return obs
+    return [_serialize_ob(ob, wires_map, use_csingle) for ob in tape.observables]


 def _serialize_ops(
diff --git a/pennylane_lightning/_version.py b/pennylane_lightning/_version.py
index ed26d5505b..39fff18c09 100644
--- a/pennylane_lightning/_version.py
+++ b/pennylane_lightning/_version.py
@@ -16,4 +16,4 @@
 Version number (major.minor.patch[-label])
 """
-__version__ = "0.24.0-dev15"
+__version__ = "0.24.0-dev16"
diff --git a/pennylane_lightning/lightning_qubit.py b/pennylane_lightning/lightning_qubit.py
index eaa3733581..71dc4d67ff 100644
--- a/pennylane_lightning/lightning_qubit.py
+++ b/pennylane_lightning/lightning_qubit.py
@@ -34,8 +34,8 @@
     DeviceError,
 )
 from pennylane.devices import DefaultQubit
-from pennylane.operation import Tensor
-from pennylane.measurements import Expectation
+from pennylane.operation import Tensor, Operation
+from pennylane.measurements import MeasurementProcess, Expectation, State
 from pennylane.wires import Wires

 # Remove after the next release of PL
@@ -46,21 +46,18 @@
 try:
     from .lightning_qubit_ops import (
+        adjoint_diff,
         MeasuresC64,
         StateVectorC64,
-        AdjointJacobianC64,
-        VectorJacobianProductC64,
         MeasuresC128,
         StateVectorC128,
-        AdjointJacobianC128,
-        VectorJacobianProductC128,
+        Kokkos_info,
         allocate_aligned_array,
         get_alignment,
         best_alignment,
-        Kokkos_info,
     )
-    from ._serialize import _serialize_obs, _serialize_ops
+    from ._serialize import _serialize_observables, _serialize_ops

     CPP_BINARY_AVAILABLE = True
 except ModuleNotFoundError:
@@ -90,11 +87,13 @@ class LightningQubit(DefaultQubit):

     Args:
         wires (int): the number of wires to initialize the device with
+        c_dtype: Datatypes for statevector representation. Must be one of ``np.complex64`` or ``np.complex128``.
         shots (int): How many times the circuit should be evaluated (or sampled) to estimate
            the expectation values. Defaults to ``None`` if not specified. Setting
            to ``None`` results in computing statistics like expectation values and variances analytically.
-        c_dtype: Datatypes for statevector representation. Must be one of ``np.complex64`` or ``np.complex128``.
+        batch_obs (bool): Determine whether to process observables in parallel when computing
+            the Jacobian. This value is only relevant when Lightning is built with OpenMP.
     """

     name = "Lightning Qubit PennyLane plugin"
@@ -223,47 +222,122 @@ def apply_lightning(self, state, operations):

         return np.reshape(state_vector, state.shape)

-    def adjoint_diff_support_check(self, tape):
-        """Check Lightning adjoint differentiation method support for a tape.
-
-        Raise ``QuantumFunctionError`` if ``tape`` contains not supported measurements,
-        observables, or operations by the Lightning adjoint differentiation method.
+    @staticmethod
+    def _check_adjdiff_supported_measurements(measurements: List[MeasurementProcess]):
+        """Check whether a given list of measurements is supported by the adjoint differentiation method.

         Args:
-            tape (.QuantumTape): quantum tape to differentiate
+            measurements (List[MeasurementProcess]): a list of measurement processes to check.
+
+        Returns:
+            Expectation or State: the common return type of the measurements.
""" - for m in tape.measurements: - if m.return_type is not Expectation: - raise QuantumFunctionError( - "Adjoint differentiation method does not support" - f" measurement {m.return_type.value}" - ) + if len(measurements) == 0: + return None + + if len(measurements) == 1 and measurements[0].return_type is State: + return State + + # Now the return_type of measurement processes must be expectation + if not all([m.return_type is Expectation for m in measurements]): + raise QuantumFunctionError( + "Adjoint differentiation method does not support expectation return type " + "mixed with other return types" + ) + + for m in measurements: if not isinstance(m.obs, Tensor): if isinstance(m.obs, Projector): raise QuantumFunctionError( "Adjoint differentiation method does not support the Projector observable" ) - if isinstance(m.obs, Hermitian): - raise QuantumFunctionError( - "Lightning adjoint differentiation method does not currently support the Hermitian observable" - ) else: if any([isinstance(o, Projector) for o in m.obs.non_identity_obs]): raise QuantumFunctionError( "Adjoint differentiation method does not support the Projector observable" ) - if any([isinstance(o, Hermitian) for o in m.obs.non_identity_obs]): - raise QuantumFunctionError( - "Lightning adjoint differentiation method does not currently support the Hermitian observable" - ) + return Expectation + + @staticmethod + def _check_adjdiff_supported_operations(operations): + """Check Lightning adjoint differentiation method support for a tape. - for op in tape.operations: + Raise ``QuantumFunctionError`` if ``tape`` contains not supported measurements, + observables, or operations by the Lightning adjoint differentiation method. + + Args: + tape (.QuantumTape): quantum tape to differentiate + + """ + for op in operations: if op.num_params > 1 and not isinstance(op, Rot): raise QuantumFunctionError( f"The {op.name} operation is not supported using " 'the "adjoint" differentiation method' ) + def _process_jacobian_tape(self, tape, starting_state, use_device_state): + # To support np.complex64 based on the type of self._state + if self.use_csingle: + create_ops_list = adjoint_diff.create_ops_list_C64 + else: + create_ops_list = adjoint_diff.create_ops_list_C128 + + # Initialization of state + if starting_state is not None: + if starting_state.size != 2 ** len(self.wires): + raise QuantumFunctionError( + "The number of qubits of starting_state must be the same as " + "that of the device." + ) + ket = self._asarray(starting_state, dtype=self.C_DTYPE) + else: + if not use_device_state: + self.reset() + self.apply(tape.operations) + ket = self._pre_rotated_state + + obs_serialized = _serialize_observables(tape, self.wire_map, use_csingle=self.use_csingle) + ops_serialized, use_sp = _serialize_ops(tape, self.wire_map) + + ops_serialized = create_ops_list(*ops_serialized) + + # We need to filter out indices in trainable_params which do not + # correspond to operators. 
+        trainable_params = sorted(tape.trainable_params)
+        if len(trainable_params) == 0:
+            return None
+
+        tp_shift = []
+        record_tp_rows = []
+        all_params = 0
+
+        for op_idx, tp in enumerate(trainable_params):
+            op, _ = tape.get_operation(
+                op_idx
+            )  # get the op_idx-th operator among the trainable operations
+            if isinstance(op, Operation) and not isinstance(op, (BasisState, QubitStateVector)):
+                # Keep only gate operations; state preparations and non-operations are ignored
+                tp_shift.append(tp)
+                record_tp_rows.append(all_params)
+            all_params += 1
+
+        if use_sp:
+            # When the tape starts with a state preparation, operation indices are
+            # shifted down by one. (We assume at most one state preparation, placed
+            # at the start of the tape.)
+            tp_shift = [i - 1 for i in tp_shift]
+
+        ket = ket.reshape(-1)
+        state_vector = StateVectorC64(ket) if self.use_csingle else StateVectorC128(ket)
+        return {
+            "state_vector": state_vector,
+            "obs_serialized": obs_serialized,
+            "ops_serialized": ops_serialized,
+            "tp_shift": tp_shift,
+            "record_tp_rows": record_tp_rows,
+            "all_params": all_params,
+        }
+
     def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
         if self.shots is not None:
             warn(
@@ -272,39 +346,25 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False):
                 UserWarning,
             )

-        if len(tape.trainable_params) == 0:
-            return np.array(0)
-
-        # Check adjoint diff support
-        self.adjoint_diff_support_check(tape)
+        tape_return_type = self._check_adjdiff_supported_measurements(tape.measurements)

-        # Initialization of state
-        if starting_state is not None:
-            ket = np.ravel(starting_state)
-        else:
-            if not use_device_state:
-                self.reset()
-                self.execute(tape)
-            ket = np.ravel(self._pre_rotated_state)
+        if not tape_return_type:  # the tape does not have measurements
+            return np.array([], dtype=self._state.dtype)

-        if self.use_csingle:
-            adj = AdjointJacobianC64()
-        else:
-            adj = AdjointJacobianC128()
+        if tape_return_type is State:
+            raise QuantumFunctionError(
+                "This method does not support statevector return type. "
+                "Use vjp method instead for this purpose."
+            )

-        obs_serialized = _serialize_obs(tape, self.wire_map, use_csingle=self.use_csingle)
-        ops_serialized, use_sp = _serialize_ops(tape, self.wire_map)
+        self._check_adjdiff_supported_operations(tape.operations)

-        ops_serialized = adj.create_ops_list(*ops_serialized)
+        processed_data = self._process_jacobian_tape(tape, starting_state, use_device_state)

-        trainable_params = sorted(tape.trainable_params)
-        first_elem = 1 if trainable_params[0] == 0 else 0
+        if not processed_data:  # trainable_params is empty
+            return np.array([], dtype=self._state.dtype)

-        tp_shift = (
-            trainable_params if not use_sp else [i - 1 for i in trainable_params[first_elem:]]
-        )  # exclude first index if explicitly setting sv
-
-        state_vector = StateVectorC64(ket) if self.use_csingle else StateVectorC128(ket)
+        trainable_params = processed_data["tp_shift"]

         # If requested batching over observables, chunk into OMP_NUM_THREADS sized chunks.
# This will allow use of Lightning with adjoint for large-qubit numbers AND large @@ -312,88 +372,59 @@ def adjoint_jacobian(self, tape, starting_state=None, use_device_state=False): requested_threads = int(getenv("OMP_NUM_THREADS", "1")) if self._batch_obs and requested_threads > 1: - obs_partitions = _chunk_iterable(obs_serialized, requested_threads) + obs_partitions = _chunk_iterable(processed_data["obs_serialized"], requested_threads) jac = [] for obs_chunk in obs_partitions: - jac_local = adj.adjoint_jacobian( - state_vector, + jac_local = adjoint_diff.adjoint_jacobian( + processed_data["state_vector"], obs_chunk, - ops_serialized, - tp_shift, - tape.num_params, + processed_data["ops_serialized"], + trainable_params, ) jac.extend(jac_local) - jac = np.array(jac) else: - jac = adj.adjoint_jacobian( - state_vector, - obs_serialized, - ops_serialized, - tp_shift, - tape.num_params, + jac = adjoint_diff.adjoint_jacobian( + processed_data["state_vector"], + processed_data["obs_serialized"], + processed_data["ops_serialized"], + trainable_params, ) - return jac.reshape(-1, tape.num_params) + jac = np.array(jac) + jac = jac.reshape(-1, len(trainable_params)) + jac_r = np.zeros((jac.shape[0], processed_data["all_params"])) + jac_r[:, processed_data["record_tp_rows"]] = jac + return jac_r - def compute_vjp(self, dy, jac, num=None): - """Convenience function to compute the vector-Jacobian product for a given - vector of gradient outputs and a Jacobian. - Args: - dy (tensor_like): vector of gradient outputs - jac (tensor_like): Jacobian matrix. For an n-dimensional ``dy`` - vector, the first n-dimensions of ``jac`` should match - the shape of ``dy``. - Keyword Args: - num (int): The length of the flattened ``dy`` argument. This is an - optional argument, but can be useful to provide if ``dy`` potentially - has no shape (for example, due to tracing or just-in-time compilation). - Returns: - tensor_like: the vector-Jacobian product - """ - if jac is None: - return None + def vjp(self, measurements, dy, starting_state=None, use_device_state=False): + """Generate the processing function required to compute the vector-Jacobian products of a tape. - if not isinstance(dy, np.ndarray) or not isinstance(jac, np.ndarray): - return gradients.compute_vjp(dy, jac) + This function can be used with multiple expectation values or a quantum state. When a quantum state + is given, - dy_row = math.reshape(dy, [-1]) + .. code-block:: python - if num is None: - num = math.shape(dy_row)[0] + vjp_f = dev.vjp([qml.state()], dy) + vjp = vjp_f(tape) - jac = math.reshape(jac, [num, -1]) - num_params = jac.shape[1] + computes :math:`w = (w_1,\cdots,w_m)` where - if math.allclose(dy, 0): - return math.convert_like(np.zeros([num_params]), dy) + .. math:: - if self.use_csingle: - VJP = VectorJacobianProductC64() - else: - VJP = VectorJacobianProductC128() + w_k = \\langle v| \\frac{\partial}{\partial \\theta_k} | \psi_{\pmb{\\theta}} \\rangle. - vjp_tensor = VJP.compute_vjp_from_jac( - math.reshape(jac, [-1]), - dy_row, - num, - num_params, - ) - return vjp_tensor + Here, :math:`m` is the total number of trainable parameters, :math:`\pmb{\\theta}` is the vector of trainable parameters and :math:`\psi_{\pmb{\\theta}}` + is the output quantum state. - def vjp(self, tape, dy, starting_state=None, use_device_state=False): - """Generate the processing function required to compute the vector-Jacobian products of a tape. Args: - tape (.QuantumTape): quantum tape to differentiate - dy (tensor_like): Gradient-output vector. 
Must have shape
-            matching the output shape of the corresponding tape.
-        Keyword Args:
+            measurements (list): List of measurement processes in the vector-Jacobian product.
+                Currently, only expectation values or a single quantum state are supported.
+            dy (tensor_like): Gradient-output vector. Must have shape matching the output
+                shape of the corresponding tape, i.e., the number of measurements if the
+                return type is expectation, or :math:`2^N` if the return type is statevector
             starting_state (tensor_like): post-forward pass state to start execution with. It
                 should be complex-valued. Takes precedence over ``use_device_state``.
             use_device_state (bool): use current device state to initialize. A forward pass of
                 the same circuit should be the last thing the device has executed. If a
                 ``starting_state`` is provided, that takes precedence.
         Returns:
-            The processing function required to compute the vector-Jacobian
-            products of a tape.
+            The processing function required to compute the vector-Jacobian products of a tape.
         """
         if self.shots is not None:
             warn(
@@ -402,48 +433,61 @@ def vjp(self, tape, dy, starting_state=None, use_device_state=False):
                 UserWarning,
             )

-        num_params = len(tape.trainable_params)
+        tape_return_type = self._check_adjdiff_supported_measurements(measurements)

-        if num_params == 0:
-            return lambda _: None
+        if math.allclose(dy, 0) or tape_return_type is None:
+            return lambda tape: math.convert_like(np.zeros(len(tape.trainable_params)), dy)

-        if math.allclose(dy, 0):
-            return lambda _: math.convert_like(np.zeros([num_params]), dy)
+        if tape_return_type is Expectation:
+            if len(dy) != len(measurements):
+                raise ValueError(
+                    "Number of observables in the tape must be the same as the length of dy in the vjp method"
+                )

-        V = VectorJacobianProductC64() if self.use_csingle else VectorJacobianProductC128()
+            if np.iscomplexobj(dy):
+                raise ValueError(
+                    "The vjp method only works with a real-valued dy when the tape is returning an expectation value"
+                )

-        fn = V.vjp_fn(math.reshape(dy, [-1]), tape.num_params)
+            ham = qml.Hamiltonian(dy, [m.obs for m in measurements])

-        def processing_fn(tape):
-            # Check adjoint diff support
-            self.adjoint_diff_support_check(tape)
+            def processing_fn(tape):
+                nonlocal ham
+                num_params = len(tape.trainable_params)

-            # Initialization of state
-            if starting_state is not None:
-                ket = np.ravel(starting_state)
-            else:
-                if not use_device_state:
-                    self.reset()
-                    self.execute(tape)
-                ket = np.ravel(self._pre_rotated_state)
+                if num_params == 0:
+                    return np.array([], dtype=self._state.dtype)

-            obs_serialized = _serialize_obs(tape, self.wire_map, use_csingle=self.use_csingle)
-            ops_serialized, use_sp = _serialize_ops(tape, self.wire_map)
+                new_tape = tape.copy()
+                new_tape._measurements = [qml.expval(ham)]

-            ops_serialized = V.create_ops_list(*ops_serialized)
+                return self.adjoint_jacobian(new_tape, starting_state, use_device_state).reshape(-1)

-            trainable_params = sorted(tape.trainable_params)
-            first_elem = 1 if trainable_params[0] == 0 else 0
+            return processing_fn

-            tp_shift = (
-                trainable_params if not use_sp else [i - 1 for i in trainable_params[first_elem:]]
-            )  # exclude first index if explicitly setting sv
+        if tape_return_type is State:
+            if len(dy) != 2 ** len(self.wires):
+                raise ValueError(
+                    "Size of the provided vector dy must be the same as the size of the statevector"
+                )
+            if np.isrealobj(dy):
+                warn(
+                    "The vjp method only works with complex-valued dy when the tape is returning a statevector. Upcasting dy."
+                )

-            state_vector = StateVectorC64(ket) if self.use_csingle else StateVectorC128(ket)
+            dy = dy.astype(self.C_DTYPE)

-            return fn(state_vector, obs_serialized, ops_serialized, tp_shift)
+            def processing_fn(tape):
+                nonlocal dy
+                processed_data = self._process_jacobian_tape(tape, starting_state, use_device_state)
+                return adjoint_diff.statevector_vjp(
+                    processed_data["state_vector"],
+                    processed_data["ops_serialized"],
+                    dy,
+                    processed_data["tp_shift"],
+                )

-        return processing_fn
+            return processing_fn

     def batch_vjp(
         self, tapes, dys, reduction="append", starting_state=None, use_device_state=False
@@ -473,7 +517,10 @@ def batch_vjp(
         # Loop through the tapes and dys vector
         for tape, dy in zip(tapes, dys):
             fn = self.vjp(
-                tape, dy, starting_state=starting_state, use_device_state=use_device_state
+                tape.measurements,
+                dy,
+                starting_state=starting_state,
+                use_device_state=use_device_state,
             )
             fns.append(fn)

@@ -482,11 +529,6 @@ def processing_fns(tapes):
             for t, f in zip(tapes, fns):
                 vjp = f(t)

-                if vjp is None:
-                    if reduction == "append":
-                        vjps.append(None)
-                    continue
-
                 if isinstance(reduction, str):
                     getattr(vjps, reduction)(vjp)
                 elif callable(reduction):
@@ -545,7 +587,7 @@ def generate_samples(self):
         state_vector = StateVectorC64(ket) if self.use_csingle else StateVectorC128(ket)
         M = MeasuresC64(state_vector) if self.use_csingle else MeasuresC128(state_vector)

-        return M.generate_samples(len(self.wires), self.shots).astype(int)
+        return M.generate_samples(len(self.wires), self.shots).astype(int, copy=False)

     def expval(self, observable, shot_range=None, bin_size=None):
         """Expectation value of the supplied observable.
diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.cpp b/pennylane_lightning/src/algorithms/AdjointDiff.cpp
index ae23f00eac..dcbc9377e3 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.cpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.cpp
@@ -15,5 +15,9 @@
 #include "AdjointDiff.hpp"

 // explicit instantiation
-template class Pennylane::Algorithms::AdjointJacobian<float>;
-template class Pennylane::Algorithms::AdjointJacobian<double>;
\ No newline at end of file
+template void Pennylane::Algorithms::adjointJacobian<float>(
+    std::span<float> jac, const JacobianData<float> &jd, bool apply_operations);
+template void
+Pennylane::Algorithms::adjointJacobian<double>(std::span<double> jac,
+                                               const JacobianData<double> &jd,
+                                               bool apply_operations);
diff --git a/pennylane_lightning/src/algorithms/AdjointDiff.hpp b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
index 58186ea4aa..402c00010c 100644
--- a/pennylane_lightning/src/algorithms/AdjointDiff.hpp
+++ b/pennylane_lightning/src/algorithms/AdjointDiff.hpp
@@ -11,399 +11,145 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
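As an illustration of the Python-side changes above, here is a minimal, hedged sketch of differentiating a `qml.Hamiltonian` expectation with the adjoint method on `lightning.qubit`. The circuit, parameter values, and coefficients are illustrative placeholders; only the entry points (`qml.Hamiltonian`, `dev.adjoint_jacobian`) are the ones touched by this diff.

```python
import pennylane as qml

dev = qml.device("lightning.qubit", wires=2)

# Hamiltonian observables are now serialized to HamiltonianC64/C128 and
# differentiated directly by the adjoint method (no Hermitian restriction).
ham = qml.Hamiltonian([0.3, 0.7], [qml.PauliZ(0), qml.PauliX(0) @ qml.PauliZ(1)])

with qml.tape.QuantumTape() as tape:
    qml.RX(0.4, wires=0)
    qml.CNOT(wires=[0, 1])
    qml.RY(0.6, wires=1)
    qml.expval(ham)

tape.trainable_params = [0, 1]  # differentiate w.r.t. both rotation angles
jac = dev.adjoint_jacobian(tape)  # one row per measurement, one column per parameter
```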
+/** + * @file + * Represent the logic for the adjoint Jacobian method of arXiv:2009.02823 + */ #pragma once +#include "AlgUtil.hpp" +#include "Error.hpp" +#include "JacobianTape.hpp" +#include "LinearAlgebra.hpp" +#include "StateVectorManagedCPU.hpp" #include -#include #include -#include +#include +#include #include #include -#include #include -#include "DynamicDispatcher.hpp" -#include "Error.hpp" -#include "JacobianTape.hpp" -#include "LinearAlgebra.hpp" -#include "StateVectorManagedCPU.hpp" - -#include - -/// @cond DEV -namespace { - -using namespace Pennylane; -using namespace Pennylane::Util; - -} // namespace -/// @endcond - namespace Pennylane::Algorithms { /** - * @brief Represent the logic for the adjoint Jacobian method of - * arXiV:2009.02823 + * @brief Calculates the Jacobian for the statevector for the selected set + * of parametric gates. + * + * For the statevector data associated with `psi` of length `num_elements`, + * we make internal copies to a `%StateVectorManagedCPU` object, with one + * per required observable. The `operations` will be applied to the internal + * statevector copies, with the operation indices participating in the + * gradient calculations given in `trainableParams`, and the overall number + * of parameters for the gradient calculation provided within `num_params`. + * The resulting row-major ordered `jac` matrix representation will be of + * size `jd.getSizeStateVec() * jd.getObservables().size()`. OpenMP is used + * to enable independent operations to be offloaded to threads. * - * @tparam T Floating-point precision. + * @param jac Preallocated vector for Jacobian data results. + * @param jd JacobianData represents the QuantumTape to differentiate. + * @param apply_operations Indicate whether to apply operations to tape.psi + * prior to calculation. */ -template class AdjointJacobian { - private: - using GeneratorFunc = void (*)(StateVectorManagedCPU &, - const std::vector &, - const bool); // function pointer type - - /** - * @brief Utility method to update the Jacobian at a given index by - * calculating the overlap between two given states. - * - * @param sv1 Statevector - * @param jac Jacobian receiving the values. - * @param scaling_coeff Generator coefficient for given gate derivative. - * @param obs_index Observable index position of Jacobian to update. - * @param param_index Parameter index position of Jacobian to update. - */ - inline void updateJacobian(const StateVectorManagedCPU &sv1, - const StateVectorManagedCPU &sv2, - std::vector> &jac, - T scaling_coeff, size_t obs_index, - size_t param_index) { - jac[obs_index][param_index] = - -2 * scaling_coeff * - std::imag(innerProdC(sv1.getDataVector(), sv2.getDataVector())); +template +void adjointJacobian(std::span jac, const JacobianData &jd, + bool apply_operations = false) { + const OpsData &ops = jd.getOperations(); + const std::vector &ops_name = ops.getOpsName(); + + const auto &obs = jd.getObservables(); + const size_t num_observables = obs.size(); + + // We can assume the trainable params are sorted (from Python) + const std::vector &tp = jd.getTrainableParams(); + const size_t tp_size = tp.size(); + const size_t num_param_ops = ops.getNumParOps(); + + if (!jd.hasTrainableParams()) { + return; } - /** - * @brief Utility method to apply all operations from given `%OpsData` - * object to `%StateVectorManagedCPU` - * - * @param state Statevector to be updated. - * @param operations Operations to apply. - * @param adj Take the adjoint of the given operations. 
- */ - inline void applyOperations(StateVectorManagedCPU &state, - const OpsData &operations, - bool adj = false) { - for (size_t op_idx = 0; op_idx < operations.getOpsName().size(); - op_idx++) { - state.applyOperation(operations.getOpsName()[op_idx], - operations.getOpsWires()[op_idx], - operations.getOpsInverses()[op_idx] ^ adj, - operations.getOpsParams()[op_idx]); - } - } - /** - * @brief Utility method to apply the adjoint indexed operation from - * `%OpsData` object to `%StateVectorManagedCPU`. - * - * @param state Statevector to be updated. - * @param operations Operations to apply. - * @param op_idx Adjointed operation index to apply. - */ - inline void applyOperationAdj(StateVectorManagedCPU &state, - const OpsData &operations, size_t op_idx) { - state.applyOperation(operations.getOpsName()[op_idx], - operations.getOpsWires()[op_idx], - !operations.getOpsInverses()[op_idx], - operations.getOpsParams()[op_idx]); - } + PL_ABORT_IF_NOT(jac.size() == tp_size * num_observables, + "The size of preallocated jacobian must be same as " + "the number of trainable parameters times the number of " + "observables provided."); - /** - * @brief Utility method to apply a given operations from given - * `%ObsDatum` object to `%StateVectorManagedCPU` - * - * @param state Statevector to be updated. - * @param observable Observable to apply. - */ - inline void applyObservable(StateVectorManagedCPU &state, - const ObsDatum &observable) { - using namespace Pennylane::Util; - for (size_t j = 0; j < observable.getSize(); j++) { - if (!observable.getObsParams().empty()) { - std::visit( - [&](const auto ¶m) { - using p_t = std::decay_t; - // Apply supported gate with given params - if constexpr (std::is_same_v>) { - state.applyOperation(observable.getObsName()[j], - observable.getObsWires()[j], - false, param); - } - // Apply provided matrix - else if constexpr (std::is_same_v< - p_t, - std::vector>>) { - state.applyMatrix( - param, observable.getObsWires()[j], false); - } else { - state.applyOperation(observable.getObsName()[j], - observable.getObsWires()[j], - false); - } - }, - observable.getObsParams()[j]); - } else { // Offload to SV dispatcher if no parameters provided - state.applyOperation(observable.getObsName()[j], - observable.getObsWires()[j], false); - } - } - } + // Track positions within par and non-par operations + size_t trainableParamNumber = tp_size - 1; + size_t current_param_idx = + num_param_ops - 1; // total number of parametric ops - /** - * @brief OpenMP accelerated application of observables to given - * statevectors - * - * @param states Vector of statevector copies, one per observable. - * @param reference_state Reference statevector - * @param observables Vector of observables to apply to each statevector. - */ - inline void - applyObservables(std::vector> &states, - const StateVectorManagedCPU &reference_state, - const std::vector> &observables) { - // clang-format off - // Globally scoped exception value to be captured within OpenMP block. 
- // See the following for OpenMP design decisions: - // https://www.openmp.org/wp-content/uploads/openmp-examples-4.5.0.pdf - std::exception_ptr ex = nullptr; - size_t num_observables = observables.size(); - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(states, reference_state, observables, ex, num_observables) - { - #pragma omp for - #endif - for (size_t h_i = 0; h_i < num_observables; h_i++) { - try { - states[h_i].updateData(reference_state.getDataVector()); - applyObservable(states[h_i], observables[h_i]); - } catch (...) { - #if defined(_OPENMP) - #pragma omp critical - #endif - ex = std::current_exception(); - #if defined(_OPENMP) - #pragma omp cancel for - #endif - } - } - #if defined(_OPENMP) - if (ex) { - #pragma omp cancel parallel - } - } - #endif - if (ex) { - std::rethrow_exception(ex); //LCOV_EXCL_LINE - } - // clang-format on - } - - /** - * @brief OpenMP accelerated application of adjoint operations to - * statevectors. - * - * @param states Vector of all statevectors; 1 per observable - * @param operations Operations list. - * @param op_idx Index of given operation within operations list to take - * adjoint of. - */ - inline void - applyOperationsAdj(std::vector> &states, - const OpsData &operations, size_t op_idx) { - // clang-format off - // Globally scoped exception value to be captured within OpenMP block. - // See the following for OpenMP design decisions: - // https://www.openmp.org/wp-content/uploads/openmp-examples-4.5.0.pdf - std::exception_ptr ex = nullptr; - size_t num_states = states.size(); - #if defined(_OPENMP) - #pragma omp parallel default(none) \ - shared(states, operations, op_idx, ex, num_states) - { - #pragma omp for - #endif - for (size_t obs_idx = 0; obs_idx < num_states; obs_idx++) { - try { - applyOperationAdj(states[obs_idx], operations, op_idx); - } catch (...) { - #if defined(_OPENMP) - #pragma omp critical - #endif - ex = std::current_exception(); - #if defined(_OPENMP) - #pragma omp cancel for - #endif - } - } - #if defined(_OPENMP) - if (ex) { - #pragma omp cancel parallel - } - } - #endif - if (ex) { - std::rethrow_exception(ex); //LCOV_EXCL_LINE - } - // clang-format on - } - - /** - * @brief Inline utility to assist with getting the Jacobian index offset. - * - * @param obs_index - * @param tp_index - * @param tp_size - * @return size_t - */ - inline auto getJacIndex(size_t obs_index, size_t tp_index, size_t tp_size) - -> size_t { - return obs_index * tp_size + tp_index; - } + // Create $U_{1:p}\vert \lambda \rangle$ + StateVectorManagedCPU lambda(jd.getPtrStateVec(), jd.getSizeStateVec()); - /** - * @brief Copies complex data array into a `%vector` of the same dimension. - * - * @param input_state - * @param state_length - * @return std::vector> - */ - auto copyStateData(const std::complex *input_state, size_t state_length) - -> std::vector> { - return {input_state, input_state + state_length}; + // Apply given operations to statevector if requested + if (apply_operations) { + applyOperations(lambda, ops); } - /** - * @brief Applies the gate generator for a given parameteric gate. Returns - * the associated scaling coefficient. - * - * @param sv Statevector data to operate upon. - * @param op_name Name of parametric gate. - * @param wires Wires to operate upon. - * @param adj Indicate whether to take the adjoint of the operation. - * @return T Generator scaling coefficient. 
- */ - template - inline auto applyGenerator(StateVectorBase &sv, - const std::string &op_name, - const std::vector &wires, const bool adj) - -> T { - return sv.applyGenerator(op_name, wires, adj); - } - - public: - AdjointJacobian() = default; - - /** - * @brief Calculates the Jacobian for the statevector for the selected set - * of parametric gates. - * - * For the statevector data associated with `psi` of length `num_elements`, - * we make internal copies to a `%StateVectorManagedCPU` object, with one - * per required observable. The `operations` will be applied to the internal - * statevector copies, with the operation indices participating in the - * gradient calculations given in `trainableParams`, and the overall number - * of parameters for the gradient calculation provided within `num_params`. - * The resulting row-major ordered `jac` matrix representation will be of - * size `jd.getSizeStateVec() * jd.getObservables().size()`. OpenMP is used - * to enable independent operations to be offloaded to threads. - * - * @param jac Preallocated vector for Jacobian data results. - * @param jd JacobianData represents the QuantumTape to differentiate - * @param apply_operations Indicate whether to apply operations to tape.psi - * prior to calculation. - */ - void adjointJacobian(std::vector &jac, const JacobianData &jd, - bool apply_operations = false) { - PL_ABORT_IF(!jd.hasTrainableParams(), - "No trainable parameters provided."); - - const OpsData &ops = jd.getOperations(); - const std::vector &ops_name = ops.getOpsName(); + const auto tp_rend = tp.rend(); + auto tp_it = tp.rbegin(); - const std::vector> &obs = jd.getObservables(); - const size_t num_observables = obs.size(); + // Create observable-applied state-vectors + std::vector> H_lambda( + num_observables, StateVectorManagedCPU{lambda.getNumQubits()}); + applyObservables(H_lambda, lambda, obs); - const std::vector &tp = jd.getTrainableParams(); - const size_t tp_size = tp.size(); - const size_t num_param_ops = ops.getNumParOps(); + StateVectorManagedCPU mu(lambda.getNumQubits()); - // Track positions within par and non-par operations - size_t trainableParamNumber = tp_size - 1; - size_t current_param_idx = - num_param_ops - 1; // total number of parametric ops - - // Create $U_{1:p}\vert \lambda \rangle$ - StateVectorManagedCPU lambda(jd.getPtrStateVec(), - jd.getSizeStateVec()); - - // Apply given operations to statevector if requested - if (apply_operations) { - applyOperations(lambda, ops); + for (int op_idx = static_cast(ops_name.size() - 1); op_idx >= 0; + op_idx--) { + PL_ABORT_IF(ops.getOpsParams()[op_idx].size() > 1, + "The operation is not supported using the adjoint " + "differentiation method"); + if ((ops_name[op_idx] == "QubitStateVector") || + (ops_name[op_idx] == "BasisState")) { + continue; // Ignore them } - auto tp_it = tp.rbegin(); - const auto tp_rend = tp.rend(); - - // Create observable-applied state-vectors - std::vector> H_lambda( - num_observables, StateVectorManagedCPU{lambda.getNumQubits()}); - applyObservables(H_lambda, lambda, obs); - - StateVectorManagedCPU mu(lambda.getNumQubits()); - - for (int op_idx = static_cast(ops_name.size() - 1); op_idx >= 0; - op_idx--) { - PL_ABORT_IF(ops.getOpsParams()[op_idx].size() > 1, - "The operation is not supported using the adjoint " - "differentiation method"); - if ((ops_name[op_idx] == "QubitStateVector") || - (ops_name[op_idx] == "BasisState")) { - continue; - } - if (tp_it == tp_rend) { - break; // All done - } - mu.updateData(lambda.getDataVector()); - 
applyOperationAdj(lambda, ops, op_idx); - - if (ops.hasParams(op_idx)) { - if (current_param_idx == *tp_it) { - const T scalingFactor = - applyGenerator(mu, ops_name[op_idx], - ops.getOpsWires()[op_idx], - !ops.getOpsInverses()[op_idx]) * - (ops.getOpsInverses()[op_idx] ? -1 : 1); - - const size_t mat_row_idx = - trainableParamNumber * num_observables; - - // clang-format off - - #if defined(_OPENMP) - #pragma omp parallel for default(none) \ - shared(H_lambda, jac, mu, scalingFactor, \ - mat_row_idx, \ + if (tp_it == tp_rend) { + break; // All done + } + mu.updateData(lambda.getDataVector()); + applyOperationAdj(lambda, ops, op_idx); + + if (ops.hasParams(op_idx)) { + if (current_param_idx == *tp_it) { + // if current parameter is a trainable parameter + const T scalingFactor = + mu.applyGenerator(ops_name[op_idx], + ops.getOpsWires()[op_idx], + !ops.getOpsInverses()[op_idx]) * + (ops.getOpsInverses()[op_idx] ? -1 : 1); + + const size_t mat_row_idx = + trainableParamNumber * num_observables; + + // clang-format off + + #if defined(_OPENMP) + #pragma omp parallel for default(none) \ + shared(H_lambda, jac, mu, scalingFactor, mat_row_idx, \ num_observables) - #endif - - // clang-format on - for (size_t obs_idx = 0; obs_idx < num_observables; - obs_idx++) { - jac[mat_row_idx + obs_idx] = - -2 * scalingFactor * - std::imag( - innerProdC(H_lambda[obs_idx].getDataVector(), - mu.getDataVector())); - } - trainableParamNumber--; - ++tp_it; + #endif + // clang-format on + + for (size_t obs_idx = 0; obs_idx < num_observables; obs_idx++) { + jac[mat_row_idx + obs_idx] = + -2 * scalingFactor * + std::imag( + Util::innerProdC(H_lambda[obs_idx].getDataVector(), + mu.getDataVector())); } - current_param_idx--; + trainableParamNumber--; + ++tp_it; } - applyOperationsAdj(H_lambda, ops, static_cast(op_idx)); + current_param_idx--; } - jac = Transpose(jac, jd.getNumParams(), num_observables); + applyOperationsAdj(H_lambda, ops, static_cast(op_idx)); } -}; // class AdjointJacobian + const auto jac_transpose = + Util::Transpose(std::span{jac}, tp_size, num_observables); + std::copy(std::begin(jac_transpose), std::end(jac_transpose), + std::begin(jac)); +} } // namespace Pennylane::Algorithms diff --git a/pennylane_lightning/src/algorithms/AlgUtil.hpp b/pennylane_lightning/src/algorithms/AlgUtil.hpp new file mode 100644 index 0000000000..8e1f50a064 --- /dev/null +++ b/pennylane_lightning/src/algorithms/AlgUtil.hpp @@ -0,0 +1,180 @@ +// Copyright 2021 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "JacobianTape.hpp" +#include "LinearAlgebra.hpp" +#include "StateVectorManagedCPU.hpp" +#include "Util.hpp" + +#include +#include + +namespace Pennylane::Algorithms { +/** + * @brief Utility method to apply all operations from given `%OpsData` + * object to `%StateVectorManagedCPU` + * + * @param state Statevector to be updated. + * @param operations Operations to apply. + * @param adj Take the adjoint of the given operations. 
+ */ +template +inline void applyOperations(StateVectorManagedCPU &state, + const OpsData &operations, bool adj = false) { + for (size_t op_idx = 0; op_idx < operations.getOpsName().size(); op_idx++) { + state.applyOperation(operations.getOpsName()[op_idx], + operations.getOpsWires()[op_idx], + operations.getOpsInverses()[op_idx] ^ adj, + operations.getOpsParams()[op_idx]); + } +} +/** + * @brief Utility method to apply the adjoint indexed operation from + * `%OpsData` object to `%StateVectorManagedCPU`. + * + * @param state Statevector to be updated. + * @param operations Operations to apply. + * @param op_idx Adjointed operation index to apply. + */ +template +inline void applyOperationAdj(StateVectorManagedCPU &state, + const OpsData &operations, size_t op_idx) { + state.applyOperation(operations.getOpsName()[op_idx], + operations.getOpsWires()[op_idx], + !operations.getOpsInverses()[op_idx], + operations.getOpsParams()[op_idx]); +} + +/** + * @brief Utility method to apply a given operations from given + * `%ObsDatum` object to `%StateVectorManagedCPU` + * + * @param state Statevector to be updated. + * @param observable Observable to apply. + */ +template +inline void applyObservable(StateVectorManagedCPU &state, + Observable &observable) { + observable.applyInPlace(state); +} + +/** + * @brief OpenMP accelerated application of observables to given + * statevectors + * + * @param states Vector of statevector copies, one per observable. + * @param reference_state Reference statevector + * @param observables Vector of observables to apply to each statevector. + */ +template +inline void applyObservables( + std::vector> &states, + const StateVectorManagedCPU &reference_state, + const std::vector>> &observables) { + std::exception_ptr ex = nullptr; + size_t num_observables = observables.size(); + + if (num_observables > 1) { + /* Globally scoped exception value to be captured within OpenMP block. + * See the following for OpenMP design decisions: + * https://www.openmp.org/wp-content/uploads/openmp-examples-4.5.0.pdf + * */ + // clang-format off + + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(states, reference_state, observables, ex, num_observables) + { + #pragma omp for + #endif + for (size_t h_i = 0; h_i < num_observables; h_i++) { + try { + states[h_i].updateData(reference_state.getDataVector()); + applyObservable(states[h_i], *observables[h_i]); + } catch (...) { + #if defined(_OPENMP) + #pragma omp critical + #endif + ex = std::current_exception(); + #if defined(_OPENMP) + #pragma omp cancel for + #endif + } + } + #if defined(_OPENMP) + if (ex) { + #pragma omp cancel parallel + } + } + #endif + if (ex) { + std::rethrow_exception(ex); + } + // clang-format on + } else { + states[0].updateData(reference_state.getDataVector()); + applyObservable(states[0], *observables[0]); + } +} + +/** + * @brief OpenMP accelerated application of adjoint operations to + * statevectors. + * + * @param states Vector of all statevectors; 1 per observable + * @param operations Operations list. + * @param op_idx Index of given operation within operations list to take + * adjoint of. + */ +template +inline void applyOperationsAdj(std::vector> &states, + const OpsData &operations, size_t op_idx) { + // clang-format off + // Globally scoped exception value to be captured within OpenMP block. 
+ // See the following for OpenMP design decisions: + // https://www.openmp.org/wp-content/uploads/openmp-examples-4.5.0.pdf + std::exception_ptr ex = nullptr; + size_t num_states = states.size(); + #if defined(_OPENMP) + #pragma omp parallel default(none) \ + shared(states, operations, op_idx, ex, num_states) + { + #pragma omp for + #endif + for (size_t st_idx = 0; st_idx < num_states; st_idx++) { + try { + applyOperationAdj(states[st_idx], operations, op_idx); + } catch (...) { + #if defined(_OPENMP) + #pragma omp critical + #endif + ex = std::current_exception(); + #if defined(_OPENMP) + #pragma omp cancel for + #endif + } + } + #if defined(_OPENMP) + if (ex) { + #pragma omp cancel parallel + } + } + #endif + if (ex) { + std::rethrow_exception(ex); + } + // clang-format on +} +} // namespace Pennylane::Algorithms diff --git a/pennylane_lightning/src/algorithms/CMakeLists.txt b/pennylane_lightning/src/algorithms/CMakeLists.txt index 7ec9e0bb0b..7b7bfca8d1 100644 --- a/pennylane_lightning/src/algorithms/CMakeLists.txt +++ b/pennylane_lightning/src/algorithms/CMakeLists.txt @@ -1,6 +1,6 @@ project(lightning_algorithms LANGUAGES CXX) -set(ALGORITHM_FILES AdjointDiff.hpp AdjointDiff.cpp JacobianProd.hpp JacobianProd.cpp CACHE INTERNAL "" FORCE) +set(ALGORITHM_FILES AdjointDiff.cpp Observables.cpp JacobianTape.cpp StateVecAdjDiff.cpp CACHE INTERNAL "" FORCE) add_library(lightning_algorithms STATIC ${ALGORITHM_FILES}) target_link_libraries(lightning_algorithms PRIVATE lightning_compile_options diff --git a/pennylane_lightning/src/algorithms/JacobianProd.hpp b/pennylane_lightning/src/algorithms/JacobianProd.hpp deleted file mode 100644 index 8da0d13a63..0000000000 --- a/pennylane_lightning/src/algorithms/JacobianProd.hpp +++ /dev/null @@ -1,141 +0,0 @@ -// Copyright 2021 Xanadu Quantum Technologies Inc. - -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at - -// http://www.apache.org/licenses/LICENSE-2.0 - -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once - -#include - -#include "AdjointDiff.hpp" -#include "JacobianTape.hpp" -#include "LinearAlgebra.hpp" - -namespace Pennylane::Algorithms { - -/** - * @brief Represent the class to compute the vector-Jacobian products - * following the implementation in Pennylane. - * - * @tparam T Floating-point precision. - */ -template class VectorJacobianProduct { - private: - /** - * @brief Computes the vector-Jacobian product for a given vector of - * gradient outputs and a Jacobian. - * - * @param res Prealloacted vector for row-major ordered `jac` matrix - * representation. - * @param jac Jacobian matrix from `AdjointJacobian`. - * @param len Total allocation size of `jac`. - */ - void getRowMajor(std::vector &res, - const std::vector> &jac, size_t len = 0U) { - if (jac.empty()) { - return; - } - - const size_t r_len = jac.size(); - const size_t c_len = jac.front().size(); - const size_t t_len = len != 0U ? 
len : r_len * c_len; - - if (res.size() != t_len) { - res.resize(t_len); - } - - size_t k = 0; - for (size_t i = 0; i < r_len; i++) { - for (size_t j = 0; j < c_len; j++) { - res[k] = jac[i][j]; - k++; - } - } - } - - public: - VectorJacobianProduct() = default; - - /** - * @brief Computes the vector-Jacobian product for a given vector of - * gradient outputs and a Jacobian. - * - * @param vjp Preallocated vector for vector-jacobian product data results. - * @param jac Row-wise flatten Jacobian matrix of shape `m * n`. - * @param dy_row Gradient-output vector. - * @param m Number of rows of `jac`. - * @param n Number of columns of `jac`. - */ - void computeVJP(std::vector &vjp, const std::vector &jac, - const std::vector &dy_row, size_t m, size_t n) { - if (jac.empty() || dy_row.empty()) { - vjp.clear(); - return; - } - - if (dy_row.size() != m) { - throw std::invalid_argument( - "Invalid size for the gradient-output vector"); - } - - Util::vecMatrixProd(vjp, dy_row, jac, m, n); - } - - /** - * @brief Calculates the VectorJacobianProduct for the statevector - * for the selected set of parametric gates using `AdjointJacobian`. - * - * @param dy Gradient-output vector. - * @param num_params Total number of parameters in the QuantumTape - * @param apply_operations Indicate whether to apply operations to jd.psi - * prior to calculation. - * - * @return std::function(const JacobianData &jd)> - * where `jd` is a JacobianData object representing the QuantumTape - * to differentiate. - * - */ - auto vectorJacobianProduct(const std::vector &dy, size_t num_params, - bool apply_operations = false) - -> std::function(const JacobianData &)> { - if (dy.empty() || - std::all_of(dy.cbegin(), dy.cend(), [](T e) { return e == 0; })) { - // If the dy vector is zero, then the - // corresponding element of the VJP will be zero, - // and we can avoid unnecessary computation. - return - [num_params = - num_params]([[maybe_unused]] const JacobianData &jd) - -> std::vector { return std::vector(num_params, 0); }; - } - - return [=, this](const JacobianData &jd) -> std::vector { - if (!jd.hasTrainableParams()) { - // The jd has no trainable parameters; - // the VJP is simple {}. 
- return {}; - } - - std::vector vjp(num_params); - std::vector jac(jd.getNumObservables() * num_params, 0); - - // Compute Jacobian for the input jd using `adjoint` method - AdjointJacobian v; - v.adjointJacobian(jac, jd, apply_operations); - - // Compute VJP - computeVJP(vjp, jac, dy, jd.getNumObservables(), num_params); - return vjp; - }; - } -}; // class VectorJacobianProduct - -} // namespace Pennylane::Algorithms diff --git a/pennylane_lightning/src/algorithms/JacobianTape.cpp b/pennylane_lightning/src/algorithms/JacobianTape.cpp index 242a9720a8..00ebe5e0f3 100644 --- a/pennylane_lightning/src/algorithms/JacobianTape.cpp +++ b/pennylane_lightning/src/algorithms/JacobianTape.cpp @@ -14,11 +14,8 @@ #include "JacobianTape.hpp" -template class Pennylane::Algorithms::ObsDatum; -template class Pennylane::Algorithms::ObsDatum; - -template class Pennylane::Algorithms::ObsDatum>; -template class Pennylane::Algorithms::ObsDatum>; +template class Pennylane::Algorithms::OpsData; +template class Pennylane::Algorithms::OpsData; template class Pennylane::Algorithms::JacobianData; -template class Pennylane::Algorithms::JacobianData; \ No newline at end of file +template class Pennylane::Algorithms::JacobianData; diff --git a/pennylane_lightning/src/algorithms/JacobianTape.hpp b/pennylane_lightning/src/algorithms/JacobianTape.hpp index 26ff1e2951..1c94696a1b 100644 --- a/pennylane_lightning/src/algorithms/JacobianTape.hpp +++ b/pennylane_lightning/src/algorithms/JacobianTape.hpp @@ -13,88 +13,27 @@ // limitations under the License. #pragma once +#include "Macros.hpp" +#include "Observables.hpp" +#include "StateVectorManagedCPU.hpp" +#include "Util.hpp" + +#if defined(_OPENMP) +#include +#endif + #include #include +#include +#include +#include #include -#include #include namespace Pennylane::Algorithms { - -/** - * @brief Utility struct for observable operations used by AdjointJacobian - * class. - * - */ -template class ObsDatum { - public: - /** - * @brief Variant type of stored parameter data. - */ - using param_var_t = std::variant, - std::vector>>; - - /** - * @brief Copy constructor for an ObsDatum object, representing a given - * observable. - * - * @param obs_name Name of each operation of the observable. Tensor product - * observables have more than one operation. - * @param obs_params Parameters for a given observable operation ({} if - * optional). - * @param obs_wires Wires upon which to apply operation. Each observable - * operation will be a separate nested list. - */ - ObsDatum(std::vector obs_name, - std::vector obs_params, - std::vector> obs_wires) - : obs_name_{std::move(obs_name)}, - obs_params_(std::move(obs_params)), obs_wires_{ - std::move(obs_wires)} {}; - - /** - * @brief Get the number of operations in observable. - * - * @return size_t - */ - [[nodiscard]] auto getSize() const -> size_t { return obs_name_.size(); } - /** - * @brief Get the name of the observable operations. - * - * @return const std::vector& - */ - [[nodiscard]] auto getObsName() const -> const std::vector & { - return obs_name_; - } - /** - * @brief Get the parameters for the observable operations. - * - * @return const std::vector>& - */ - [[nodiscard]] auto getObsParams() const - -> const std::vector & { - return obs_params_; - } - /** - * @brief Get the wires for each observable operation. 
- * - * @return const std::vector>& - */ - [[nodiscard]] auto getObsWires() const - -> const std::vector> & { - return obs_wires_; - } - - private: - const std::vector obs_name_; - const std::vector obs_params_; - const std::vector> obs_wires_; -}; - /** * @brief Utility class for encapsulating operations used by AdjointJacobian * class. - * */ template class OpsData { private: @@ -243,26 +182,39 @@ template class OpsData { [[nodiscard]] auto getNumNonParOps() const -> size_t { return num_nonpar_ops_; } + + /** + * @brief Get total number of parameters. + */ + [[nodiscard]] auto getTotalNumParams() const -> size_t { + return std::accumulate( + ops_params_.begin(), ops_params_.end(), size_t{0U}, + [](size_t acc, auto ¶ms) { return acc + params.size(); }); + } }; /** * @brief Represent the serialized data of a QuantumTape to differentiate - * - * @param num_parameters Number of parameters in the Tape. - * @param num_elements Length of the statevector data. - * @param psi Pointer to the statevector data. - * @param observables Observables for which to calculate Jacobian. - * @param operations Operations used to create given state. - * @param trainableParams List of parameters participating in Jacobian - * calculation. */ template class JacobianData { private: - size_t num_parameters; - size_t num_elements; - const std::complex *psi; - const std::vector> observables; + size_t num_parameters; /**< Number of parameters in the tape */ + size_t num_elements; /**< Length of the statevector data */ + const std::complex *psi; /**< Pointer to the statevector data */ + + /** + * @var observables + * Observables for which to calculate Jacobian. + */ + const std::vector>> observables; + + /** + * @var operations + * operations Operations used to create given state. + */ const OpsData operations; + + /* @var trainableParams */ const std::vector trainableParams; public: @@ -274,15 +226,26 @@ template class JacobianData { * @param ps Pointer to the statevector data. * @param obs Observables for which to calculate Jacobian. * @param ops Operations used to create given state. - * @param trainP List of parameters participating in Jacobian - * calculation. This must be sorted. + * @param trainP Sorted list of parameters participating in Jacobian + * computation. + * + * @rst + * Each value :math:`i` in trainable params means that + * we want to take a derivative respect to the :math:`i`-th operation. + * + * Further note that ``ops`` does not contain state preparation operations + * (e.g. QubitStateVector) or Hamiltonian coefficients. + * @endrst */ JacobianData(size_t num_params, size_t num_elem, std::complex *ps, - std::vector> obs, OpsData ops, - std::vector trainP) + std::vector>> obs, + OpsData ops, std::vector trainP) : num_parameters(num_params), num_elements(num_elem), psi(ps), observables(std::move(obs)), operations(std::move(ops)), - trainableParams(std::move(trainP)) {} + trainableParams(std::move(trainP)) { + /* When the Hamiltonian has parameters, trainable parameters include + * these. We explicitly ignore them. */ + } /** * @brief Get Number of parameters in the Tape. @@ -312,10 +275,10 @@ template class JacobianData { /** * @brief Get observables for which to calculate Jacobian. 
 *
-     * @return std::vector<ObsDatum<T>>&
+     * @return List of observables
      */
     [[nodiscard]] auto getObservables() const
-        -> const std::vector<ObsDatum<T>> & {
+        -> const std::vector<std::shared_ptr<Observable<T>>> & {
         return observables;
     }
diff --git a/pennylane_lightning/src/algorithms/Observables.cpp b/pennylane_lightning/src/algorithms/Observables.cpp
new file mode 100644
index 0000000000..d784910db0
--- /dev/null
+++ b/pennylane_lightning/src/algorithms/Observables.cpp
@@ -0,0 +1,27 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "Observables.hpp"
+
+template class Pennylane::Algorithms::NamedObs<float>;
+template class Pennylane::Algorithms::NamedObs<double>;
+
+template class Pennylane::Algorithms::HermitianObs<float>;
+template class Pennylane::Algorithms::HermitianObs<double>;
+
+template class Pennylane::Algorithms::TensorProdObs<float>;
+template class Pennylane::Algorithms::TensorProdObs<double>;
+
+template class Pennylane::Algorithms::Hamiltonian<float>;
+template class Pennylane::Algorithms::Hamiltonian<double>;
diff --git a/pennylane_lightning/src/algorithms/Observables.hpp b/pennylane_lightning/src/algorithms/Observables.hpp
new file mode 100644
index 0000000000..bf2e765272
--- /dev/null
+++ b/pennylane_lightning/src/algorithms/Observables.hpp
@@ -0,0 +1,445 @@
+// Copyright 2022 Xanadu Quantum Technologies Inc.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "Error.hpp"
+#include "LinearAlgebra.hpp"
+#include "StateVectorManagedCPU.hpp"
+#include "Util.hpp"
+
+#include <memory>
+#include <unordered_set>
+
+namespace Pennylane::Algorithms {
+
+/**
+ * @brief A base class for all observable classes.
+ *
+ * Note that all subclasses must be immutable (they do not provide any
+ * setters).
+ *
+ * @tparam T Floating point type
+ */
+template <class T> class Observable {
+  private:
+    /**
+     * @brief Polymorphic function comparing this to another Observable
+     * object.
+     *
+     * @param other Another instance of a subclass of Observable to compare.
+     */
+    [[nodiscard]] virtual bool isEqual(const Observable<T> &other) const = 0;
+
+  protected:
+    Observable() = default;
+    Observable(const Observable &) = default;
+    Observable(Observable &&) noexcept = default;
+    Observable &operator=(const Observable &) = default;
+    Observable &operator=(Observable &&) noexcept = default;
+
+  public:
+    virtual ~Observable() = default;
+
+    /**
+     * @brief Apply the observable to the given statevector in place.
+     */
+    virtual void applyInPlace(StateVectorManagedCPU<T> &sv) const = 0;
+
+    /**
+     * @brief Get the name of the observable
+     */
+    [[nodiscard]] virtual auto getObsName() const -> std::string = 0;
+
+    /**
+     * @brief Get the wires the observable applies to.
+     */
+    [[nodiscard]] virtual auto getWires() const -> std::vector<size_t> = 0;
+
+    /**
+     * @brief Test whether this object is equal to another object
+     */
+    [[nodiscard]] bool operator==(const Observable<T> &other) const {
+        return typeid(*this) == typeid(other) && isEqual(other);
+    }
+
+    /**
+     * @brief Test whether this object is different from another object.
+     */
+    [[nodiscard]] bool operator!=(const Observable<T> &other) const {
+        return !(*this == other);
+    }
+};
+
+/**
+ * @brief Class models named observables (PauliX, PauliY, PauliZ, etc.)
+ *
+ * @tparam T Floating point type
+ */
+template <class T> class NamedObs final : public Observable<T> {
+  private:
+    std::string obs_name_;
+    std::vector<size_t> wires_;
+    std::vector<T> params_;
+
+    [[nodiscard]] bool isEqual(const Observable<T> &other) const override {
+        const auto &other_cast = static_cast<const NamedObs<T> &>(other);
+
+        return (obs_name_ == other_cast.obs_name_) &&
+               (wires_ == other_cast.wires_) && (params_ == other_cast.params_);
+    }
+
+  public:
+    /**
+     * @brief Construct a NamedObs object, representing a given observable.
+     *
+     * @param obs_name Name of the observable.
+     * @param wires Wires the observable applies to.
+     * @param params Parameters of the observable, if any.
+     */
+    NamedObs(std::string obs_name, std::vector<size_t> wires,
+             std::vector<T> params = {})
+        : obs_name_{std::move(obs_name)}, wires_{std::move(wires)},
+          params_{std::move(params)} {
+        using Gates::Constant::gate_names;
+        using Gates::Constant::gate_num_params;
+        using Gates::Constant::gate_wires;
+
+        const auto gate_op = Util::lookup(Util::reverse_pairs(gate_names),
+                                          std::string_view{obs_name_});
+        PL_ASSERT(Util::lookup(gate_wires, gate_op) == wires_.size());
+        PL_ASSERT(Util::lookup(gate_num_params, gate_op) == params_.size());
+    }
+
+    [[nodiscard]] auto getObsName() const -> std::string override {
+        using Util::operator<<;
+        std::ostringstream obs_stream;
+        obs_stream << obs_name_ << wires_;
+        return obs_stream.str();
+    }
+
+    [[nodiscard]] auto getWires() const -> std::vector<size_t> override {
+        return wires_;
+    }
+
+    void applyInPlace(StateVectorManagedCPU<T> &sv) const override {
+        sv.applyOperation(obs_name_, wires_, false, params_);
+    }
+};
+
+/**
+ * @brief Class models arbitrary Hermitian observables.
+ *
+ * @tparam T Floating point type
+ */
+template <class T> class HermitianObs final : public Observable<T> {
+  public:
+    using MatrixT = std::vector<std::complex<T>>;
+
+  private:
+    MatrixT matrix_;
+    std::vector<size_t> wires_;
+
+    [[nodiscard]] bool isEqual(const Observable<T> &other) const override {
+        const auto &other_cast = static_cast<const HermitianObs<T> &>(other);
+
+        return (matrix_ == other_cast.matrix_) && (wires_ == other_cast.wires_);
+    }
+
+  public:
+    /**
+     * @brief Create a Hermitian observable
+     *
+     * @param matrix Matrix in row major format.
+     * @param wires Wires the observable applies to.
+ */ + template + HermitianObs(T1 &&matrix, std::vector wires) + : matrix_{std::forward(matrix)}, wires_{std::move(wires)} { + PL_ASSERT(matrix_.size() == + Util::exp2(wires_.size()) * Util::exp2(wires_.size())); + } + + [[nodiscard]] auto getMatrix() const -> const MatrixT & { return matrix_; } + + [[nodiscard]] auto getWires() const -> std::vector override { + return wires_; + } + + [[nodiscard]] auto getObsName() const -> std::string override { + return "Hermitian"; + } + + void applyInPlace(StateVectorManagedCPU &sv) const override { + sv.applyMatrix(matrix_, wires_); + } +}; + +/** + * @brief Tensor product observable class + */ +template class TensorProdObs final : public Observable { + private: + std::vector>> obs_; + std::vector all_wires_; + + [[nodiscard]] bool isEqual(const Observable &other) const override { + const auto &other_cast = static_cast &>(other); + + if (obs_.size() != other_cast.obs_.size()) { + return false; + } + + for (size_t i = 0; i < obs_.size(); i++) { + if (*obs_[i] != *other_cast.obs_[i]) { + return false; + } + } + return true; + } + + public: + /** + * @brief Create a tensor product of observables + * + * @param arg Arguments perfect forwarded to vector of observables. + */ + template + explicit TensorProdObs(Ts &&...arg) : obs_{std::forward(arg)...} { + std::unordered_set wires; + + for (const auto &ob : obs_) { + const auto ob_wires = ob->getWires(); + for (const auto wire : ob_wires) { + if (wires.contains(wire)) { + PL_ABORT("All wires in observables must be disjoint."); + } + wires.insert(wire); + } + } + all_wires_ = std::vector(wires.begin(), wires.end()); + std::sort(all_wires_.begin(), all_wires_.end()); + } + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. + * + * @param obs List of observables + */ + static auto + create(std::initializer_list>> obs) + -> std::shared_ptr> { + return std::shared_ptr>{ + new TensorProdObs(std::move(obs))}; + } + + static auto create(std::vector>> obs) + -> std::shared_ptr> { + return std::shared_ptr>{ + new TensorProdObs(std::move(obs))}; + } + + /** + * @brief Get the number of operations in observable. + * + * @return size_t + */ + [[nodiscard]] auto getSize() const -> size_t { return obs_.size(); } + + /** + * @brief Get the wires for each observable operation. 
+ * + * @return const std::vector>& + */ + [[nodiscard]] auto getWires() const -> std::vector override { + return all_wires_; + } + + void applyInPlace(StateVectorManagedCPU &sv) const override { + for (const auto &ob : obs_) { + ob->applyInPlace(sv); + } + } + + [[nodiscard]] auto getObsName() const -> std::string override { + using Util::operator<<; + std::ostringstream obs_stream; + const auto obs_size = obs_.size(); + for (size_t idx = 0; idx < obs_size; idx++) { + obs_stream << obs_[idx]->getObsName(); + if (idx != obs_size - 1) { + obs_stream << " @ "; + } + } + return obs_stream.str(); + } +}; +/// @cond DEV +namespace detail { +// Default implementation +template struct HamiltonianApplyInPlace { + static void run(const std::vector &coeffs, + const std::vector>> &terms, + StateVectorManagedCPU &sv) { + auto allocator = sv.allocator(); + std::vector, decltype(allocator)> res( + sv.getLength(), std::complex{0.0, 0.0}, allocator); + for (size_t term_idx = 0; term_idx < coeffs.size(); term_idx++) { + StateVectorManagedCPU tmp(sv); + terms[term_idx]->applyInPlace(tmp); + Util::scaleAndAdd(tmp.getLength(), + std::complex{coeffs[term_idx], 0.0}, + tmp.getData(), res.data()); + } + sv.updateData(res); + } +}; +#if defined(_OPENMP) +template struct HamiltonianApplyInPlace { + static void run(const std::vector &coeffs, + const std::vector>> &terms, + StateVectorManagedCPU &sv) { + const size_t length = sv.getLength(); + const auto allocator = sv.allocator(); + + std::vector, decltype(allocator)> sum( + length, std::complex{}, allocator); + +#pragma omp parallel default(none) firstprivate(length, allocator) \ + shared(coeffs, terms, sv, sum) + { + StateVectorManagedCPU tmp(sv.getNumQubits()); + + std::vector, decltype(allocator)> local_sv( + length, std::complex{}, allocator); + +#pragma omp for + for (size_t term_idx = 0; term_idx < terms.size(); term_idx++) { + tmp.updateData(sv.getDataVector()); + terms[term_idx]->applyInPlace(tmp); + Util::scaleAndAdd(length, + std::complex{coeffs[term_idx], 0.0}, + tmp.getData(), local_sv.data()); + } + +#pragma omp critical + { + Util::scaleAndAdd(length, std::complex{1.0, 0.0}, + local_sv.data(), sum.data()); + } + } + + sv.updateData(sum); + } +}; +#endif + +} // namespace detail +/// @endcond + +/** + * @brief General Hamiltonian as a sum of observables. + * + * TODO: Check whether caching a sparse matrix representation can give + * a speedup + */ +template class Hamiltonian final : public Observable { + public: + using PrecisionT = T; + + private: + std::vector coeffs_; + std::vector>> obs_; + + [[nodiscard]] bool isEqual(const Observable &other) const override { + const auto &other_cast = static_cast &>(other); + + if (coeffs_ != other_cast.coeffs_) { + return false; + } + + for (size_t i = 0; i < obs_.size(); i++) { + if (*obs_[i] != *other_cast.obs_[i]) { + return false; + } + } + return true; + } + + public: + /** + * @brief Create a Hamiltonian from coefficients and observables + * + * @param arg1 Arguments to construct coefficients + * @param arg2 Arguments to construct observables + */ + template + Hamiltonian(T1 &&arg1, T2 &&arg2) + : coeffs_{std::forward(arg1)}, obs_{std::forward(arg2)} { + PL_ASSERT(coeffs_.size() == obs_.size()); + } + + /** + * @brief Convenient wrapper for the constructor as the constructor does not + * convert the std::shared_ptr with a derived class correctly. + * + * This function is useful as std::make_shared does not handle + * brace-enclosed initializer list correctly. 
+ * + * @param arg1 Argument to construct coefficients + * @param arg2 Argument to construct terms + */ + static auto + create(std::initializer_list arg1, + std::initializer_list>> arg2) + -> std::shared_ptr> { + return std::shared_ptr>( + new Hamiltonian{std::move(arg1), std::move(arg2)}); + } + + void applyInPlace(StateVectorManagedCPU &sv) const override { + detail::HamiltonianApplyInPlace::run( + coeffs_, obs_, sv); + } + + [[nodiscard]] auto getWires() const -> std::vector override { + std::unordered_set wires; + + for (const auto &ob : obs_) { + const auto ob_wires = ob->getWires(); + wires.insert(ob_wires.begin(), ob_wires.end()); + } + auto all_wires = std::vector(wires.begin(), wires.end()); + std::sort(all_wires.begin(), all_wires.end()); + return all_wires; + } + + [[nodiscard]] auto getObsName() const -> std::string override { + using Util::operator<<; + std::ostringstream ss; + ss << "Hamiltonian: { 'coeffs' : " << coeffs_ << ", 'observables' : ["; + const auto term_size = coeffs_.size(); + for (size_t t = 0; t < term_size; t++) { + ss << obs_[t]->getObsName(); + if (t != term_size - 1) { + ss << ", "; + } + } + ss << "]}"; + return ss.str(); + } +}; + +} // namespace Pennylane::Algorithms diff --git a/pennylane_lightning/src/algorithms/JacobianProd.cpp b/pennylane_lightning/src/algorithms/StateVecAdjDiff.cpp similarity index 55% rename from pennylane_lightning/src/algorithms/JacobianProd.cpp rename to pennylane_lightning/src/algorithms/StateVecAdjDiff.cpp index 9768b59284..10fcef45cc 100644 --- a/pennylane_lightning/src/algorithms/JacobianProd.cpp +++ b/pennylane_lightning/src/algorithms/StateVecAdjDiff.cpp @@ -11,9 +11,16 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +/** + * @file + */ +#include "StateVecAdjDiff.hpp" -#include "JacobianProd.hpp" +// explicit template instantiations +template void Pennylane::Algorithms::statevectorVJP( + std::span> jac, const JacobianData &jd, + std::span> dy, bool apply_operations); -// explicit instantiation -template class Pennylane::Algorithms::VectorJacobianProduct; -template class Pennylane::Algorithms::VectorJacobianProduct; \ No newline at end of file +template void Pennylane::Algorithms::statevectorVJP( + std::span> jac, const JacobianData &jd, + std::span> dy, bool apply_operations); diff --git a/pennylane_lightning/src/algorithms/StateVecAdjDiff.hpp b/pennylane_lightning/src/algorithms/StateVecAdjDiff.hpp new file mode 100644 index 0000000000..4b7b241b90 --- /dev/null +++ b/pennylane_lightning/src/algorithms/StateVecAdjDiff.hpp @@ -0,0 +1,132 @@ +// Copyright 2021 Xanadu Quantum Technologies Inc. + +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/** + * @file + * Define vjp algorithm for a statevector + */ +#pragma once +#include "AlgUtil.hpp" + +#include +#include + +namespace Pennylane::Algorithms { +/** + * @brief Compute vector Jacobian product for a statevector Jacobian. 
+ *
+ * @rst
+ * Product of the statevector Jacobian :math:`J_{ij} = \partial_{\theta_j}
+ * \psi_{\pmb{\theta}}(i)` and a vector, i.e. this function returns
+ * :math:`w = v^\dagger J`. This is
+ * equivalent to
+ *
+ * .. math::
+ *
+ *     w_j = \langle v | \partial_{\theta_j} \psi_{\pmb{\theta}} \rangle
+ *
+ * where :math:`\pmb{\theta}=(\theta_1, \theta_2, \cdots)` is a list of all
+ * parameters and :math:`v` is given by ``dy``.
+ *
+ * Note that :math:`J` is a :math:`2^n \times m` matrix where
+ * :math:`n` is the number of qubits and :math:`m` is the number of
+ * trainable parameters in the tape.
+ * Thus the result vector is of length :math:`m`.
+ * @endrst
+ *
+ * @param jac Preallocated vector for the results (the VJP).
+ * @param jd Jacobian data
+ * @param dy A cotangent vector of size 2^n
+ * @param apply_operations Assume the given state is an input state and apply
+ * operations if true
+ */
+template <class PrecisionT>
+void statevectorVJP(std::span<std::complex<PrecisionT>> jac,
+                    const JacobianData<PrecisionT> &jd,
+                    std::span<const std::complex<PrecisionT>> dy,
+                    bool apply_operations = false) {
+    using ComplexPrecisionT = std::complex<PrecisionT>;
+
+    assert(dy.size() == jd.getSizeStateVec());
+
+    if (!jd.hasTrainableParams()) {
+        return;
+    }
+
+    const OpsData<PrecisionT> &ops = jd.getOperations();
+    const std::vector<std::string> &ops_name = ops.getOpsName();
+
+    // We can assume the trainable params are sorted (from Python)
+    const size_t num_param_ops = ops.getNumParOps();
+    const auto &trainable_params = jd.getTrainableParams();
+
+    PL_ABORT_IF_NOT(jac.size() == trainable_params.size(),
+                    "The size of preallocated jacobian must be same as "
+                    "the number of trainable parameters.");
+
+    // Create $U_{1:p}\vert \lambda \rangle$
+    StateVectorManagedCPU<PrecisionT> lambda(jd.getPtrStateVec(),
+                                             jd.getSizeStateVec());
+
+    // Apply given operations to statevector if requested
+    if (apply_operations) {
+        applyOperations(lambda, ops);
+    }
+    StateVectorManagedCPU<PrecisionT> mu(dy.data(), dy.size());
+    StateVectorManagedCPU<PrecisionT> mu_d(
+        Util::log2PerfectPower(jd.getSizeStateVec()));
+
+    const auto tp_rend = trainable_params.rend();
+    auto tp_it = trainable_params.rbegin();
+    size_t current_param_idx =
+        num_param_ops - 1; // index of the last parametric operation
+    size_t trainable_param_idx = trainable_params.size() - 1;
+
+    for (int op_idx = static_cast<int>(ops_name.size() - 1); op_idx >= 0;
+         op_idx--) {
+        PL_ABORT_IF(ops.getOpsParams()[op_idx].size() > 1,
+                    "The operation is not supported using the adjoint "
+                    "differentiation method");
+        if ((ops_name[op_idx] == "QubitStateVector") ||
+            (ops_name[op_idx] == "BasisState")) {
+            continue; // ignore them
+        }
+
+        if (tp_it == tp_rend) {
+            break; // All done
+        }
+
+        if (ops.hasParams(op_idx)) {
+            if (current_param_idx == *tp_it) {
+                // if current parameter is a trainable parameter
+                mu_d.updateData(mu.getDataVector());
+                const auto scalingFactor =
+                    mu_d.applyGenerator(ops_name[op_idx],
+                                        ops.getOpsWires()[op_idx],
+                                        !ops.getOpsInverses()[op_idx]) *
+                    (ops.getOpsInverses()[op_idx] ?
-1 : 1); + + jac[trainable_param_idx] = + ComplexPrecisionT{0.0, scalingFactor} * + Util::innerProdC(mu_d.getDataVector(), + lambda.getDataVector()); + --trainable_param_idx; + ++tp_it; + } + --current_param_idx; + } + applyOperationAdj(lambda, ops, static_cast(op_idx)); + applyOperationAdj(mu, ops, static_cast(op_idx)); + } +}; +} // namespace Pennylane::Algorithms diff --git a/pennylane_lightning/src/benchmarks/Bench_LinearAlgebra.cpp b/pennylane_lightning/src/benchmarks/Bench_LinearAlgebra.cpp index 9839f37cb0..0e2811807e 100644 --- a/pennylane_lightning/src/benchmarks/Bench_LinearAlgebra.cpp +++ b/pennylane_lightning/src/benchmarks/Bench_LinearAlgebra.cpp @@ -18,6 +18,8 @@ #include +using namespace Pennylane; + /** * @brief Benchmark generating a vector of random complex numbers. * @@ -73,9 +75,9 @@ template static void std_innerProd_cmplx(benchmark::State &state) { for (auto _ : state) { std::complex res = std::inner_product( vec1.data(), vec1.data() + sz, vec2.data(), std::complex(), - Pennylane::Util::ConstSum, + Util::ConstSum, static_cast (*)(std::complex, std::complex)>( - &Pennylane::Util::ConstMult)); + &Util::ConstMult)); benchmark::DoNotOptimize(res); } } @@ -88,7 +90,7 @@ BENCHMARK(std_innerProd_cmplx) ->Range(1l << 5, 1l << 10); /** - * @brief Benchmark Pennylane::Util::omp_innerProd for two vectors of complex + * @brief Benchmark Util::omp_innerProd for two vectors of complex * numbers. * * @tparam T Floating point precision type. @@ -110,7 +112,7 @@ template static void omp_innerProd_cmplx(benchmark::State &state) { for (auto _ : state) { std::complex res(.0, .0); - Pennylane::Util::omp_innerProd(vec1.data(), vec2.data(), res, sz); + Util::omp_innerProd(vec1.data(), vec2.data(), res, sz); benchmark::DoNotOptimize(res); } } @@ -205,7 +207,7 @@ BENCHMARK(naive_transpose_cmplx) ->Range(1l << 5, 1l << 10); /** - * @brief Benchmark Pennylane::Util::CFTranspose for a randomly generated matrix + * @brief Benchmark Util::CFTranspose for a randomly generated matrix * of complex numbers. * * @tparam T Floating point precision type. @@ -225,8 +227,8 @@ static void cf_transpose_cmplx(benchmark::State &state) { for (auto _ : state) { std::vector> mat2(sz * sz); - Pennylane::Util::CFTranspose(mat1.data(), mat2.data(), sz, - sz, 0, sz, 0, sz); + Util::CFTranspose(mat1.data(), mat2.data(), sz, sz, 0, sz, + 0, sz); benchmark::DoNotOptimize(mat2[sz * sz - 1]); } } @@ -275,8 +277,8 @@ static void omp_matrixVecProd_cmplx(benchmark::State &state) { for (auto _ : state) { std::vector> vec2(sz); - Pennylane::Util::omp_matrixVecProd(mat.data(), vec1.data(), vec2.data(), - sz, sz, Trans::NoTranspose); + Util::omp_matrixVecProd(mat.data(), vec1.data(), vec2.data(), sz, sz, + Trans::NoTranspose); benchmark::DoNotOptimize(vec2[sz - 1]); } } @@ -343,7 +345,7 @@ BENCHMARK(blas_matrixVecProd_cmplx) //***********************************************************************// /** - * @brief Benchmark Pennylane::Util::omp_matrixMatProd for two randomly + * @brief Benchmark Util::omp_matrixMatProd for two randomly * generated matrices of complex numbers. * * @tparam T Floating point precision type. 
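The scale-and-add benchmarks added to this file below all measure the same axpy-style kernel. For reference, this is the operation under test, written out as a minimal standalone sketch (the function name is illustrative; the `(size, scale, x, y)` argument order matches the `Util::omp_scaleAndAdd` and `Util::blas_scaleAndAdd` calls in this diff):

```cpp
#include <complex>
#include <cstddef>

// Reference semantics of the scaleAndAdd kernels benchmarked below:
// y[i] += a * x[i] for every element (an axpy-style update), matching the
// plain loop in std_scaleAndAdd_cmplx.
template <class T>
void scaleAndAddReference(std::size_t size, std::complex<T> a,
                          const std::complex<T> *x, std::complex<T> *y) {
    for (std::size_t i = 0; i < size; i++) {
        y[i] += a * x[i];
    }
}
```

The OpenMP and BLAS variants benchmarked below are drop-in replacements for this loop; only the execution strategy differs.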
@@ -364,14 +366,13 @@ static void omp_matrixMatProd_cmplx(benchmark::State &state) { for (size_t i = 0; i < sz * sz; i++) m_right.push_back({distr(eng), distr(eng)}); - const auto m_right_tr = Pennylane::Util::Transpose(m_right, sz, sz); + const auto m_right_tr = Util::Transpose(m_right, sz, sz); for (auto _ : state) { std::vector> m_out(sz * sz); - Pennylane::Util::omp_matrixMatProd(m_left.data(), m_right_tr.data(), - m_out.data(), sz, sz, sz, - Trans::Transpose); + Util::omp_matrixMatProd(m_left.data(), m_right_tr.data(), m_out.data(), + sz, sz, sz, Trans::Transpose); benchmark::DoNotOptimize(m_out[sz * sz - 1]); } } @@ -433,3 +434,117 @@ BENCHMARK(blas_matrixMatProd_cmplx) ->RangeMultiplier(1l << 2) ->Range(1l << 4, 1l << 8); #endif + +//***********************************************************************// +// Scale and add +//***********************************************************************// + +/** + * @brief Benchmark scaleAndAdd function implemented in the standard way + * + * @tparam T Floating point precision type. + */ +template static void std_scaleAndAdd_cmplx(benchmark::State &state) { + std::random_device rd; + std::mt19937_64 eng(rd()); + std::uniform_real_distribution distr; + const auto sz = static_cast(state.range(0)); + + std::vector> vec1; + std::vector> vec2; + std::complex scale{std::cos(0.4123), std::sin(0.4123)}; + + for (size_t i = 0; i < sz; i++) { + vec1.push_back({distr(eng), distr(eng)}); + } + for (size_t i = 0; i < sz; i++) { + vec2.push_back({distr(eng), distr(eng)}); + } + + for (auto _ : state) { + for (size_t i = 0; i < sz; i++) { + vec2[i] += scale * vec1[i]; + } + benchmark::DoNotOptimize(vec2[sz - 1]); + } +} +BENCHMARK(std_scaleAndAdd_cmplx) + ->RangeMultiplier(1U << 2U) + ->Range(1U << 4U, 1U << 20U); + +BENCHMARK(std_scaleAndAdd_cmplx) + ->RangeMultiplier(1U << 2) + ->Range(1U << 4U, 1U << 20U); + +/** + * @brief Benchmark PennyLane::Util::omp_scaleAndAdd for a randomly generated + * matrix and vector of complex numbers. + * + * @tparam T Floating point precision type. + */ +template static void omp_scaleAndAdd_cmplx(benchmark::State &state) { + std::random_device rd; + std::mt19937_64 eng(rd()); + std::uniform_real_distribution distr; + const auto sz = static_cast(state.range(0)); + + std::vector> vec1; + std::vector> vec2; + std::complex scale{std::cos(0.4123), std::sin(0.4123)}; + + for (size_t i = 0; i < sz; i++) { + vec1.push_back({distr(eng), distr(eng)}); + } + for (size_t i = 0; i < sz; i++) { + vec2.push_back({distr(eng), distr(eng)}); + } + + for (auto _ : state) { + Util::omp_scaleAndAdd(sz, scale, vec1.data(), vec2.data()); + benchmark::DoNotOptimize(vec2[sz - 1]); + } +} +BENCHMARK(omp_scaleAndAdd_cmplx) + ->RangeMultiplier(1U << 2U) + ->Range(1U << 4U, 1U << 20U); + +BENCHMARK(omp_scaleAndAdd_cmplx) + ->RangeMultiplier(1U << 2U) + ->Range(1U << 4U, 1U << 20U); + +#if __has_include() && defined _ENABLE_BLAS +/** + * @brief Benchmark blas_scaleAndAdd + * + * @tparam T Floating point precision type. 
+ */ +template static void blas_scaleAndAdd_cmplx(benchmark::State &state) { + std::random_device rd; + std::mt19937_64 eng(rd()); + std::uniform_real_distribution distr; + const auto sz = static_cast(state.range(0)); + + std::vector> vec1; + std::vector> vec2; + std::complex scale{std::cos(0.4123), std::sin(0.4123)}; + + for (size_t i = 0; i < sz; i++) { + vec1.push_back({distr(eng), distr(eng)}); + } + for (size_t i = 0; i < sz; i++) { + vec2.push_back({distr(eng), distr(eng)}); + } + + for (auto _ : state) { + Util::blas_scaleAndAdd(sz, scale, vec1.data(), vec2.data()); + benchmark::DoNotOptimize(vec2[sz - 1]); + } +} +BENCHMARK(blas_scaleAndAdd_cmplx) + ->RangeMultiplier(1U << 2U) + ->Range(1U << 4U, 1U << 20U); + +BENCHMARK(blas_scaleAndAdd_cmplx) + ->RangeMultiplier(1U << 2) + ->Range(1U << 4U, 1U << 20U); +#endif diff --git a/pennylane_lightning/src/bindings/Bindings.cpp b/pennylane_lightning/src/bindings/Bindings.cpp index 64a958b7b8..5a7af05345 100644 --- a/pennylane_lightning/src/bindings/Bindings.cpp +++ b/pennylane_lightning/src/bindings/Bindings.cpp @@ -19,12 +19,15 @@ #include "GateUtil.hpp" #include "SelectKernel.hpp" +#include "StateVecAdjDiff.hpp" #include "StateVectorManagedCPU.hpp" #include "pybind11/pybind11.h" /// @cond DEV namespace { +using namespace Pennylane; +using namespace Pennylane::Util; using namespace Pennylane::Algorithms; using namespace Pennylane::Gates; @@ -53,8 +56,6 @@ void lightning_class_bindings(py::module_ &m) { using np_arr_c = py::array_t, py::array::c_style | py::array::forcecast>; - using np_arr_r = - py::array_t; using sparse_index_type = long int; // Kokkos Kernels needs signed int as Ordinal type. using np_arr_sparse_ind = @@ -76,246 +77,6 @@ void lightning_class_bindings(py::module_ &m) { pyclass.def("kernel_map", &svKernelMap, "Get internal kernels for operations"); - //***********************************************************************// - // Observable - //***********************************************************************// - - class_name = "ObsStructC" + bitsize; - using obs_data_var = std::variant; - py::class_>(m, class_name.c_str(), py::module_local()) - .def(py::init([](const std::vector &names, - const std::vector ¶ms, - const std::vector> &wires) { - std::vector::param_var_t> conv_params( - params.size()); - for (size_t p_idx = 0; p_idx < params.size(); p_idx++) { - std::visit( - [&](const auto ¶m) { - using p_t = std::decay_t; - if constexpr (std::is_same_v) { - auto buffer = param.request(); - auto ptr = - static_cast *>(buffer.ptr); - if (buffer.size) { - conv_params[p_idx] = - std::vector>{ - ptr, ptr + buffer.size}; - } - } else if constexpr (std::is_same_v) { - auto buffer = param.request(); - - auto *ptr = static_cast(buffer.ptr); - if (buffer.size) { - conv_params[p_idx] = - std::vector{ptr, ptr + buffer.size}; - } - } else { - PL_ABORT( - "Parameter datatype not current supported"); - } - }, - params[p_idx]); - } - return ObsDatum(names, conv_params, wires); - })) - .def("__repr__", - [](const ObsDatum &obs) { - using namespace Pennylane::Util; - std::ostringstream obs_stream; - std::string obs_name = obs.getObsName()[0]; - for (size_t o = 1; o < obs.getObsName().size(); o++) { - if (o < obs.getObsName().size()) { - obs_name += " @ "; - } - obs_name += obs.getObsName()[o]; - } - obs_stream << "'wires' : " << obs.getObsWires(); - return "Observable: { 'name' : " + obs_name + ", " + - obs_stream.str() + " }"; - }) - .def("get_name", - [](const ObsDatum &obs) { return obs.getObsName(); }) - .def("get_wires", - 
[](const ObsDatum &obs) { return obs.getObsWires(); }) - .def("get_params", [](const ObsDatum &obs) { - py::list params; - for (size_t i = 0; i < obs.getObsParams().size(); i++) { - std::visit( - [&](const auto ¶m) { - using p_t = std::decay_t; - if constexpr (std::is_same_v< - p_t, - std::vector>>) { - params.append(py::array_t>( - py::cast(param))); - } else if constexpr (std::is_same_v< - p_t, std::vector>) { - params.append(py::array_t(py::cast(param))); - } else if constexpr (std::is_same_v) { - params.append(py::list{}); - } else { - throw("Unsupported data type"); - } - }, - obs.getObsParams()[i]); - } - return params; - }); - - //***********************************************************************// - // Operations - //***********************************************************************// - - class_name = "OpsStructC" + bitsize; - py::class_>(m, class_name.c_str(), py::module_local()) - .def(py::init< - const std::vector &, - const std::vector> &, - const std::vector> &, - const std::vector &, - const std::vector>> &>()) - .def("__repr__", [](const OpsData &ops) { - using namespace Pennylane::Util; - std::ostringstream ops_stream; - for (size_t op = 0; op < ops.getSize(); op++) { - ops_stream << "{'name': " << ops.getOpsName()[op]; - ops_stream << ", 'params': " << ops.getOpsParams()[op]; - ops_stream << ", 'inv': " << ops.getOpsInverses()[op]; - ops_stream << "}"; - if (op < ops.getSize() - 1) { - ops_stream << ","; - } - } - return "Operations: [" + ops_stream.str() + "]"; - }); - - //***********************************************************************// - // Adjoint Jacobian - //***********************************************************************// - - class_name = "AdjointJacobianC" + bitsize; - py::class_>(m, class_name.c_str(), - py::module_local()) - .def(py::init<>()) - .def("create_ops_list", - [](AdjointJacobian &adj, - const std::vector &ops_name, - const std::vector &ops_params, - const std::vector> &ops_wires, - const std::vector &ops_inverses, - const std::vector &ops_matrices) { - std::vector> conv_params( - ops_params.size()); - std::vector>> - conv_matrices(ops_matrices.size()); - static_cast(adj); - for (size_t op = 0; op < ops_name.size(); op++) { - const auto p_buffer = ops_params[op].request(); - const auto m_buffer = ops_matrices[op].request(); - if (p_buffer.size) { - const auto *const p_ptr = - static_cast(p_buffer.ptr); - conv_params[op] = - std::vector{p_ptr, p_ptr + p_buffer.size}; - } - if (m_buffer.size) { - const auto m_ptr = - static_cast *>( - m_buffer.ptr); - conv_matrices[op] = std::vector>{ - m_ptr, m_ptr + m_buffer.size}; - } - } - return OpsData{ops_name, conv_params, ops_wires, - ops_inverses, conv_matrices}; - }) - .def("adjoint_jacobian", &AdjointJacobian::adjointJacobian) - .def("adjoint_jacobian", - [](AdjointJacobian &adj, - const StateVectorRawCPU &sv, - const std::vector> &observables, - const OpsData &operations, - const std::vector &trainableParams, size_t num_params) { - std::vector jac(observables.size() * num_params, - 0); - - const JacobianData jd{ - num_params, sv.getLength(), sv.getData(), - observables, operations, trainableParams}; - - adj.adjointJacobian(jac, jd); - - return py::array_t(py::cast(jac)); - }); - - //***********************************************************************// - // VJP - //***********************************************************************// - - class_name = "VectorJacobianProductC" + bitsize; - py::class_>(m, class_name.c_str(), - py::module_local()) - .def(py::init<>()) - 
.def("create_ops_list", - [](VectorJacobianProduct &v, - const std::vector &ops_name, - const std::vector &ops_params, - const std::vector> &ops_wires, - const std::vector &ops_inverses, - const std::vector &ops_matrices) { - std::vector> conv_params( - ops_params.size()); - std::vector>> - conv_matrices(ops_matrices.size()); - static_cast(v); - for (size_t op = 0; op < ops_name.size(); op++) { - const auto p_buffer = ops_params[op].request(); - const auto m_buffer = ops_matrices[op].request(); - if (p_buffer.size) { - const auto *const p_ptr = - static_cast(p_buffer.ptr); - conv_params[op] = - std::vector{p_ptr, p_ptr + p_buffer.size}; - } - if (m_buffer.size) { - const auto m_ptr = - static_cast *>( - m_buffer.ptr); - conv_matrices[op] = std::vector>{ - m_ptr, m_ptr + m_buffer.size}; - } - } - return OpsData{ops_name, conv_params, ops_wires, - ops_inverses, conv_matrices}; - }) - .def("compute_vjp_from_jac", - &VectorJacobianProduct::computeVJP) - .def("compute_vjp_from_jac", - [](VectorJacobianProduct &v, - const std::vector &jac, - const std::vector &dy_row, size_t m, size_t n) { - std::vector vjp_res(n); - v.computeVJP(vjp_res, jac, dy_row, m, n); - return py::array_t(py::cast(vjp_res)); - }) - .def("vjp_fn", - [](VectorJacobianProduct &v, - const std::vector &dy, size_t num_params) { - auto fn = v.vectorJacobianProduct(dy, num_params); - return py::cpp_function( - [fn, num_params]( - const StateVectorRawCPU &sv, - const std::vector> &observables, - const OpsData &operations, - const std::vector &trainableParams) { - const JacobianData jd{ - num_params, sv.getLength(), sv.getData(), - observables, operations, trainableParams}; - return py::array_t(py::cast(fn(jd))); - }); - }); - //***********************************************************************// // Measures //***********************************************************************// @@ -371,6 +132,230 @@ void lightning_class_bindings(py::module_ &m) { }); } +template +void registerAlgorithms(py::module_ &m) { + const std::string bitsize = + std::to_string(sizeof(std::complex) * 8); + + //***********************************************************************// + // Observable + //***********************************************************************// + + using np_arr_c = py::array_t, py::array::c_style>; + using np_arr_r = py::array_t; + + std::string class_name; + + class_name = "ObservableC" + bitsize; + py::class_, std::shared_ptr>>( + m, class_name.c_str(), py::module_local()); + + class_name = "NamedObsC" + bitsize; + py::class_, std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init( + [](const std::string &name, const std::vector &wires) { + return NamedObs(name, wires); + })) + .def("__repr__", &NamedObs::getObsName) + .def("get_wires", &NamedObs::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const NamedObs &self, py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + + class_name = "HermitianObsC" + bitsize; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init([](const np_arr_c &matrix, + const std::vector &wires) { + auto buffer = matrix.request(); + const auto *ptr = + static_cast *>(buffer.ptr); + return HermitianObs( + std::vector>(ptr, ptr + buffer.size), + wires); + })) + .def("__repr__", &HermitianObs::getObsName) + .def("get_wires", &HermitianObs::getWires, + "Get wires of 
observables") + .def( + "__eq__", + [](const HermitianObs &self, py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + + class_name = "TensorProdObsC" + bitsize; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init( + [](const std::vector>> + &obs) { return TensorProdObs(obs); })) + .def("__repr__", &TensorProdObs::getObsName) + .def("get_wires", &TensorProdObs::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const TensorProdObs &self, + py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + + class_name = "HamiltonianC" + bitsize; + using ObsPtr = std::shared_ptr>; + py::class_, + std::shared_ptr>, + Observable>(m, class_name.c_str(), + py::module_local()) + .def(py::init([](const np_arr_r &coeffs, + const std::vector &obs) { + auto buffer = coeffs.request(); + const auto ptr = static_cast(buffer.ptr); + return Hamiltonian{std::vector(ptr, ptr + buffer.size), + obs}; + })) + .def("__repr__", &Hamiltonian::getObsName) + .def("get_wires", &Hamiltonian::getWires, + "Get wires of observables") + .def( + "__eq__", + [](const Hamiltonian &self, py::handle other) -> bool { + if (!py::isinstance>(other)) { + return false; + } + auto other_cast = other.cast>(); + return self == other_cast; + }, + "Compare two observables"); + + //***********************************************************************// + // Operations + //***********************************************************************// + + class_name = "OpsStructC" + bitsize; + py::class_>(m, class_name.c_str(), py::module_local()) + .def(py::init< + const std::vector &, + const std::vector> &, + const std::vector> &, + const std::vector &, + const std::vector>> &>()) + .def("__repr__", [](const OpsData &ops) { + using namespace Pennylane::Util; + std::ostringstream ops_stream; + for (size_t op = 0; op < ops.getSize(); op++) { + ops_stream << "{'name': " << ops.getOpsName()[op]; + ops_stream << ", 'params': " << ops.getOpsParams()[op]; + ops_stream << ", 'inv': " << ops.getOpsInverses()[op]; + ops_stream << "}"; + if (op < ops.getSize() - 1) { + ops_stream << ","; + } + } + return "Operations: [" + ops_stream.str() + "]"; + }); + + /** + * Create operation list + * */ + std::string function_name = "create_ops_list_C" + bitsize; + m.def( + function_name.c_str(), + [](const std::vector &ops_name, + const std::vector> &ops_params, + const std::vector> &ops_wires, + const std::vector &ops_inverses, + const std::vector &ops_matrices) { + std::vector>> conv_matrices( + ops_matrices.size()); + for (size_t op = 0; op < ops_name.size(); op++) { + const auto m_buffer = ops_matrices[op].request(); + if (m_buffer.size) { + const auto m_ptr = + static_cast *>(m_buffer.ptr); + conv_matrices[op] = std::vector>{ + m_ptr, m_ptr + m_buffer.size}; + } + } + return OpsData{ops_name, ops_params, ops_wires, + ops_inverses, conv_matrices}; + }, + "Create a list of operations from data."); + m.def( + "adjoint_jacobian", + [](const StateVectorRawCPU &sv, + const std::vector>> + &observables, + const OpsData &operations, + const std::vector &trainableParams) { + std::vector jac( + observables.size() * trainableParams.size(), PrecisionT{0.0}); + + const JacobianData jd{operations.getTotalNumParams(), + sv.getLength(), + sv.getData(), + 
observables,
+                                         operations,
+                                         trainableParams};
+
+            adjointJacobian(std::span{jac}, jd);
+
+            return py::array_t<PrecisionT>(py::cast(jac));
+        },
+        "Compute the Jacobian of the circuit using the adjoint method.");
+
+    m.def(
+        "statevector_vjp",
+        /* Do not cast a non-conforming array. The argument trainableParams
+         * should only contain indices for operations.
+         */
+        [](const StateVectorRawCPU<PrecisionT> &sv,
+           const OpsData<PrecisionT> &operations, const np_arr_c &dy,
+           const std::vector<size_t> &trainableParams) {
+            std::vector<std::complex<PrecisionT>> vjp(
+                trainableParams.size(), std::complex<PrecisionT>{});
+
+            const JacobianData<PrecisionT> jd{operations.getTotalNumParams(),
+                                              sv.getLength(),
+                                              sv.getData(),
+                                              {},
+                                              operations,
+                                              trainableParams};
+            const auto buffer = dy.request();
+
+            statevectorVJP(
+                std::span{vjp}, jd,
+                std::span{
+                    static_cast<const std::complex<PrecisionT> *>(buffer.ptr),
+                    static_cast<size_t>(buffer.size)});
+
+            return py::array_t<std::complex<PrecisionT>>(py::cast(vjp));
+        },
+        "Compute the vector-Jacobian product of the circuit using the "
+        "adjoint method.");
+}
+
 /**
  * @brief Add C++ classes, methods and functions to Python module.
  */
@@ -405,6 +390,13 @@ PYBIND11_MODULE(lightning_qubit_ops, // NOLINT: No control over Pybind internals
                     &Gates::getIndicesAfterExclusion),
           "Get statevector indices for gate application");
 
+    /* Algorithms submodule */
+    py::module_ alg_submodule = m.def_submodule(
+        "adjoint_diff", "A submodule for the adjoint differentiation method.");
+
+    registerAlgorithms<float>(alg_submodule);
+    registerAlgorithms<double>(alg_submodule);
+
     /* Add CPUMemoryModel enum class */
     py::enum_<CPUMemoryModel>(m, "CPUMemoryModel")
         .value("Unaligned", CPUMemoryModel::Unaligned)
diff --git a/pennylane_lightning/src/bindings/Bindings.hpp b/pennylane_lightning/src/bindings/Bindings.hpp
index 0f328a9143..693d757a16 100644
--- a/pennylane_lightning/src/bindings/Bindings.hpp
+++ b/pennylane_lightning/src/bindings/Bindings.hpp
@@ -19,7 +19,6 @@
 #pragma once
 #include "AdjointDiff.hpp"
 #include "CPUMemoryModel.hpp"
-#include "JacobianProd.hpp"
 #include "Kokkos_Sparse.hpp"
 #include "Macros.hpp"
 #include "Measures.hpp"
@@ -138,8 +137,8 @@ auto alignedNumpyArray(CPUMemoryModel memory_model, size_t size)
     -> pybind11::array {
     if (getAlignment<T>(memory_model) > alignof(std::max_align_t)) {
         void *ptr =
-            alignedAlloc(getAlignment<T>(memory_model), sizeof(T) * size);
-        auto capsule = pybind11::capsule(ptr, &alignedFree);
+            Util::alignedAlloc(getAlignment<T>(memory_model), sizeof(T) * size);
+        auto capsule = pybind11::capsule(ptr, &Util::alignedFree);
         return pybind11::array{
             pybind11::dtype::of<T>(), {size}, {sizeof(T)}, ptr, capsule};
     } // else
@@ -230,7 +229,7 @@ void registerGatesForStateVector(PyClass &pyclass) {
     Util::for_each_enum<GateOperation>([&pyclass](GateOperation gate_op) {
         const auto gate_name =
-            std::string(lookup(Constant::gate_names, gate_op));
+            std::string(Util::lookup(Constant::gate_names, gate_op));
         const std::string doc = "Apply the " + gate_name + " gate.";
         auto func = [gate_name = gate_name](
                         SVType &sv, const std::vector<size_t> &wires,
@@ -324,7 +323,7 @@ auto getCompileInfo() -> pybind11::dict {
  * @brief Return basic information of runtime environment
  */
 auto getRuntimeInfo() -> pybind11::dict {
-    using namespace Util::Constant;
+    using Util::RuntimeInfo;
     using namespace pybind11::literals;
 
     return pybind11::dict("AVX"_a = RuntimeInfo::AVX(),
diff --git a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
index f49458f79b..fab44c81cb 100644
--- a/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
+++ b/pennylane_lightning/src/simulator/StateVectorManagedCPU.hpp
@@ -150,5 +150,9 @@ class StateVectorManagedCPU
std::copy(new_data.data(), new_data.data() + new_data.size(), data_.data()); } + + Util::AlignedAllocator allocator() const { + return data_.get_allocator(); + } }; } // namespace Pennylane diff --git a/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp b/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp index eca8d9c694..42f0302fa6 100644 --- a/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp +++ b/pennylane_lightning/src/simulator/StateVectorRawCPU.hpp @@ -98,7 +98,7 @@ class StateVectorRawCPU * @param data New raw data pointer. * @param length The size of the data, i.e. 2^(number of qubits). */ - void setData(ComplexPrecisionT *data, size_t length) { + void changeDataPtr(ComplexPrecisionT *data, size_t length) { if (!Util::isPerfectPowerOf2(length)) { PL_ABORT("The length of the array for StateVector must be " "a perfect power of 2. But " + @@ -110,6 +110,21 @@ class StateVectorRawCPU length_ = length; } + /** + * @brief Set statevector data from another data. + * + * @param data New raw data pointer. + * @param length The size of the data, i.e. 2^(number of qubits). + */ + void setDataFrom(ComplexPrecisionT *new_data, size_t length) { + if (length != this->getLength()) { + PL_ABORT("The length of data to set must be the same as " + "the original data size"); // TODO: change to std::format + // in C++20 + } + std::copy(new_data, new_data + length, data_); + } + /** * @brief Get the number of data elements in the statevector array. * diff --git a/pennylane_lightning/src/tests/CMakeLists.txt b/pennylane_lightning/src/tests/CMakeLists.txt index 20f024530e..74a483bd39 100644 --- a/pennylane_lightning/src/tests/CMakeLists.txt +++ b/pennylane_lightning/src/tests/CMakeLists.txt @@ -68,7 +68,8 @@ target_link_libraries(compile_time_tests lightning_compile_options lightning_gat set(TEST_SOURCES CreateAllWires.cpp Test_AdjDiff.cpp -# Test_Bindings.cpp + Test_AlgUtil.cpp + #Test_Bindings.cpp Test_CompilerSupport.cpp Test_DynamicDispatcher.cpp Test_Error.cpp @@ -81,16 +82,18 @@ set(TEST_SOURCES CreateAllWires.cpp Test_GateUtil.cpp Test_Internal.cpp Test_KernelMap.cpp + Test_Kokkos_Sparse.cpp Test_LinearAlgebra.cpp Test_Measures.cpp - Test_Kokkos_Sparse.cpp Test_Measures_Sparse.cpp + Test_Observables.cpp Test_OpToMemberFuncPtr.cpp Test_RuntimeInfo.cpp + Test_StateVecAdjDiff.cpp Test_StateVectorManagedCPU.cpp Test_StateVectorRawCPU.cpp - Test_Util.cpp - Test_VectorJacobianProduct.cpp) + Test_Util.cpp) + #Test_VectorJacobianProduct.cpp) add_executable(runner ${TEST_SOURCES}) target_link_libraries(runner PRIVATE lightning_tests_dependency diff --git a/pennylane_lightning/src/tests/TestHelpers.hpp b/pennylane_lightning/src/tests/TestHelpers.hpp index 8b39342311..82ba46ab69 100644 --- a/pennylane_lightning/src/tests/TestHelpers.hpp +++ b/pennylane_lightning/src/tests/TestHelpers.hpp @@ -99,6 +99,55 @@ bool operator!=(const std::vector &lhs, return !rhs.compare(lhs); } +template struct PLApproxComplex { + const std::complex comp_; + + explicit PLApproxComplex(const std::complex &comp) + : comp_{comp} {} + + PrecisionT margin_{}; + PrecisionT epsilon_ = std::numeric_limits::epsilon() * 100; + + [[nodiscard]] bool compare(const std::complex &lhs) const { + return (lhs.real() == + Approx(comp_.real()).epsilon(epsilon_).margin(margin_)) && + (lhs.imag() == + Approx(comp_.imag()).epsilon(epsilon_).margin(margin_)); + } + [[nodiscard]] std::string describe() const { + std::ostringstream ss; + ss << "is Approx to " << comp_; + return ss.str(); + } + PLApproxComplex &epsilon(PrecisionT eps) { + 
epsilon_ = eps; + return *this; + } + PLApproxComplex &margin(PrecisionT m) { + margin_ = m; + return *this; + } +}; + +template +bool operator==(const std::complex &lhs, const PLApproxComplex &rhs) { + return rhs.compare(lhs); +} +template +bool operator!=(const std::complex &lhs, const PLApproxComplex &rhs) { + return !rhs.compare(lhs); +} + +template PLApproxComplex approx(const std::complex &val) { + return PLApproxComplex{val}; +} + +template +std::ostream &operator<<(std::ostream &os, const PLApproxComplex &approx) { + os << approx.describe(); + return os; +} + /** * @brief Utility function to compare complex statevector data. * @@ -229,6 +278,7 @@ auto createRandomState(RandomEngine &re, size_t num_qubits) * @brief Create an arbitrary product state in X- or Z-basis. * * Example: createProductState("+01") will produce |+01> state. + * Note that the wire index starts from the left. */ template auto createProductState(std::string_view str) diff --git a/pennylane_lightning/src/tests/Test_AdjDiff.cpp b/pennylane_lightning/src/tests/Test_AdjDiff.cpp index efe98bb2ac..a131433ab0 100644 --- a/pennylane_lightning/src/tests/Test_AdjDiff.cpp +++ b/pennylane_lightning/src/tests/Test_AdjDiff.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -23,31 +24,16 @@ using namespace Pennylane; using namespace Pennylane::Algorithms; -/** - * @brief Tests the constructability of the AdjointDiff.hpp classes. - * - */ -TEMPLATE_TEST_CASE("AdjointJacobian::AdjointJacobian", "[AdjointJacobian]", - float, double) { - SECTION("AdjointJacobian") { - REQUIRE(std::is_constructible>::value); - } - SECTION("AdjointJacobian {}") { - REQUIRE(std::is_constructible>::value); - } -} - -TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=Z", - "[AdjointJacobian]") { - AdjointJacobian adj; - std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; - +TEST_CASE("Algorithms::adjointJacobian Op=RX, Obs=Z", "[Algorithms]") { + const std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + const std::vector tp{0}; { const size_t num_qubits = 1; const size_t num_params = 3; const size_t num_obs = 1; - auto obs = ObsDatum({"PauliZ"}, {{}}, {{0}}); - std::vector jacobian(num_obs * num_params, 0); + const auto obs = std::make_shared>( + "PauliZ", std::vector{0}); + std::vector jacobian(num_obs * tp.size(), 0); for (const auto &p : param) { auto ops = OpsData({"RX"}, {{p}}, {{0}}, {false}); @@ -57,29 +43,83 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=Z", StateVectorRawCPU psi(cdata.data(), cdata.size()); - std::vector tp{0}; - std::vector> obs_ls{obs}; JacobianData tape{ - num_params, psi.getLength(), psi.getData(), obs_ls, ops, tp}; + num_params, psi.getLength(), psi.getData(), {obs}, ops, tp}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); CAPTURE(jacobian); CHECK(-sin(p) == Approx(jacobian[0])); } } } -TEST_CASE("AdjointJacobian::adjointJacobian Op=RY, Obs=X", - "[AdjointJacobian]") { - AdjointJacobian adj; + +TEST_CASE("Algorithms::adjointJacobian without trainable params", + "[Algorithms]") { + const std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + const std::vector tp{}; + { + const size_t num_qubits = 1; + const size_t num_params = 3; + const size_t num_obs = 1; + const auto obs = std::make_shared>( + "PauliZ", std::vector{0}); + std::vector jacobian(num_obs * tp.size(), 0); + + for (const auto &p : param) { + auto ops = OpsData({"RX"}, {{p}}, {{0}}, {false}); + + std::vector> cdata(1U << num_qubits); + cdata[0] = 
std::complex{1, 0}; + + StateVectorRawCPU psi(cdata.data(), cdata.size()); + + JacobianData tape{ + num_params, psi.getLength(), psi.getData(), {obs}, ops, tp}; + + REQUIRE_NOTHROW(adjointJacobian(std::span{jacobian}, tape, true)); + } + } +} + +TEST_CASE( + "Algorithms::adjointJacobian throws an exception when size mismatches", + "[Algorithms]") { + const std::vector tp{0, 1}; + const size_t num_qubits = 1; + const size_t num_params = 3; + const size_t num_obs = 1; + const auto obs = + std::make_shared>("PauliZ", std::vector{0}); + std::vector jacobian(num_obs * tp.size() - 1, 0); + + auto ops = OpsData({"RX"}, {{0.742}}, {{0}}, {false}); + + std::vector> cdata(1U << num_qubits); + cdata[0] = std::complex{1, 0}; + + StateVectorRawCPU psi(cdata.data(), cdata.size()); + + JacobianData tape{ + num_params, psi.getLength(), psi.getData(), {obs}, ops, tp}; + + PL_REQUIRE_THROWS_MATCHES( + adjointJacobian(std::span{jacobian}, tape, true), + Util::LightningException, + "The size of preallocated jacobian must be same as"); +} + +TEST_CASE("Algorithms::adjointJacobian Op=RY, Obs=X", "[Algorithms]") { std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector tp{0}; { const size_t num_qubits = 1; const size_t num_params = 3; const size_t num_obs = 1; - auto obs = ObsDatum({"PauliX"}, {{}}, {{0}}); - std::vector jacobian(num_obs * num_params, 0); + const auto obs = std::make_shared>( + "PauliX", std::vector{0}); + std::vector jacobian(num_obs * tp.size(), 0); for (const auto &p : param) { auto ops = OpsData({"RY"}, {{p}}, {{0}}, {false}); @@ -89,77 +129,78 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=RY, Obs=X", StateVectorRawCPU psi(cdata.data(), cdata.size()); - std::vector tp{0}; - std::vector> obs_ls{obs}; JacobianData tape{ - num_params, psi.getLength(), psi.getData(), obs_ls, ops, tp}; + num_params, psi.getLength(), psi.getData(), {obs}, ops, tp}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); CAPTURE(jacobian); CHECK(cos(p) == Approx(jacobian[0]).margin(1e-7)); } } } -TEST_CASE("AdjointJacobian::adjointJacobian Op=RX, Obs=[Z,Z]", - "[AdjointJacobian]") { - AdjointJacobian adj; + +TEST_CASE("Algorithms::adjointJacobian Op=RX, Obs=[Z,Z]", "[Algorithms]") { std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector tp{0}; { const size_t num_qubits = 2; const size_t num_params = 1; const size_t num_obs = 2; - std::vector jacobian(num_obs * num_params, 0); + std::vector jacobian(num_obs * tp.size(), 0); std::vector> cdata(1U << num_qubits); StateVectorRawCPU psi(cdata.data(), cdata.size()); cdata[0] = std::complex{1, 0}; - auto obs1 = ObsDatum({"PauliZ"}, {{}}, {{0}}); - auto obs2 = ObsDatum({"PauliZ"}, {{}}, {{1}}); + const auto obs1 = std::make_shared>( + "PauliZ", std::vector{0}); + const auto obs2 = std::make_shared>( + "PauliZ", std::vector{1}); auto ops = OpsData({"RX"}, {{param[0]}}, {{0}}, {false}); - std::vector tp{0}; - std::vector> obs_ls{obs1, obs2}; JacobianData tape{ - num_params, psi.getLength(), psi.getData(), obs_ls, ops, tp}; + num_params, psi.getLength(), psi.getData(), {obs1, obs2}, ops, tp}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); CAPTURE(jacobian); CHECK(-sin(param[0]) == Approx(jacobian[0]).margin(1e-7)); CHECK(0.0 == Approx(jacobian[1 * num_obs - 1]).margin(1e-7)); } } -TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z]", - "[AdjointJacobian]") { - AdjointJacobian adj; + +TEST_CASE("Algorithms::adjointJacobian Op=[RX,RX,RX], 
Obs=[Z,Z,Z]", + "[Algorithms]") { std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector tp{0, 1, 2}; { const size_t num_qubits = 3; const size_t num_params = 3; const size_t num_obs = 3; - std::vector jacobian(num_obs * num_params, 0); + std::vector jacobian(num_obs * tp.size(), 0); std::vector> cdata(1U << num_qubits); StateVectorRawCPU psi(cdata.data(), cdata.size()); cdata[0] = std::complex{1, 0}; - auto obs1 = ObsDatum({"PauliZ"}, {{}}, {{0}}); - auto obs2 = ObsDatum({"PauliZ"}, {{}}, {{1}}); - auto obs3 = ObsDatum({"PauliZ"}, {{}}, {{2}}); + const auto obs1 = std::make_shared>( + "PauliZ", std::vector{0}); + const auto obs2 = std::make_shared>( + "PauliZ", std::vector{1}); + const auto obs3 = std::make_shared>( + "PauliZ", std::vector{2}); auto ops = OpsData({"RX", "RX", "RX"}, {{param[0]}, {param[1]}, {param[2]}}, {{0}, {1}, {2}}, {false, false, false}); - std::vector tp{0, 1, 2}; - std::vector> obs_ls{obs1, obs2, obs3}; - JacobianData tape{ - num_params, psi.getLength(), psi.getData(), obs_ls, ops, tp}; + JacobianData tape{num_params, psi.getLength(), + psi.getData(), {obs1, obs2, obs3}, + ops, tp}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); CAPTURE(jacobian); CHECK(-sin(param[0]) == Approx(jacobian[0]).margin(1e-7)); @@ -169,69 +210,76 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z]", Approx(jacobian[2 * num_params + 2]).margin(1e-7)); } } -TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z], " + +TEST_CASE("Algorithms::adjointJacobian Op=[RX,RX,RX], Obs=[Z,Z,Z], " "TParams=[0,2]", - "[AdjointJacobian]") { - AdjointJacobian adj; + "[Algorithms]") { std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector t_params{0, 2}; { const size_t num_qubits = 3; const size_t num_params = 3; const size_t num_obs = 3; - std::vector jacobian(num_obs * num_params, 0); - std::vector t_params{0, 2}; + std::vector jacobian(num_obs * t_params.size(), 0); std::vector> cdata(1U << num_qubits); StateVectorRawCPU psi(cdata.data(), cdata.size()); cdata[0] = std::complex{1, 0}; - auto obs1 = ObsDatum({"PauliZ"}, {{}}, {{0}}); - auto obs2 = ObsDatum({"PauliZ"}, {{}}, {{1}}); - auto obs3 = ObsDatum({"PauliZ"}, {{}}, {{2}}); + const auto obs1 = std::make_shared>( + "PauliZ", std::vector{0}); + const auto obs2 = std::make_shared>( + "PauliZ", std::vector{1}); + const auto obs3 = std::make_shared>( + "PauliZ", std::vector{2}); auto ops = OpsData({"RX", "RX", "RX"}, {{param[0]}, {param[1]}, {param[2]}}, {{0}, {1}, {2}}, {false, false, false}); - std::vector> obs_ls{obs1, obs2, obs3}; - JacobianData tape{ - num_params, psi.getLength(), psi.getData(), obs_ls, ops, t_params}; + JacobianData tape{num_params, psi.getLength(), + psi.getData(), {obs1, obs2, obs3}, + ops, t_params}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); CAPTURE(jacobian); CHECK(-sin(param[0]) == Approx(jacobian[0]).margin(1e-7)); - CHECK(0 == Approx(jacobian[1 * num_params + 1]).margin(1e-7)); + CHECK(0 == Approx(jacobian[1 * t_params.size() + 1]).margin(1e-7)); CHECK(-sin(param[2]) == - Approx(jacobian[2 * num_params + 1]).margin(1e-7)); + Approx(jacobian[2 * t_params.size() + 1]).margin(1e-7)); } } -TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]", - "[AdjointJacobian]") { - AdjointJacobian adj; + +TEST_CASE("Algorithms::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]", + "[Algorithms]") { std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector 
tp{0, 1, 2}; { const size_t num_qubits = 3; const size_t num_params = 3; const size_t num_obs = 1; - std::vector jacobian(num_obs * num_params, 0); + std::vector jacobian(num_obs * tp.size(), 0); std::vector> cdata(1U << num_qubits); StateVectorRawCPU psi(cdata.data(), cdata.size()); cdata[0] = std::complex{1, 0}; - auto obs = ObsDatum({"PauliZ", "PauliZ", "PauliZ"}, - {{}, {}, {}}, {{0}, {1}, {2}}); + const auto obs = std::make_shared>( + std::make_shared>("PauliZ", + std::vector{0}), + std::make_shared>("PauliZ", + std::vector{1}), + std::make_shared>("PauliZ", + std::vector{2})); auto ops = OpsData({"RX", "RX", "RX"}, {{param[0]}, {param[1]}, {param[2]}}, {{0}, {1}, {2}}, {false, false, false}); - std::vector tp{0, 1, 2}; - std::vector> obs_ls{obs}; JacobianData tape{ - num_params, psi.getLength(), psi.getData(), obs_ls, ops, tp}; + num_params, psi.getLength(), psi.getData(), {obs}, ops, tp}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); CAPTURE(jacobian); @@ -241,22 +289,27 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=[RX,RX,RX], Obs=[ZZZ]", CHECK(-0.6312451595102775 == Approx(jacobian[2]).margin(1e-7)); } } -TEST_CASE("AdjointJacobian::adjointJacobian Op=Mixed, Obs=[XXX]", - "[AdjointJacobian]") { - AdjointJacobian adj; + +TEST_CASE("Algorithms::adjointJacobian Op=Mixed, Obs=[XXX]", "[Algorithms]") { std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector tp{0, 1, 2, 3, 4, 5}; { const size_t num_qubits = 3; const size_t num_params = 6; const size_t num_obs = 1; - std::vector jacobian(num_obs * num_params, 0); + std::vector jacobian(num_obs * tp.size(), 0); std::vector> cdata(1U << num_qubits); StateVectorRawCPU psi(cdata.data(), cdata.size()); cdata[0] = std::complex{1, 0}; - auto obs = ObsDatum({"PauliX", "PauliX", "PauliX"}, - {{}, {}, {}}, {{0}, {1}, {2}}); + const auto obs = std::make_shared>( + std::make_shared>("PauliX", + std::vector{0}), + std::make_shared>("PauliX", + std::vector{1}), + std::make_shared>("PauliX", + std::vector{2})); auto ops = OpsData( {"RZ", "RY", "RZ", "CNOT", "CNOT", "RZ", "RY", "RZ"}, {{param[0]}, @@ -270,12 +323,10 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=Mixed, Obs=[XXX]", {{0}, {0}, {0}, {0, 1}, {1, 2}, {1}, {1}, {1}}, {false, false, false, false, false, false, false, false}); - std::vector tp{0, 1, 2, 3, 4, 5}; - std::vector> obs_ls{obs}; JacobianData tape{ - num_params, psi.getLength(), psi.getData(), obs_ls, ops, tp}; + num_params, psi.getLength(), psi.getData(), {obs}, ops, tp}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); CAPTURE(jacobian); @@ -288,63 +339,67 @@ TEST_CASE("AdjointJacobian::adjointJacobian Op=Mixed, Obs=[XXX]", CHECK(0.323846156 == Approx(jacobian[5]).margin(1e-7)); } } -TEST_CASE("AdjointJacobian::adjointJacobian Decomposed Rot gate, non " + +TEST_CASE("Algorithms::adjointJacobian Decomposed Rot gate, non " "computational basis state", - "[AdjointJacobian]") { - AdjointJacobian adj; - std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + "[Algorithms]") { + using namespace Pennylane::Util; + const std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + const std::vector tp{0, 1, 2}; { const size_t num_params = 3; const size_t num_obs = 1; const auto thetas = Util::linspace(-2 * M_PI, 2 * M_PI, 7); - std::unordered_map> expec_results{ - {thetas[0], {0, -9.90819496e-01, 0}}, - {thetas[1], {-8.18996553e-01, 1.62526544e-01, 0}}, - {thetas[2], {-0.203949, 0.48593716, 0}}, - {thetas[3], {0, 1, 0}}, - 
{thetas[4], {-2.03948985e-01, 4.85937177e-01, 0}}, - {thetas[5], {-8.18996598e-01, 1.62526487e-01, 0}}, - {thetas[6], {0, -9.90819511e-01, 0}}}; - - for (const auto &theta : thetas) { + std::vector> expec_results{ + {0, -9.90819496e-01, 0}, + {-8.18996553e-01, 1.62526544e-01, 0}, + {-0.203949, 0.48593716, 0}, + {0, 1, 0}, + {-2.03948985e-01, 4.85937177e-01, 0}, + {-8.18996598e-01, 1.62526487e-01, 0}, + {0, -9.90819511e-01, 0}}; + + const auto obs = std::make_shared>( + "PauliZ", std::vector{0}); + + for (size_t i = 0; i < thetas.size(); i++) { + const auto theta = thetas[i]; std::vector local_params{theta, std::pow(theta, 3), SQRT2() * theta}; - std::vector jacobian(num_obs * num_params, 0); + std::vector jacobian(num_obs * tp.size(), 0); std::vector> cdata{INVSQRT2(), -INVSQRT2()}; StateVectorRawCPU psi(cdata.data(), cdata.size()); - auto obs = ObsDatum({"PauliZ"}, {{}}, {{0}}); auto ops = OpsData( {"RZ", "RY", "RZ"}, {{local_params[0]}, {local_params[1]}, {local_params[2]}}, {{0}, {0}, {0}}, {false, false, false}); - std::vector tp{0, 1, 2}; - std::vector> obs_ls{obs}; JacobianData tape{ - num_params, psi.getLength(), psi.getData(), obs_ls, ops, tp}; + num_params, psi.getLength(), psi.getData(), {obs}, ops, tp}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); CAPTURE(theta); CAPTURE(jacobian); // Computed with PennyLane using default.qubit - CHECK(expec_results[theta][0] == Approx(jacobian[0]).margin(1e-7)); - CHECK(expec_results[theta][1] == Approx(jacobian[1]).margin(1e-7)); - CHECK(expec_results[theta][2] == Approx(jacobian[2]).margin(1e-7)); + CHECK(expec_results[i][0] == Approx(jacobian[0]).margin(1e-7)); + CHECK(expec_results[i][1] == Approx(jacobian[1]).margin(1e-7)); + CHECK(expec_results[i][2] == Approx(jacobian[2]).margin(1e-7)); } } } -TEST_CASE("AdjointJacobian::adjointJacobian Mixed Ops, Obs and TParams", - "[AdjointJacobian]") { - AdjointJacobian adj; + +TEST_CASE("Algorithms::adjointJacobian Mixed Ops, Obs and TParams", + "[Algorithms]") { + using namespace Pennylane::Util; std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + const std::vector t_params{1, 2, 3}; { - const std::vector t_params{1, 2, 3}; const size_t num_obs = 1; const auto thetas = Util::linspace(-2 * M_PI, 2 * M_PI, 8); @@ -357,7 +412,12 @@ TEST_CASE("AdjointJacobian::adjointJacobian Mixed Ops, Obs and TParams", ZERO(), ZERO()}; StateVectorRawCPU psi(cdata.data(), cdata.size()); - auto obs = ObsDatum({"PauliX", "PauliZ"}, {{}, {}}, {{0}, {1}}); + const auto obs = std::make_shared>( + std::make_shared>("PauliX", + std::vector{0}), + std::make_shared>("PauliZ", + std::vector{1})); + auto ops = OpsData( {"Hadamard", "RX", "CNOT", "RZ", "RY", "RZ", "RZ", "RY", "RZ", "RZ", "RY", "CNOT"}, @@ -377,12 +437,11 @@ TEST_CASE("AdjointJacobian::adjointJacobian Mixed Ops, Obs and TParams", {false, false, false, false, false, false, false, false, false, false, false, false}); - std::vector> obs_ls{obs}; JacobianData tape{ - t_params.size(), psi.getLength(), psi.getData(), obs_ls, ops, + t_params.size(), psi.getLength(), psi.getData(), {obs}, ops, t_params}; - adj.adjointJacobian(jacobian, tape, true); + adjointJacobian(std::span{jacobian}, tape, true); std::vector expected{-0.71429188, 0.04998561, -0.71904837}; // Computed with PennyLane using default.qubit @@ -392,88 +451,182 @@ TEST_CASE("AdjointJacobian::adjointJacobian Mixed Ops, Obs and TParams", } } -TEST_CASE("AdjointJacobian::applyObservable visitor checks", - "[AdjointJacobian]") { - SECTION("Obs with params 
0") { - AdjointJacobian adj; - std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; - std::vector expec_results{0.90096887, 0.80901699, -0.5}; - - auto obs_default = ObsDatum({"PauliZ"}, {{}}, {{0}}); - auto ops = - OpsData({"RX"}, {{expec_results[0]}}, {{0}}, {false}); - std::vector out_data(1); - - for (std::size_t i = 0; i < param.size(); i++) { - StateVectorManagedCPU psi(2); - JacobianData jd(1, psi.getLength(), psi.getData(), - {obs_default}, ops, {1}); - adj.adjointJacobian(out_data, jd, true); - } +TEST_CASE("Algorithms::adjointJacobian Op=RX, Obs=Ham[Z0+Z1]", "[Algorithms]") { + std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector tp{0}; + { + const size_t num_qubits = 2; + const size_t num_params = 1; + const size_t num_obs = 1; + std::vector jacobian(num_obs * tp.size(), 0); + + std::vector> cdata(1U << num_qubits); + StateVectorRawCPU psi(cdata.data(), cdata.size()); + cdata[0] = std::complex{1, 0}; + + const auto obs1 = std::make_shared>( + "PauliZ", std::vector{0}); + const auto obs2 = std::make_shared>( + "PauliZ", std::vector{1}); + + auto ham = Hamiltonian::create({0.3, 0.7}, {obs1, obs2}); + + auto ops = OpsData({"RX"}, {{param[0]}}, {{0}}, {false}); + + JacobianData tape{ + num_params, psi.getLength(), psi.getData(), {ham}, ops, tp}; + + adjointJacobian(std::span{jacobian}, tape, true); + + CAPTURE(jacobian); + CHECK(-0.3 * sin(param[0]) == Approx(jacobian[0]).margin(1e-7)); } - SECTION("Obs with params std::vector>") { - AdjointJacobian adj; - std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; - std::vector expec_results{0.90096887, 0.80901699, -0.5}; - using v_type = std::vector>; - - v_type z_par{ONE(), ZERO(), ZERO(), - ZERO()}; - - auto obs_default = ObsDatum({"MyPauliZ"}, {z_par}, {{0}}); - - auto ops = - OpsData({"RX"}, {{expec_results[0]}}, {{0}}, {false}); - std::vector out_data(1); - - for (std::size_t i = 0; i < param.size(); i++) { - StateVectorManagedCPU psi(2); - JacobianData jd(1, psi.getLength(), psi.getData(), - {obs_default}, ops, {1}); - adj.adjointJacobian(out_data, jd, true); - } +} + +TEST_CASE("Algorithms::adjointJacobian Op=[RX,RX,RX], Obs=Ham[Z0+Z1+Z2], " + "TParams=[0,2]", + "[Algorithms]") { + std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector t_params{0, 2}; + { + const size_t num_qubits = 3; + const size_t num_params = 3; + const size_t num_obs = 1; + std::vector jacobian(num_obs * t_params.size(), 0); + + std::vector> cdata(1U << num_qubits); + StateVectorRawCPU psi(cdata.data(), cdata.size()); + cdata[0] = std::complex{1, 0}; + + auto obs1 = std::make_shared>("PauliZ", + std::vector{0}); + auto obs2 = std::make_shared>("PauliZ", + std::vector{1}); + auto obs3 = std::make_shared>("PauliZ", + std::vector{2}); + + auto ham = + Hamiltonian::create({0.47, 0.32, 0.96}, {obs1, obs2, obs3}); + + auto ops = OpsData({"RX", "RX", "RX"}, + {{param[0]}, {param[1]}, {param[2]}}, + {{0}, {1}, {2}}, {false, false, false}); + + JacobianData tape{ + num_params, psi.getLength(), psi.getData(), {ham}, ops, t_params}; + + adjointJacobian(std::span{jacobian}, tape, true); + + CAPTURE(jacobian); + CHECK((-0.47 * sin(param[0]) == Approx(jacobian[0]).margin(1e-7))); + CHECK((-0.96 * sin(param[2]) == Approx(jacobian[1]).margin(1e-7))); } - SECTION("Obs with params std::vector") { - AdjointJacobian adj; - std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; - std::vector expec_results{0.90096887, 0.80901699, -0.5}; - using v_type = std::vector; +} + +template +std::vector randomIntVector(RandomEngine &re, size_t size, int min, + 
int max) { + std::uniform_int_distribution dist(min, max); + std::vector res; + + res.reserve(size); + for (size_t i = 0; i < size; i++) { + res.emplace_back(dist(re)); + } + return res; +} - v_type z_par{0.123}; +TEST_CASE( + "Algorithms::adjointJacobian with exceedingly complicated Hamiltonian", + "[Algorithms]") { + using namespace std::literals; + using Pennylane::Algorithms::detail::HamiltonianApplyInPlace; - auto obs_default = ObsDatum({"RZ"}, {z_par}, {{0}}); + std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector t_params{0, 2}; + + std::mt19937 re{1337}; + const size_t num_qubits = 8; + const size_t n_terms = 1024; + + std::array pauli_strs = {""sv, "PauliX"sv, "PauliY"sv, + "PauliZ"sv}; + + std::vector coeffs; + std::vector>> terms; - auto ops = - OpsData({"RX"}, {{expec_results[0]}}, {{0}}, {false}); - std::vector out_data(1); + std::uniform_real_distribution dist(-1.0, 1.0); - for (std::size_t i = 0; i < param.size(); i++) { - StateVectorManagedCPU psi(2); - JacobianData jd(1, psi.getLength(), psi.getData(), - {obs_default}, ops, {1}); - adj.adjointJacobian(out_data, jd, true); + for (size_t k = 0; k < n_terms; k++) { + auto term_pauli = randomIntVector(re, num_qubits, 0, 3); + + std::vector>> term_comp; + for (size_t i = 0; i < num_qubits; i++) { + if (term_pauli[i] == 0) { + continue; + } + auto wires = std::vector(); + wires.emplace_back(i); + auto ob = std::make_shared>( + std::string{pauli_strs[term_pauli[i]]}, wires); + term_comp.push_back(std::move(ob)); } + + coeffs.emplace_back(dist(re)); + terms.emplace_back(TensorProdObs::create(term_comp)); } - SECTION("Obs no params") { - AdjointJacobian adj; - std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; - std::vector expec_results{0.90096887, 0.80901699, -0.5}; - using v_type = std::vector>; - - v_type z_par{ONE(), ZERO(), ZERO(), - ZERO()}; - - auto obs_default = ObsDatum({"PauliZ"}, {}, {{0}}); - - auto ops = - OpsData({"RX"}, {{expec_results[0]}}, {{0}}, {false}); - std::vector out_data(1); - - for (std::size_t i = 0; i < param.size(); i++) { - StateVectorManagedCPU psi(2); - JacobianData jd(1, psi.getLength(), psi.getData(), - {obs_default}, ops, {1}); - adj.adjointJacobian(out_data, jd, true); - } + std::vector> psi(size_t{1} << num_qubits); + std::normal_distribution ndist; + for (auto &e : psi) { + e = ndist(re); + } + + StateVectorManagedCPU sv1(psi.data(), psi.size()); + StateVectorManagedCPU sv2(psi.data(), psi.size()); + + HamiltonianApplyInPlace::run(coeffs, terms, sv1); + HamiltonianApplyInPlace::run(coeffs, terms, sv2); + + REQUIRE(sv1.getDataVector() == PLApprox(sv2.getDataVector()).margin(1e-7)); +} + +TEST_CASE("Algorithms::adjointJacobian Test HermitianObs", "[Algorithms]") { + std::vector param{-M_PI / 7, M_PI / 5, 2 * M_PI / 3}; + std::vector t_params{0, 2}; + { + const size_t num_qubits = 3; + const size_t num_params = 3; + const size_t num_obs = 1; + std::vector jacobian1(num_obs * t_params.size(), 0); + std::vector jacobian2(num_obs * t_params.size(), 0); + + std::vector> cdata(1U << num_qubits); + StateVectorRawCPU psi(cdata.data(), cdata.size()); + cdata[0] = std::complex{1, 0}; + + auto obs1 = std::make_shared>( + std::make_shared>("PauliZ", + std::vector{0}), + std::make_shared>("PauliZ", + std::vector{1})); + auto obs2 = std::make_shared>( + std::vector>{1, 0, 0, 0, 0, -1, 0, 0, 0, 0, -1, + 0, 0, 0, 0, 1}, + std::vector{0, 1}); + + auto ops = OpsData({"RX", "RX", "RX"}, + {{param[0]}, {param[1]}, {param[2]}}, + {{0}, {1}, {2}}, {false, false, false}); + + JacobianData tape1{ + 
num_params, psi.getLength(), psi.getData(), {obs1}, ops, t_params}; + + JacobianData tape2{ + num_params, psi.getLength(), psi.getData(), {obs2}, ops, t_params}; + + adjointJacobian(std::span{jacobian1}, tape1, true); + adjointJacobian(std::span{jacobian2}, tape2, true); + + CHECK((jacobian1 == PLApprox(jacobian2).margin(1e-7))); } } diff --git a/pennylane_lightning/src/tests/Test_AlgUtil.cpp b/pennylane_lightning/src/tests/Test_AlgUtil.cpp new file mode 100644 index 0000000000..3698ada7ec --- /dev/null +++ b/pennylane_lightning/src/tests/Test_AlgUtil.cpp @@ -0,0 +1,78 @@ +#include "AlgUtil.hpp" +#include "Observables.hpp" + +#include + +using namespace Pennylane; +using namespace Pennylane::Algorithms; + +class TestException : public std::exception {}; + +template class TestObservable : public Observable { + public: + void + applyInPlace([[maybe_unused]] StateVectorManagedCPU &sv) const override { + throw TestException(); + } + + [[nodiscard]] auto + isEqual([[maybe_unused]] const Observable &other) const + -> bool override { + return true; + } + + [[nodiscard]] auto getObsName() const -> std::string override { + return "TestObservable"; + } + + [[nodiscard]] auto getWires() const -> std::vector override { + return {}; + } +}; + +TEMPLATE_TEST_CASE("applyObservables", "[Algorithms]", float, double) { + using PrecisionT = TestType; + + const size_t num_qubits = 8; + + SECTION("Exceptions are rethrown correctly") { + std::vector> states( + 8, StateVectorManagedCPU(num_qubits)); + + StateVectorManagedCPU ref_state(num_qubits); + + std::vector>> observables{ + std::make_shared>(), + std::make_shared>("PauliX", + std::vector{0}), + std::make_shared>(), + std::make_shared>("PauliX", + std::vector{0}), + std::make_shared>(), + std::make_shared>("PauliX", + std::vector{0}), + std::make_shared>(), + std::make_shared>("PauliX", + std::vector{0}), + }; + + REQUIRE_THROWS_AS( + applyObservables(states, ref_state, observables), + TestException); + } +} + +TEMPLATE_TEST_CASE("applyOperationsAdj", "[Algorithms]", float, double) { + using PrecisionT = TestType; + + const size_t num_qubits = 8; + + SECTION("Exceptions are rethrown correctly") { + std::vector> states( + 8, StateVectorManagedCPU(num_qubits)); + + OpsData ops_data{{"InvalidOpsName"}, {{}}, {{0, 1}}, {{}}}; + + REQUIRE_THROWS(applyOperationsAdj(states, ops_data, 0)); + } +} diff --git a/pennylane_lightning/src/tests/Test_Internal.cpp b/pennylane_lightning/src/tests/Test_Internal.cpp index bc8b5d44c7..c1409ea23b 100644 --- a/pennylane_lightning/src/tests/Test_Internal.cpp +++ b/pennylane_lightning/src/tests/Test_Internal.cpp @@ -78,16 +78,20 @@ TEMPLATE_TEST_CASE("createProductState", "[Test_Internal]", float, double) { REQUIRE(st == approx(expected).margin(margin)); } - SECTION("createProductState(\"+-0\") == |+-1> ") { + SECTION("createProductState(\"+-0\") != |+-1> ") { const auto st = createProductState("+-0"); - auto expected = createZeroState(3); - GateImplementationsPI::applyHadamard(expected.data(), 3, {0}, false); + auto expected = createZeroState(3); // |000> + GateImplementationsPI::applyHadamard(expected.data(), 3, {0}, + false); // |+00> - GateImplementationsPI::applyPauliX(expected.data(), 3, {1}, false); - GateImplementationsPI::applyHadamard(expected.data(), 3, {1}, false); + GateImplementationsPI::applyPauliX(expected.data(), 3, {1}, + false); // |+10> + GateImplementationsPI::applyHadamard(expected.data(), 3, {1}, + false); // |+-0> - GateImplementationsPI::applyPauliX(expected.data(), 3, {2}, false); + 
GateImplementationsPI::applyPauliX(expected.data(), 3, {2}, + false); // |+-1> REQUIRE(st != approx(expected).margin(margin)); } diff --git a/pennylane_lightning/src/tests/Test_LinearAlgebra.cpp b/pennylane_lightning/src/tests/Test_LinearAlgebra.cpp index 544043ed52..45c91bab6f 100644 --- a/pennylane_lightning/src/tests/Test_LinearAlgebra.cpp +++ b/pennylane_lightning/src/tests/Test_LinearAlgebra.cpp @@ -15,10 +15,8 @@ using namespace Pennylane; -// NOLINTNEXTLINE: Avoid complexity errors -TEMPLATE_TEST_CASE("Test linear algebra functions", "[Util][LinearAlgebra]", - float, double) { - using Util::Trans; +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEMPLATE_TEST_CASE("Inner product", "[Util][LinearAlgebra]", float, double) { SECTION("innerProd") { SECTION("Iterative increment") { for (size_t i = 0; i < 12; i++) { @@ -129,6 +127,11 @@ TEMPLATE_TEST_CASE("Test linear algebra functions", "[Util][LinearAlgebra]", CHECK(imag(result) == Approx(imag(expected_result)).margin(1e-7)); } } +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEMPLATE_TEST_CASE("Product", "[Util][LinearAlgebra]", float, double) { + using Util::Trans; SECTION("matrixVecProd") { SECTION("Simple Iterative with NoTranspose") { for (size_t m = 2; m < 8; m++) { @@ -207,6 +210,44 @@ TEMPLATE_TEST_CASE("Test linear algebra functions", "[Util][LinearAlgebra]", CHECK(v_out == approx(v_expected).margin(1e-7)); } + SECTION("Random Complex with Adjoint") { + std::vector> v_in{ + {0.643624335855, 0.578590708232}, + {0.538989919338, 0.466635790378}, + {0.618764845639, 0.599437240657}, + {0.357299642534, 0.053014685781}, + }; + std::vector> mat{ + {0.532146153405, 0.701149723765}, + {0.846447557122, 0.115252356911}, + {0.054946913257, 0.827064042981}, + {0.716903475670, 0.676278287205}, + {0.695253133818, 0.701356633873}, + {0.652528065087, 0.918467022349}, + {0.977379072529, 0.855331203864}, + {0.254556889390, 0.518154071409}, + {0.772606503900, 0.513854930480}, + {0.536592010310, 0.154591920456}, + {0.415274560257, 0.825736069441}, + {0.634991477205, 0.533747170519}, + {0.347378131788, 0.235064960717}, + {0.936386930317, 0.181149920417}, + {0.375506408059, 0.085886487536}, + {0.249630514363, 0.514992726206}, + }; + std::vector> v_expected{ + {2.372858970605, -0.117375105278}, + {2.160640571918, 0.435930598280}, + {2.330478161419, -0.778246884580}, + {2.061051094071, -0.301369019950}, + }; + + std::vector> v_out = + Util::matrixVecProd(mat, v_in, 4, 4, Trans::Adjoint); + CAPTURE(v_out); + + CHECK(v_out == approx(v_expected).margin(1e-7)); + } SECTION("Invalid Arguments") { using namespace Catch::Matchers; std::vector> mat(2 * 3, {1.0, 1.0}); @@ -276,6 +317,18 @@ TEMPLATE_TEST_CASE("Test linear algebra functions", "[Util][LinearAlgebra]", CHECK(v_out == approx(v_expected).margin(1e-7)); } + SECTION("In Place") { + std::vector v_in{1.0, 2.0, 3.0, 4.0}; + std::vector mat{1.0, 0.1, 0.2, 0.2, 0.6, 0.1, + 0.4, -0.7, 1.2, -0.5, -0.6, 0.7}; + std::vector v_expected{0.6, -3.2, 6.8}; + std::vector v_out1 = Util::vecMatrixProd(v_in, mat, 4, 3); + + std::vector v_out2(3, TestType{}); + Util::vecMatrixProd(v_out2, v_in, mat, 4, 3); + + CHECK(v_out2 == v_out1); + } SECTION("Invalid Arguments") { using namespace Catch::Matchers; std::vector v_in(4, {1.0}); @@ -316,121 +369,6 @@ TEMPLATE_TEST_CASE("Test linear algebra functions", "[Util][LinearAlgebra]", CHECK(v_out == nullptr); } } - SECTION("CFTranspose") { - SECTION("Simple Matrix") { - for (size_t m = 2; m < 10; m++) { - std::vector mat(m * m, {0}); - for 
(size_t i = 0; i < m; i++) { - mat[i * m + i] = 1.0; - } - std::vector mat_t(m * m); - Util::CFTranspose(mat.data(), mat_t.data(), m, m, - 0, m, 0, m); - - CAPTURE(mat_t); - CAPTURE(mat); - - CHECK(mat_t == approx(mat).margin(1e-7)); - } - } - SECTION("Random Complex") { - std::vector mat{ - 0.417876, 0.27448, 0.601209, 0.723548, 0.781624, - 0.538222, 0.0597232, 0.27755, 0.836569, - }; - std::vector mat_t_exp{ - 0.417876, 0.723548, 0.0597232, 0.27448, 0.781624, - 0.27755, 0.601209, 0.538222, 0.836569, - }; - std::vector mat_t(9); - Util::CFTranspose(mat.data(), mat_t.data(), 3, 3, 0, 3, - 0, 3); - - CAPTURE(mat_t); - CAPTURE(mat_t_exp); - - CHECK(mat_t == approx(mat_t_exp)); - } - SECTION("Random Complex non-square") { - std::vector mat{ - 0.417876, 0.27448, 0.601209, 0.723548, - 0.781624, 0.538222, 0.0597232, 0.27755, - }; - std::vector mat_t_exp{0.417876, 0.781624, 0.27448, - 0.538222, 0.601209, 0.0597232, - 0.723548, 0.27755}; - std::vector mat_t(8); - Util::CFTranspose(mat.data(), mat_t.data(), 2, 4, 0, 2, - 0, 4); - - CAPTURE(mat_t); - CAPTURE(mat_t_exp); - - CHECK(mat_t == approx(mat_t_exp)); - } - SECTION("Invalid Arguments") { - using namespace Catch::Matchers; - std::vector mat(2 * 3, {1.0}); - CHECK_THROWS_AS(Util::Transpose(mat, 2, 2), std::invalid_argument); - CHECK_THROWS_WITH( - Util::Transpose(mat, 2, 2), - Contains( - "Invalid number of rows and columns for the input matrix")); - } - } - SECTION("Transpose>") { - SECTION("Simple Matrix") { - for (size_t m = 2; m < 8; m++) { - std::vector> mat(m * m, {0, 0}); - for (size_t i = 0; i < m; i++) { - mat[i * m + i] = {1.0, 1.0}; - } - std::vector> mat_t = - Util::Transpose(mat, m, m); - - CAPTURE(mat_t); - CAPTURE(mat); - - CHECK(mat_t == approx(mat).margin(1e-7)); - } - } - SECTION("Random Complex") { - std::vector> mat{ - {0.417876, 0.27448}, {0.601209, 0.723548}, - {0.781624, 0.538222}, {0.0597232, 0.27755}, - {0.0431741, 0.593319}, {0.224124, 0.130335}, - {0.237877, 0.01557}, {0.931634, 0.786367}, - {0.378397, 0.894381}, {0.840747, 0.889789}, - {0.530623, 0.463644}, {0.868736, 0.760685}, - {0.258175, 0.836569}, {0.495012, 0.667726}, - {0.298962, 0.384992}, {0.659472, 0.232696}}; - std::vector> mat_t_exp{ - {0.417876, 0.27448}, {0.0431741, 0.593319}, - {0.378397, 0.894381}, {0.258175, 0.836569}, - {0.601209, 0.723548}, {0.224124, 0.130335}, - {0.840747, 0.889789}, {0.495012, 0.667726}, - {0.781624, 0.538222}, {0.237877, 0.01557}, - {0.530623, 0.463644}, {0.298962, 0.384992}, - {0.0597232, 0.27755}, {0.931634, 0.786367}, - {0.868736, 0.760685}, {0.659472, 0.232696}}; - std::vector> mat_t = - Util::Transpose(mat, 4, 4); - - CAPTURE(mat_t); - CAPTURE(mat_t_exp); - - CHECK(mat_t == approx(mat_t_exp)); - } - SECTION("Invalid Arguments") { - using namespace Catch::Matchers; - std::vector> mat(2 * 3, {1.0, 1.0}); - CHECK_THROWS_AS(Util::Transpose(mat, 2, 2), std::invalid_argument); - CHECK_THROWS_WITH( - Util::Transpose(mat, 2, 2), - Contains( - "Invalid number of rows and columns for the input matrix")); - } - } SECTION("matrixMatProd") { SECTION("Simple Iterative (Trans::Transpose)") { for (size_t m = 2; m < 8; m++) { @@ -621,15 +559,262 @@ TEMPLATE_TEST_CASE("Test linear algebra functions", "[Util][LinearAlgebra]", CHECK(m_out == nullptr); } } - SECTION("SquaredNorm") { - SECTION("For real type") { - std::vector vec{0.0, 1.0, 3.0, 10.0}; - CHECK(Util::squaredNorm(vec) == Approx(110.0)); +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEMPLATE_TEST_CASE("Transpose", "[Util][LinearAlgebra]", float, double) { + 
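+ // Covers the cache-friendly CFTranspose kernel together with the real and
+ // complex Transpose overloads, including their invalid-argument paths.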
SECTION("CFTranspose") { + SECTION("Simple Matrix") { + for (size_t m = 2; m < 10; m++) { + std::vector mat(m * m, {0}); + for (size_t i = 0; i < m; i++) { + mat[i * m + i] = 1.0; + } + std::vector mat_t(m * m); + Util::CFTranspose(mat.data(), mat_t.data(), m, m, + 0, m, 0, m); + + CAPTURE(mat_t); + CAPTURE(mat); + + CHECK(mat_t == approx(mat).margin(1e-7)); + } + } + SECTION("Random Complex") { + std::vector mat{ + 0.417876, 0.27448, 0.601209, 0.723548, 0.781624, + 0.538222, 0.0597232, 0.27755, 0.836569, + }; + std::vector mat_t_exp{ + 0.417876, 0.723548, 0.0597232, 0.27448, 0.781624, + 0.27755, 0.601209, 0.538222, 0.836569, + }; + std::vector mat_t(9); + Util::CFTranspose(mat.data(), mat_t.data(), 3, 3, 0, 3, + 0, 3); + + CAPTURE(mat_t); + CAPTURE(mat_t_exp); + + CHECK(mat_t == approx(mat_t_exp)); + } + SECTION("Random Complex non-square") { + std::vector mat{ + 0.417876, 0.27448, 0.601209, 0.723548, + 0.781624, 0.538222, 0.0597232, 0.27755, + }; + std::vector mat_t_exp{0.417876, 0.781624, 0.27448, + 0.538222, 0.601209, 0.0597232, + 0.723548, 0.27755}; + std::vector mat_t(8); + Util::CFTranspose(mat.data(), mat_t.data(), 2, 4, 0, 2, + 0, 4); + + CAPTURE(mat_t); + CAPTURE(mat_t_exp); + + CHECK(mat_t == approx(mat_t_exp)); + } + } + SECTION("Transpose") { + SECTION("Simple Matrix") { + for (size_t m = 2; m < 8; m++) { + std::vector> mat(m * m, {0, 0}); + for (size_t i = 0; i < m; i++) { + mat[i * m + i] = {1, 1}; + } + std::vector> mat_t = + Util::Transpose(mat, m, m); + + CAPTURE(mat_t); + CAPTURE(mat); + + CHECK(mat_t == approx(mat).margin(1e-7)); + } } + SECTION("Random Complex") { + std::vector> mat{ + {0.417876, 0.27448}, {0.601209, 0.723548}, + {0.781624, 0.538222}, {0.0597232, 0.27755}, + {0.0431741, 0.593319}, {0.224124, 0.130335}, + {0.237877, 0.01557}, {0.931634, 0.786367}, + {0.378397, 0.894381}, {0.840747, 0.889789}, + {0.530623, 0.463644}, {0.868736, 0.760685}, + {0.258175, 0.836569}, {0.495012, 0.667726}, + {0.298962, 0.384992}, {0.659472, 0.232696}}; + std::vector> mat_t_exp{ + {0.417876, 0.27448}, {0.0431741, 0.593319}, + {0.378397, 0.894381}, {0.258175, 0.836569}, + {0.601209, 0.723548}, {0.224124, 0.130335}, + {0.840747, 0.889789}, {0.495012, 0.667726}, + {0.781624, 0.538222}, {0.237877, 0.01557}, + {0.530623, 0.463644}, {0.298962, 0.384992}, + {0.0597232, 0.27755}, {0.931634, 0.786367}, + {0.868736, 0.760685}, {0.659472, 0.232696}}; + std::vector> mat_t = + Util::Transpose(mat, 4, 4); - SECTION("For complex type") { - std::vector> vec{{0.0, 1.0}, {3.0, 10.0}}; - CHECK(Util::squaredNorm(vec) == Approx(110.0)); + CAPTURE(mat_t); + CAPTURE(mat_t_exp); + + CHECK(mat_t == approx(mat_t_exp)); + } + SECTION("Invalid Arguments") { + using namespace Catch::Matchers; + std::vector mat(2 * 3, {1.0}); + CHECK_THROWS_AS( + Util::Transpose(std::span{mat}, 2, 2), + std::invalid_argument); + CHECK_THROWS_WITH( + Util::Transpose(mat, 2, 2), + Contains( + "Invalid number of rows and columns for the input matrix")); + } + } + SECTION("Transpose>") { + SECTION("Simple Matrix") { + for (size_t m = 2; m < 8; m++) { + std::vector> mat(m * m, {0, 0}); + for (size_t i = 0; i < m; i++) { + mat[i * m + i] = {1.0, 1.0}; + } + std::vector> mat_t = + Util::Transpose(mat, m, m); + + CAPTURE(mat_t); + CAPTURE(mat); + + CHECK(mat_t == approx(mat).margin(1e-7)); + } } + SECTION("Random Complex") { + std::vector> mat{ + {0.417876, 0.27448}, {0.601209, 0.723548}, + {0.781624, 0.538222}, {0.0597232, 0.27755}, + {0.0431741, 0.593319}, {0.224124, 0.130335}, + {0.237877, 0.01557}, {0.931634, 0.786367}, + 
{0.378397, 0.894381}, {0.840747, 0.889789}, + {0.530623, 0.463644}, {0.868736, 0.760685}, + {0.258175, 0.836569}, {0.495012, 0.667726}, + {0.298962, 0.384992}, {0.659472, 0.232696}}; + std::vector> mat_t_exp{ + {0.417876, 0.27448}, {0.0431741, 0.593319}, + {0.378397, 0.894381}, {0.258175, 0.836569}, + {0.601209, 0.723548}, {0.224124, 0.130335}, + {0.840747, 0.889789}, {0.495012, 0.667726}, + {0.781624, 0.538222}, {0.237877, 0.01557}, + {0.530623, 0.463644}, {0.298962, 0.384992}, + {0.0597232, 0.27755}, {0.931634, 0.786367}, + {0.868736, 0.760685}, {0.659472, 0.232696}}; + std::vector> mat_t = + Util::Transpose(mat, 4, 4); + + CAPTURE(mat_t); + CAPTURE(mat_t_exp); + + CHECK(mat_t == approx(mat_t_exp)); + } + SECTION("Invalid Arguments") { + using namespace Catch::Matchers; + std::vector> mat(2 * 3, {1.0, 1.0}); + CHECK_THROWS_AS(Util::Transpose(mat, 2, 2), std::invalid_argument); + CHECK_THROWS_WITH( + Util::Transpose(mat, 2, 2), + Contains( + "Invalid number of rows and columns for the input matrix")); + } + } +} + +TEMPLATE_TEST_CASE("Util::squaredNorm", "[Util][LinearAlgebra]", float, + double) { + SECTION("For real type") { + std::vector vec{0.0, 1.0, 3.0, 10.0}; + CHECK(Util::squaredNorm(vec) == Approx(110.0)); + } + + SECTION("For complex type") { + std::vector> vec{{0.0, 1.0}, {3.0, 10.0}}; + CHECK(Util::squaredNorm(vec) == Approx(110.0)); + } +} + +TEMPLATE_TEST_CASE("Util::scaleAndAdd", "[Util][LinearAlgebra]", float, + double) { + using PrecisionT = TestType; + using ComplexPrecisionT = std::complex; + + SECTION("Test result is correct") { + auto a = ComplexPrecisionT{0.36572644485147254, 0.4729529811649217}; + std::vector x{ + ComplexPrecisionT{0.481941495077, 0.734106237571}, + ComplexPrecisionT{0.960470937496, 0.880529982024}, + ComplexPrecisionT{0.135982489400, 0.049663856666}, + ComplexPrecisionT{0.589227566883, 0.646648171030}, + ComplexPrecisionT{0.051294350194, 0.013730433456}, + ComplexPrecisionT{0.716464613724, 0.296251370128}, + ComplexPrecisionT{0.820197028755, 0.199230854010}, + ComplexPrecisionT{0.100767632907, 0.745810000609}, + ComplexPrecisionT{0.603122469037, 0.437680494447}, + ComplexPrecisionT{0.815084269631, 0.501486284044}, + ComplexPrecisionT{0.554633849948, 0.437321144284}, + ComplexPrecisionT{0.822295519809, 0.810051588437}, + ComplexPrecisionT{0.217638951648, 0.663920104700}, + ComplexPrecisionT{0.289819402719, 0.839919161595}, + ComplexPrecisionT{0.498496405040, 0.906874924446}, + ComplexPrecisionT{0.365971064862, 0.230694150520}, + }; + std::vector y{ + ComplexPrecisionT{0.516438479285, 0.970319841313}, + ComplexPrecisionT{0.085702308539, 0.005302125762}, + ComplexPrecisionT{0.591955559108, 0.945946312721}, + ComplexPrecisionT{0.710102120659, 0.410003006045}, + ComplexPrecisionT{0.171020364152, 0.020935262021}, + ComplexPrecisionT{0.904267565256, 0.235752839391}, + ComplexPrecisionT{0.715111137847, 0.402137049186}, + ComplexPrecisionT{0.590485707389, 0.550485111898}, + ComplexPrecisionT{0.830734963458, 0.777755725832}, + ComplexPrecisionT{0.988885576027, 0.541038298049}, + ComplexPrecisionT{0.375479099161, 0.275849441779}, + ComplexPrecisionT{0.441329976617, 0.825285998539}, + ComplexPrecisionT{0.376823807696, 0.896094272876}, + ComplexPrecisionT{0.558768533750, 0.963077088666}, + ComplexPrecisionT{0.402000571969, 0.344065008137}, + ComplexPrecisionT{0.805773653517, 0.316132703093}, + }; + std::vector expected{ + ComplexPrecisionT{0.345499495355, 1.466737572567}, + ComplexPrecisionT{0.020522649889, 0.781592818884}, + ComplexPrecisionT{0.618199282452, 
1.028423022205}, + ComplexPrecisionT{0.619764043650, 0.925176277047}, + ComplexPrecisionT{0.183286215053, 0.050216660476}, + ComplexPrecisionT{1.026184652619, 0.682953874730}, + ComplexPrecisionT{0.920852054907, 0.862915671020}, + ComplexPrecisionT{0.274606032358, 0.870905904344}, + ComplexPrecisionT{0.844310505222, 1.223075626786}, + ComplexPrecisionT{1.049804015161, 1.109941629077}, + ComplexPrecisionT{0.371491026381, 0.698105081924}, + ComplexPrecisionT{0.358948880046, 1.510450403616}, + ComplexPrecisionT{0.142417134970, 1.241840403433}, + ComplexPrecisionT{0.267520882141, 1.407328688114}, + ComplexPrecisionT{0.155404690895, 0.911498511043}, + ComplexPrecisionT{0.830511463762, 0.573590760757}, + }; + Util::scaleAndAdd(a, x, y); + REQUIRE(y == approx(expected)); + } + SECTION("Throws exception when the size mismatches") { + std::vector x(8, ComplexPrecisionT{}); + std::vector y(4, ComplexPrecisionT{}); + + PL_REQUIRE_THROWS_MATCHES( + Util::scaleAndAdd(ComplexPrecisionT{0.5, 0.4}, x, y), + std::invalid_argument, "Dimensions of vectors mismatch"); + } + SECTION("omp_scaleAndAdd uses STD_CROSSOVER") { + std::vector x(32); + std::vector y(32); + REQUIRE_NOTHROW(Util::omp_scaleAndAdd( + 32, {1.0, 0.0}, x.data(), y.data())); } } diff --git a/pennylane_lightning/src/tests/Test_Observables.cpp b/pennylane_lightning/src/tests/Test_Observables.cpp new file mode 100644 index 0000000000..fb37dba954 --- /dev/null +++ b/pennylane_lightning/src/tests/Test_Observables.cpp @@ -0,0 +1,374 @@ +#include "Observables.hpp" +#include "TestHelpers.hpp" + +#include + +using namespace Pennylane; +using namespace Pennylane::Algorithms; +using Pennylane::Util::LightningException; + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEMPLATE_TEST_CASE("NamedObs", "[Observables]", float, double) { + using PrecisionT = TestType; + SECTION("NamedObs only accepts correct arguments") { + REQUIRE_THROWS_AS(NamedObs("PauliX", {}), LightningException); + REQUIRE_THROWS_AS(NamedObs("PauliX", {0, 3}), + LightningException); + + REQUIRE_THROWS_AS(NamedObs("RX", {0}), LightningException); + REQUIRE_THROWS_AS(NamedObs("RX", {0, 1, 2, 3}), + LightningException); + REQUIRE_THROWS_AS( + NamedObs("RX", {0}, std::vector{0.3, 0.4}), + LightningException); + REQUIRE_NOTHROW(NamedObs( + "Rot", {0}, std::vector{0.3, 0.4, 0.5})); + } + + SECTION("Named of the Observable must be correct") { + REQUIRE(NamedObs("PauliZ", {0}).getObsName() == "PauliZ[0]"); + } + + SECTION("Objects with different names") { + auto ob1 = NamedObs("PauliX", {0}); + auto ob2 = NamedObs("PauliX", {0}); + auto ob3 = NamedObs("PauliZ", {0}); + + REQUIRE(ob1 == ob2); + REQUIRE(ob2 != ob3); + REQUIRE(ob1 != ob3); + } + + SECTION("Objects with different wires") { + auto ob1 = NamedObs("PauliY", {0}); + auto ob2 = NamedObs("PauliY", {0}); + auto ob3 = NamedObs("PauliY", {1}); + + REQUIRE(ob1 == ob2); + REQUIRE(ob2 != ob3); + REQUIRE(ob1 != ob3); + } + + SECTION("Objects with different parameters") { + auto ob1 = NamedObs("RZ", {0}, {0.4}); + auto ob2 = NamedObs("RZ", {0}, {0.4}); + auto ob3 = NamedObs("RZ", {0}, {0.1}); + + REQUIRE(ob1 == ob2); + REQUIRE(ob2 != ob3); + REQUIRE(ob1 != ob3); + } +} + +TEMPLATE_TEST_CASE("HermitianObs", "[Observables]", float, double) { + using PrecisionT = TestType; + using ComplexPrecisionT = std::complex; + SECTION("HermitianObs only accepts correct arguments") { + auto ob1 = HermitianObs{ + std::vector{0.0, 0.0, 0.0, 0.0}, {0}}; + auto ob2 = HermitianObs{ + std::vector(16, ComplexPrecisionT{}), {0, 1}}; + REQUIRE_THROWS_AS( 
+ HermitianObs( + std::vector{0.0, 0.0, 0.0}, {0}), + LightningException); + REQUIRE_THROWS_AS( + HermitianObs( + std::vector{0.0, 0.0, 0.0, 0.0, 0.0}, + {0, 1}), + LightningException); + } + SECTION("getObsName") { + REQUIRE(HermitianObs( + std::vector{1.0, 0.0, 2.0, 0.0}, {0}) + .getObsName() == "Hermitian"); + } + SECTION("Objects with different matrices") { + auto ob1 = HermitianObs{ + std::vector{1.0, 0.0, 0.0, 0.0}, {0}}; + auto ob2 = HermitianObs{ + std::vector{1.0, 0.0, 0.0, 0.0}, {0}}; + auto ob3 = HermitianObs{ + std::vector{0.0, 1.0, 0.0, 0.0}, {0}}; + REQUIRE(ob1 == ob2); + REQUIRE(ob1 != ob3); + REQUIRE(ob2 != ob3); + } + SECTION("Objects with different wires") { + auto ob1 = HermitianObs{ + std::vector{1.0, 0.0, -1.0, 0.0}, {0}}; + auto ob2 = HermitianObs{ + std::vector{1.0, 0.0, -1.0, 0.0}, {0}}; + auto ob3 = HermitianObs{ + std::vector{1.0, 0.0, -1.0, 0.0}, {1}}; + REQUIRE(ob1 == ob2); + REQUIRE(ob1 != ob3); + REQUIRE(ob2 != ob3); + } +} + +TEMPLATE_TEST_CASE("TensorProdObs", "[Observables]", float, double) { + using PrecisionT = TestType; + using ComplexPrecisionT = std::complex; + + SECTION("Overlapping wires throw an exception") { + auto ob1 = std::make_shared>( + std::vector(16, ComplexPrecisionT{0.0, 0.0}), + std::vector{0, 1}); + auto ob2_1 = std::make_shared>( + "PauliX", std::vector{1}); + auto ob2_2 = std::make_shared>( + "PauliZ", std::vector{2}); + auto ob2 = TensorProdObs::create({ob2_1, ob2_2}); + + REQUIRE_THROWS_AS(TensorProdObs::create({ob1, ob2}), + LightningException); + } + + SECTION("Can construct an observable with non-overlapping wires") { + auto ob1 = std::make_shared>( + std::vector(16, ComplexPrecisionT{0.0, 0.0}), + std::vector{0, 1}); + auto ob2_1 = std::make_shared>( + "PauliX", std::vector{2}); + auto ob2_2 = std::make_shared>( + "PauliZ", std::vector{3}); + auto ob2 = TensorProdObs::create({ob2_1, ob2_2}); + + REQUIRE_NOTHROW(TensorProdObs::create({ob1, ob2})); + } + + SECTION("getObsName") { + auto ob = + TensorProdObs(std::make_shared>( + "PauliX", std::vector{0}), + std::make_shared>( + "PauliZ", std::vector{1})); + REQUIRE(ob.getObsName() == "PauliX[0] @ PauliZ[1]"); + } + + SECTION("Compare two tensor product observables") { + auto ob1 = + TensorProdObs{std::make_shared>( + "PauliX", std::vector{0}), + std::make_shared>( + "PauliZ", std::vector{1})}; + auto ob2 = + TensorProdObs{std::make_shared>( + "PauliX", std::vector{0}), + std::make_shared>( + "PauliZ", std::vector{1})}; + auto ob3 = + TensorProdObs{std::make_shared>( + "PauliX", std::vector{0}), + std::make_shared>( + "PauliZ", std::vector{2})}; + auto ob4 = + TensorProdObs{std::make_shared>( + "PauliZ", std::vector{0}), + std::make_shared>( + "PauliZ", std::vector{1})}; + + auto ob5 = + TensorProdObs{std::make_shared>( + "PauliZ", std::vector{0})}; + + REQUIRE(ob1 == ob2); + REQUIRE(ob1 != ob3); + REQUIRE(ob1 != ob4); + REQUIRE(ob1 != ob5); + } + + SECTION("Tensor product applies to a statevector correctly") { + auto ob = TensorProdObs{ + std::make_shared>("PauliX", + std::vector{0}), + std::make_shared>("PauliX", + std::vector{2}), + }; + + SECTION("Test using |1+0>") { + constexpr auto num_qubits = size_t{3}; + StateVectorManagedCPU sv(num_qubits); + + sv.updateData(createProductState("1+0")); + ob.applyInPlace(sv); + + REQUIRE(sv.getDataVector() == + PLApprox(createProductState("0+1"))); + } + + SECTION("Test using |+-01>") { + constexpr auto num_qubits = size_t{4}; + StateVectorManagedCPU sv(num_qubits); + + sv.updateData(createProductState("+-01")); + ob.applyInPlace(sv); + + 
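+ // The tensor product acts as X on wires 0 and 2: the |+> and |-> factors
+ // are left unchanged and wire 2 is flipped, so |+-01> becomes |+-11>.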
REQUIRE(sv.getDataVector() == + PLApprox(createProductState("+-11"))); + } + } +} + +TEMPLATE_TEST_CASE("Hamiltonian", "[Observables]", float, double) { + using PrecisionT = TestType; + using ComplexPrecisionT = std::complex; + + const auto h = PrecisionT{0.809}; // half of the golden ratio + + auto zz = std::make_shared>( + std::make_shared>("PauliZ", + std::vector{0}), + std::make_shared>("PauliZ", + std::vector{1})); + + auto x1 = std::make_shared>("PauliX", + std::vector{0}); + auto x2 = std::make_shared>("PauliX", + std::vector{1}); + + SECTION("Hamiltonian constructor only accepts valid arguments") { + REQUIRE_NOTHROW(Hamiltonian::create({PrecisionT{1.0}, h, h}, + {zz, x1, x2})); + + REQUIRE_THROWS_AS( + Hamiltonian::create({PrecisionT{1.0}, h}, {zz, x1, x2}), + LightningException); + } + + SECTION("getObsName") { + auto X0 = std::make_shared>( + "PauliX", std::vector{0}); + auto Z2 = std::make_shared>( + "PauliZ", std::vector{2}); + + REQUIRE(Hamiltonian::create({0.3, 0.5}, {X0, Z2}) + ->getObsName() == + "Hamiltonian: { 'coeffs' : [0.3, 0.5], " + "'observables' : [PauliX[0], PauliZ[2]]}"); + } + + SECTION("Compare Hamiltonians") { + auto X0 = std::make_shared>( + "PauliX", std::vector{0}); + auto X1 = std::make_shared>( + "PauliX", std::vector{1}); + auto X2 = std::make_shared>( + "PauliX", std::vector{2}); + + auto Y0 = std::make_shared>( + "PauliY", std::vector{0}); + auto Y1 = std::make_shared>( + "PauliY", std::vector{1}); + auto Y2 = std::make_shared>( + "PauliY", std::vector{2}); + + auto Z0 = std::make_shared>( + "PauliZ", std::vector{0}); + auto Z1 = std::make_shared>( + "PauliZ", std::vector{1}); + auto Z2 = std::make_shared>( + "PauliZ", std::vector{2}); + + auto ham1 = Hamiltonian::create( + {0.8, 0.5, 0.7}, + { + std::make_shared>(X0, Y1, Z2), + std::make_shared>(Z0, X1, Y2), + std::make_shared>(Y0, Z1, X2), + }); + + auto ham2 = Hamiltonian::create( + {0.8, 0.5, 0.7}, + { + std::make_shared>(X0, Y1, Z2), + std::make_shared>(Z0, X1, Y2), + std::make_shared>(Y0, Z1, X2), + }); + + auto ham3 = Hamiltonian::create( + {0.8, 0.5, 0.642}, + { + std::make_shared>(X0, Y1, Z2), + std::make_shared>(Z0, X1, Y2), + std::make_shared>(Y0, Z1, X2), + }); + + auto ham4 = Hamiltonian::create( + {0.8, 0.5}, + { + std::make_shared>(X0, Y1, Z2), + std::make_shared>(Z0, X1, Y2), + }); + + auto ham5 = Hamiltonian::create( + {0.8, 0.5, 0.7}, + { + std::make_shared>(X0, Y1, Z2), + std::make_shared>(Z0, X1, Y2), + std::make_shared>(Y0, Z1, Y2), + }); + + REQUIRE(*ham1 == *ham2); + REQUIRE(*ham1 != *ham3); + REQUIRE(*ham2 != *ham3); + REQUIRE(*ham2 != *ham4); + REQUIRE(*ham1 != *ham5); + } + + SECTION("Hamiltonian::applyInPlace") { + auto ham = Hamiltonian::create({PrecisionT{1.0}, h, h}, + {zz, x1, x2}); + + SECTION(" to |+->") { + constexpr auto num_qubits = size_t{2}; + StateVectorManagedCPU sv(num_qubits); + + sv.updateData(createProductState("+-")); + + ham->applyInPlace(sv); + + auto expected = std::vector{ + ComplexPrecisionT{0.5, 0.0}, + ComplexPrecisionT{0.5, 0.0}, + ComplexPrecisionT{-0.5, 0.0}, + ComplexPrecisionT{-0.5, 0.0}, + }; + + REQUIRE(sv.getDataVector() == PLApprox(expected)); + } + + SECTION("Hamiltonian applies correctly to |01>") { + constexpr auto num_qubits = size_t{2}; + StateVectorManagedCPU sv(num_qubits); + + sv.updateData(createProductState("01")); + + ham->applyInPlace(sv); + + auto expected = std::vector{ + ComplexPrecisionT{h, 0.0}, + ComplexPrecisionT{-1.0, 0.0}, + ComplexPrecisionT{0.0, 0.0}, + ComplexPrecisionT{h, 0.0}, + }; + + REQUIRE(sv.getDataVector() == 
PLApprox(expected)); + } + } + + SECTION("getWires") { + auto Z0 = std::make_shared>( + "PauliZ", std::vector{0}); + auto Z5 = std::make_shared>( + "PauliZ", std::vector{5}); + auto Z9 = std::make_shared>( + "PauliZ", std::vector{9}); + + auto ham1 = + Hamiltonian::create({0.8, 0.5, 0.7}, {Z0, Z5, Z9}); + + REQUIRE(ham1->getWires() == std::vector{0, 5, 9}); + } +} diff --git a/pennylane_lightning/src/tests/Test_StateVecAdjDiff.cpp b/pennylane_lightning/src/tests/Test_StateVecAdjDiff.cpp new file mode 100644 index 0000000000..40d4bff3a7 --- /dev/null +++ b/pennylane_lightning/src/tests/Test_StateVecAdjDiff.cpp @@ -0,0 +1,301 @@ +#include "AdjointDiff.hpp" +#include "AlgUtil.hpp" +#include "Constant.hpp" +#include "GateOperation.hpp" +#include "StateVecAdjDiff.hpp" +#include "Util.hpp" + +#include "TestHelpers.hpp" + +#include + +#include +#include + +using namespace Pennylane; +using namespace Pennylane::Util; +using namespace Pennylane::Algorithms; + +/** + * @brief Create a random sequence of RX/RY/RZ rotations. + * + * @param re Random number engine used to draw gates, angles, and inverses + * @param length Size of the gate sequence + * @param wires Number of wires the random operations may act on + */ +template +auto createRandomOps(RandomEngine &re, size_t length, size_t wires) + -> OpsData { + using namespace Pennylane::Gates; + + std::array gates_to_use = {GateOperation::RX, GateOperation::RY, + GateOperation::RZ}; + + std::vector ops_names; + std::vector> ops_params; + std::vector> ops_wires; + std::vector ops_inverses; + + std::uniform_int_distribution gate_dist(0, gates_to_use.size() - 1); + std::uniform_real_distribution param_dist(0.0, 2 * M_PI); + std::uniform_int_distribution inverse_dist(0, 1); + + for (size_t i = 0; i < length; i++) { + const auto gate_op = gates_to_use[gate_dist(re)]; + const auto gate_name = + Util::lookup(Gates::Constant::gate_names, gate_op); + ops_names.emplace_back(gate_name); + ops_params.emplace_back(std::vector{param_dist(re)}); + ops_inverses.emplace_back(inverse_dist(re)); + ops_wires.emplace_back(createWires(gate_op, wires)); + } + + return {ops_names, ops_params, ops_wires, ops_inverses, {{}}}; +} + +// NOLINTNEXTLINE(readability-function-cognitive-complexity) +TEMPLATE_TEST_CASE("StateVector VJP", "[Test_StateVecAdjDiff]", float, double) { + using std::cos; + using std::sin; + using std::sqrt; + + using PrecisionT = TestType; + using ComplexPrecisionT = std::complex; + + constexpr static auto isqrt2 = INVSQRT2(); + + SECTION("Do nothing if the tape does not have trainable parameters") { + std::vector vjp(1); + OpsData ops_data{ + {"CNOT", "RX"}, // names + {{}, {M_PI / 7}}, // params + {{0, 1}, {1}}, // wires + {false, false}, // inverses + {} // matrices + }; + + auto dy = std::vector(4); + std::vector ini_st{ + {isqrt2, 0.0}, {0.0, 0.0}, {isqrt2, 0.0}, {0.0, 0.0}}; + JacobianData jd{1, 4, ini_st.data(), {}, ops_data, {}}; + REQUIRE_NOTHROW(statevectorVJP( + std::span{vjp}, jd, std::span{dy}, true)); + } + + SECTION("CNOT RX1") { + const PrecisionT theta = std::numbers::pi_v / 7; + OpsData ops_data{ + {"CNOT", "RX"}, // names + {{}, {theta}}, // params + {{0, 1}, {1}}, // wires + {false, false}, // inverses + {} // matrices + }; + + auto dy = std::vector(4); + + std::vector> expected = { + {{-isqrt2 / PrecisionT{2.0} * sin(theta / 2), 0.0}}, + {{0.0, -isqrt2 / PrecisionT{2.0} * cos(theta / 2)}}, + {{0.0, -isqrt2 / PrecisionT{2.0} * cos(theta / 2)}}, + {{-isqrt2 / PrecisionT{2.0} * sin(theta / 2), 0.0}}, + }; + + SECTION("with apply_operations = true") { + std::vector ini_st{ + {isqrt2, 0.0}, {0.0, 0.0}, {isqrt2, 0.0}, {0.0, 0.0}}; + JacobianData jd{1, 4, ini_st.data(), {}, ops_data, {0}}; +
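+ // Sweep dy over the four computational basis vectors; each pass
+ // probes the derivative of a single output amplitude.
+ for (size_t i = 0; i < 4; 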
i++) { + std::fill(dy.begin(), dy.end(), ComplexPrecisionT{0.0, 0.0}); + dy[i] = {1.0, 0.0}; + std::vector vjp(1); + statevectorVJP(std::span{vjp}, jd, + std::span{dy}, true); + + REQUIRE(vjp == PLApprox(expected[i]).margin(1e-5)); + } + } + + SECTION("with apply_operations = false") { + std::vector> final_st{ + {cos(theta / 2) * isqrt2, 0.0}, + {0.0, -isqrt2 * sin(theta / 2)}, + {0.0, -isqrt2 * sin(theta / 2)}, + {cos(theta / 2) * isqrt2, 0.0}}; + JacobianData jd{1, 4, final_st.data(), {}, ops_data, {0}}; + + for (size_t i = 0; i < 4; i++) { + std::fill(dy.begin(), dy.end(), + std::complex{0.0, 0.0}); + dy[i] = {1.0, 0.0}; + std::vector vjp(1); + statevectorVJP(std::span{vjp}, jd, + std::span{dy}, false); + + REQUIRE(vjp == PLApprox(expected[i]).margin(1e-5)); + } + } + } + + SECTION("CNOT0,1 RX1 CNOT1,0 RX0 CNOT0,1 RX1 CNOT1,0 RX0") { + std::vector> ini_st{ + {isqrt2, 0.0}, {0.0, 0.0}, {isqrt2, 0.0}, {0.0, 0.0}}; + + OpsData ops_data{ + {"CNOT", "RX", "CNOT", "RX", "CNOT", "RX", "CNOT", "RX"}, // names + {{}, {M_PI}, {}, {M_PI}, {}, {M_PI}, {}, {M_PI}}, // params + {{0, 1}, {1}, {1, 0}, {0}, {0, 1}, {1}, {1, 0}, {0}}, // wires + {false, false, false, false, false, false, false, + false}, // inverses + {} // matrices + }; + + std::vector> expected_der0 = { + {0.0, -isqrt2 / 2.0}, + {0.0, 0.0}, + {0.0, 0.0}, + {0.0, -isqrt2 / 2.0}, + }; // For trainable_param == 0 + std::vector> expected_der1 = { + {0.0, 0.0}, + {0.0, -isqrt2 / 2.0}, + {0.0, -isqrt2 / 2.0}, + {0.0, 0.0}, + }; // For trainable_param == 1 + + SECTION("with apply_operations = true") { + std::vector> ini_st{ + {isqrt2, 0.0}, {0.0, 0.0}, {isqrt2, 0.0}, {0.0, 0.0}}; + + JacobianData jd{ + 1, 4, ini_st.data(), {}, ops_data, {1, 2} // trainable params + }; + + auto dy = std::vector>(4); + + for (size_t i = 0; i < 4; i++) { + std::fill(dy.begin(), dy.end(), + std::complex{0.0, 0.0}); + dy[i] = {1.0, 0.0}; + std::vector vjp(2); + statevectorVJP(std::span{vjp}, jd, + std::span{dy}, true); + + REQUIRE(vjp[0] == approx(expected_der0[i]).margin(1e-5)); + REQUIRE(vjp[1] == approx(expected_der1[i]).margin(1e-5)); + } + } + + SECTION("with apply_operations = false") { + std::vector> final_st{ + {0.0, 0.0}, {isqrt2, 0.0}, {isqrt2, 0.0}, {0.0, 0.0}}; + + JacobianData jd{ + 4, 4, final_st.data(), {}, ops_data, {1, 2} // trainable params + }; + + auto dy = std::vector>(4); + + for (size_t i = 0; i < 4; i++) { + std::fill(dy.begin(), dy.end(), + std::complex{0.0, 0.0}); + dy[i] = {1.0, 0.0}; + std::vector vjp(2); + statevectorVJP(std::span{vjp}, jd, + std::span{dy}, false); + + REQUIRE(vjp[0] == approx(expected_der0[i]).margin(1e-5)); + REQUIRE(vjp[1] == approx(expected_der1[i]).margin(1e-5)); + } + } + } + + SECTION("Test complex dy") { + OpsData ops_data1{ + {"CNOT", "RX"}, // names + {{}, {M_PI / 7}}, // params + {{0, 1}, {1}}, // wires + {false, false}, // inverses + {} // matrices + }; + + auto dy1 = std::vector{ + {0.4, 0.4}, {0.4, 0.4}, {0.4, 0.4}, {0.4, 0.4}}; + + OpsData ops_data2{ + {"CNOT", "RX"}, // names + {{}, {-M_PI / 7}}, // params + {{0, 1}, {1}}, // wires + {false, false}, // inverses + {} // matrices + }; + + auto dy2 = std::vector{ + {0.4, -0.4}, {0.4, -0.4}, {0.4, -0.4}, {0.4, -0.4}}; + std::vector ini_st{ + {isqrt2, 0.0}, {0.0, 0.0}, {isqrt2, 0.0}, {0.0, 0.0}}; + + JacobianData jd1{1, 4, ini_st.data(), {}, ops_data1, {0}}; + JacobianData jd2{1, 4, ini_st.data(), {}, ops_data2, {0}}; + + std::vector vjp1(1); + std::vector vjp2(1); + + statevectorVJP(std::span{vjp1}, jd1, + std::span{dy1}, true); + + 
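+ // The second circuit negates the rotation angle and dy is conjugated;
+ // the resulting VJP entry should equal minus the conjugate of the first.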
statevectorVJP(std::span{vjp2}, jd2, + std::span{dy2}, true); + + REQUIRE(vjp1[0] == approx(-std::conj(vjp2[0]))); + } + + SECTION( + "Check the result is consistent with adjoint diff with observables") { + std::mt19937 re{1337}; + auto ops_data = createRandomOps(re, 10, 3); + auto obs = std::make_shared>("PauliZ", + std::vector{0}); + + const size_t num_params = [&]() { + size_t r = 0; + for (const auto &ops_params : ops_data.getOpsParams()) { + if (!ops_params.empty()) { + ++r; + } + } + return r; + }(); + + std::vector trainable_params(num_params); + std::iota(trainable_params.begin(), trainable_params.end(), 0); + + const auto ini_st = createProductState("+++"); + + StateVectorManagedCPU sv(ini_st.data(), ini_st.size()); + applyOperations(sv, ops_data); + JacobianData jd{ + num_params, 8, sv.getDataVector().data(), + {obs}, ops_data, trainable_params}; + + auto o_sv = sv; + applyObservable(o_sv, *obs); + + std::vector grad_vjp = [&]() { + std::vector vjp(num_params); + statevectorVJP( + std::span{vjp}, jd, + std::span{o_sv.getDataVector()}, + false); + std::vector res(vjp.size()); + std::transform(vjp.begin(), vjp.end(), res.begin(), + [](const auto &x) { return 2 * std::real(x); }); + return res; + }(); + + std::vector jac(num_params); + adjointJacobian(std::span{jac}, jd); + + REQUIRE(grad_vjp == PLApprox(jac).margin(1e-5)); + } +} diff --git a/pennylane_lightning/src/tests/Test_StateVectorRawCPU.cpp b/pennylane_lightning/src/tests/Test_StateVectorRawCPU.cpp index af7db4f80c..9b8ef8df66 100644 --- a/pennylane_lightning/src/tests/Test_StateVectorRawCPU.cpp +++ b/pennylane_lightning/src/tests/Test_StateVectorRawCPU.cpp @@ -36,24 +36,44 @@ TEMPLATE_TEST_CASE("StateVectorRawCPU::setData", "[StateVectorRawCPU]", float, double) { using PrecisionT = TestType; - SECTION("setData correctly update data") { + SECTION("changeDataPtr correctly updates data") { auto st_data = createRandomState(re, 4); StateVectorRawCPU sv(st_data.data(), st_data.size()); auto st_data2 = createRandomState(re, 8); - sv.setData(st_data2.data(), st_data2.size()); + sv.changeDataPtr(st_data2.data(), st_data2.size()); REQUIRE(sv.getNumQubits() == 8); REQUIRE(sv.getData() == st_data2.data()); REQUIRE(sv.getLength() == (1U << 8U)); } - SECTION("setData throws an exception when the data is incorrect") { + SECTION("changeDataPtr throws an exception when the data is incorrect") { auto st_data = createRandomState(re, 4); StateVectorRawCPU sv(st_data.data(), st_data.size()); std::vector> new_data(7, PrecisionT{0.0}); - REQUIRE_THROWS(sv.setData(new_data.data(), new_data.size())); + REQUIRE_THROWS_AS(sv.changeDataPtr(new_data.data(), new_data.size()), + Util::LightningException); + } + + SECTION("setDataFrom correctly updates data") { + auto st_data1 = createRandomState(re, 4); + auto st_data2 = createRandomState(re, 4); + StateVectorRawCPU sv(st_data1.data(), st_data1.size()); + + sv.setDataFrom(st_data2.data(), + st_data2.size()); // Should update st_data1 + REQUIRE(st_data1 == st_data2); + } + + SECTION("setDataFrom throws an exception when the data is incorrect") { + auto st_data1 = createRandomState(re, 4); + auto st_data2 = createRandomState(re, 8); + StateVectorRawCPU sv(st_data1.data(), st_data1.size()); + + REQUIRE_THROWS_AS(sv.setDataFrom(st_data2.data(), st_data2.size()), + Util::LightningException); } } diff --git a/pennylane_lightning/src/util/ConstantUtil.hpp b/pennylane_lightning/src/util/ConstantUtil.hpp index 03a49bb96d..4feb9aa1e0 100644 --- a/pennylane_lightning/src/util/ConstantUtil.hpp +++ 
b/pennylane_lightning/src/util/ConstantUtil.hpp @@ -201,7 +201,7 @@ constexpr auto prepend_to_tuple(T &&elt, Tuple &&t) { * @param tuple Tuple to transform */ template constexpr auto tuple_to_array(Tuple &&tuple) { - using T = std::tuple_element_t<0, remove_cvref_t>; + using T = std::tuple_element_t<0, std::remove_cvref_t>; return std::apply( [](auto... n) { return std::array{n...}; }, std::forward(tuple)); diff --git a/pennylane_lightning/src/util/Error.hpp b/pennylane_lightning/src/util/Error.hpp index 6c3c2eab64..9521b120f3 100644 --- a/pennylane_lightning/src/util/Error.hpp +++ b/pennylane_lightning/src/util/Error.hpp @@ -10,6 +10,8 @@ // limitations under the License. #pragma once +#include "Util.hpp" + #include #include #include @@ -17,8 +19,6 @@ #include #include -#include "Util.hpp" - /** * @brief Macro that throws `%LightningException` with given message. * diff --git a/pennylane_lightning/src/util/LinearAlgebra.hpp b/pennylane_lightning/src/util/LinearAlgebra.hpp index 4ef49f4c39..81674aecf7 100644 --- a/pennylane_lightning/src/util/LinearAlgebra.hpp +++ b/pennylane_lightning/src/util/LinearAlgebra.hpp @@ -17,6 +17,7 @@ */ #pragma once +#include "Macros.hpp" #include "TypeTraits.hpp" #include "Util.hpp" @@ -25,6 +26,7 @@ #include #include #include +#include #include /// @cond DEV @@ -60,7 +62,6 @@ enum class Trans : int { Transpose = CblasTrans, Adjoint = CblasConjTrans }; - /** * @brief Calculates the inner-product using OpenMP. * @@ -343,8 +344,8 @@ inline void matrixVecProd(const std::complex *mat, * nthreads = 1, bool transpose = false) */ template -inline auto matrixVecProd(const std::vector> mat, - const std::vector> v_in, size_t m, +inline auto matrixVecProd(const std::vector> &mat, + const std::vector> &v_in, size_t m, size_t n, Trans transpose = Trans::NoTranspose) -> std::vector> { if (mat.size() != m * n) { @@ -467,15 +468,16 @@ inline static void CFTranspose(const std::complex *mat, * @param n Number of columns of `mat`. * @return mat transpose of shape n * m. */ -template -inline auto Transpose(const std::vector, Alloc> &mat, size_t m, - size_t n) -> std::vector, Alloc> { +template > +inline auto Transpose(std::span mat, size_t m, size_t n, + Allocator allocator = std::allocator()) + -> std::vector { if (mat.size() != m * n) { throw std::invalid_argument( "Invalid number of rows and columns for the input matrix"); } - std::vector, Alloc> mat_t(n * m, mat.get_allocator()); + std::vector mat_t(n * m, allocator); CFTranspose(mat.data(), mat_t.data(), m, n, 0, m, 0, n); return mat_t; } @@ -484,23 +486,19 @@ inline auto Transpose(const std::vector, Alloc> &mat, size_t m, * @brief Transpose a matrix of shape m * n to n * m using the * best available method. * + * This version may be merged with the above one when std::ranges is well + * supported. + * * @tparam T Floating point precision type. * @param mat Row-wise flatten matrix of shape m * n. * @param m Number of rows of `mat`. * @param n Number of columns of `mat`. * @return mat transpose of shape n * m. 
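+ *
+ * A minimal usage sketch (illustrative values only):
+ * @code
+ * std::vector<float> mat{1, 2, 3, 4, 5, 6}; // 2 x 3, row-major
+ * auto mat_t = Transpose(mat, 2, 3);        // 3 x 2
+ * @endcode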
*/ -template -inline auto Transpose(const std::vector &mat, size_t m, size_t n) - -> std::vector { - if (mat.size() != m * n) { - throw std::invalid_argument( - "Invalid number of rows and columns for the input matrix"); - } - - std::vector mat_t(n * m, mat.get_allocator()); - CFTranspose(mat.data(), mat_t.data(), m, n, 0, m, 0, n); - return mat_t; +template +inline auto Transpose(const std::vector &mat, size_t m, size_t n) + -> std::vector { + return Transpose(std::span{mat}, m, n, mat.get_allocator()); } /** @@ -554,10 +552,10 @@ inline void vecMatrixProd(const T *v_in, const T *mat, T *v_out, size_t m, * @see inline void vecMatrixProd(const T *v_in, * const T *mat, T *v_out, size_t m, size_t n) */ -template -inline auto vecMatrixProd(const std::vector &v_in, - const std::vector &mat, size_t m, size_t n) - -> std::vector { +template +inline auto vecMatrixProd(const std::vector &v_in, + const std::vector &mat, size_t m, size_t n) + -> std::vector { if (v_in.size() != m) { throw std::invalid_argument("Invalid size for the input vector"); } @@ -566,7 +564,7 @@ inline auto vecMatrixProd(const std::vector &v_in, "Invalid number of rows and columns for the input matrix"); } - std::vector v_out(n, mat.get_allocator()); + std::vector v_out(n); vecMatrixProd(v_in.data(), mat.data(), v_out.data(), m, n); return v_out; @@ -578,9 +576,10 @@ inline auto vecMatrixProd(const std::vector &v_in, * @see inline void vecMatrixProd(const T *v_in, const T *mat, T *v_out, size_t * m, size_t n) */ -template -inline void vecMatrixProd(std::vector &v_out, const std::vector &v_in, - const std::vector &mat, size_t m, size_t n) { +template +inline void +vecMatrixProd(std::vector &v_out, const std::vector &v_in, + const std::vector &mat, size_t m, size_t n) { if (mat.size() != m * n) { throw std::invalid_argument( "Invalid number of rows and columns for the input matrix"); @@ -715,6 +714,11 @@ inline void matrixMatProd(const std::complex *m_left, cblas_zgemm(CblasRowMajor, CblasNoTrans, tr, m, n, k, &co, m_left, k, m_right, (transpose != Trans::NoTranspose) ? k : n, &cz, m_out, n); + } else { + static_assert( + std::is_same_v || std::is_same_v, + "This procedure only supports single- or double-precision " + "floating point types."); } } else { omp_matrixMatProd(m_left, m_right, m_out, m, n, k, transpose); @@ -846,4 +850,102 @@ auto randomUnitary(RandomEngine &re, size_t num_qubits) } return res; } + +/** + * @brief @rst + * Calculate :math:`y += a*x` for a scalar :math:`a` and a vector :math:`x` + * using OpenMP. + * @endrst + * + * @tparam STD_CROSSOVER The dimension above which the OpenMP version + * outperforms the standard method. + * + * @param dim Dimension of data + * @param a Scalar to scale x + * @param x Vector to add + * @param y Vector to be added + */ +template // NOLINT(readability-magic-numbers) +void omp_scaleAndAdd(size_t dim, std::complex a, const std::complex *x, + std::complex *y) { + if (dim < STD_CROSSOVER) { + for (size_t i = 0; i < dim; i++) { + y[i] += a * x[i]; + } + } else { +#if defined(_OPENMP) +#pragma omp parallel for default(none) firstprivate(a, dim, x, y) +#endif + for (size_t i = 0; i < dim; i++) { + y[i] += a * x[i]; + } + } +} + +/** + * @brief @rst + * Calculate :math:`y += a*x` for a scalar :math:`a` and a vector :math:`x` + * using BLAS. 
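+ * The call is forwarded to cblas_caxpy or cblas_zaxpy below.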
+ * @endrst
+ *
+ * @param dim Dimension of data
+ * @param a Scalar to scale x
+ * @param x Vector to add
+ * @param y Vector to be added
+ */
+template <class T>
+void blas_scaleAndAdd(size_t dim, std::complex<T> a, const std::complex<T> *x,
+                      std::complex<T> *y) {
+    if constexpr (std::is_same_v<T, float>) {
+        cblas_caxpy(dim, &a, x, 1, y, 1);
+    } else if (std::is_same_v<T, double>) {
+        cblas_zaxpy(dim, &a, x, 1, y, 1);
+    } else {
+        static_assert(
+            std::is_same_v<T, float> || std::is_same_v<T, double>,
+            "This procedure only supports single- and double-precision "
+            "floating point types.");
+    }
+}
+
+/**
+ * @brief @rst
+ * Calculate :math:`y += a*x` for a scalar :math:`a` and a vector :math:`x`
+ * using the best available method.
+ * @endrst
+ *
+ * @param dim Dimension of data
+ * @param a Scalar to scale x
+ * @param x Vector to add
+ * @param y Vector to be added
+ */
+template <class T>
+void scaleAndAdd(size_t dim, std::complex<T> a, const std::complex<T> *x,
+                 std::complex<T> *y) {
+    if constexpr (USE_CBLAS) {
+        blas_scaleAndAdd(dim, a, x, y);
+    } else {
+        omp_scaleAndAdd(dim, a, x, y);
+    }
+}
+/**
+ * @brief @rst
+ * Calculate :math:`y += a*x` for a scalar :math:`a` and a vector :math:`x`.
+ * @endrst
+ *
+ * @param dim Dimension of data
+ * @param a Scalar to scale x
+ * @param x Vector to add
+ * @param y Vector to be added
+ */
+template <class T>
+void scaleAndAdd(std::complex<T> a, const std::vector<std::complex<T>> &x,
+                 std::vector<std::complex<T>> &y) {
+    if (x.size() != y.size()) {
+        throw std::invalid_argument("Dimensions of vectors mismatch");
+    }
+    scaleAndAdd(x.size(), a, x.data(), y.data());
+}
 } // namespace Pennylane::Util
diff --git a/pennylane_lightning/src/util/Memory.hpp b/pennylane_lightning/src/util/Memory.hpp
index b86797a40a..8f15c38890 100644
--- a/pennylane_lightning/src/util/Memory.hpp
+++ b/pennylane_lightning/src/util/Memory.hpp
@@ -67,6 +67,9 @@ inline void alignedFree(void *p) {
 /**
  * @brief C++ Allocator class for aligned memory.
  *
+ * C++17's std::pmr::polymorphic_allocator could replace this whole class.
+ * However, clang (even the latest version, 13) does not support pmr yet.
+ *
  * @tparam T Datatype to allocate
  */
 template <class T> class AlignedAllocator {
@@ -107,7 +110,7 @@ template <class T> class AlignedAllocator {
      * @param size The number of T objects for the allocation
      * @return Allocated aligned memory
      */
-    [[nodiscard]] T *allocate(std::size_t size) {
+    [[nodiscard]] T *allocate(std::size_t size) const {
         if (size == 0) {
             return nullptr;
         }
@@ -130,7 +133,7 @@ template <class T> class AlignedAllocator {
      * @param p Pointer to the allocated data
      * @param size Size of the data we allocated (unused).
     */
-    void deallocate(T *p, [[maybe_unused]] std::size_t size) noexcept {
+    void deallocate(T *p, [[maybe_unused]] std::size_t size) const noexcept {
         if (alignment_ > alignof(std::max_align_t)) {
             alignedFree(p);
         } else {
diff --git a/pennylane_lightning/src/util/TypeTraits.hpp b/pennylane_lightning/src/util/TypeTraits.hpp
index cc10f3ef11..c1372fddea 100644
--- a/pennylane_lightning/src/util/TypeTraits.hpp
+++ b/pennylane_lightning/src/util/TypeTraits.hpp
@@ -18,13 +18,8 @@
 #pragma once
 #include <complex>
 #include <type_traits>
+
 namespace Pennylane::Util {
-// Enable until C++20 support is explicitly allowed
-template <typename T> struct remove_cvref {
-    using type = std::remove_cv_t<std::remove_reference_t<T>>;
-};
-// type alias
-template <typename T> using remove_cvref_t = typename remove_cvref<T>::type;
 template <typename T> struct remove_complex { using type = T; };
 template <typename T> struct remove_complex<std::complex<T>> {
     using type = T;
diff --git a/tests/test_adjoint_jacobian.py b/tests/test_adjoint_jacobian.py
index 77182245ab..2d0daee7e4 100644
--- a/tests/test_adjoint_jacobian.py
+++ b/tests/test_adjoint_jacobian.py
@@ -69,7 +69,7 @@ class TestAdjointJacobian:
     @pytest.fixture(params=[np.complex64, np.complex128])
     def dev(self, request):
-        return qml.device("lightning.qubit", wires=2, c_dtype=request.param)
+        return qml.device("lightning.qubit", wires=3, c_dtype=request.param)

     def test_not_expval(self, dev):
         """Test if a QuantumFunctionError is raised for a tape with measurements that are not
@@ -79,7 +79,9 @@
         qml.RX(0.1, wires=0)
         qml.var(qml.PauliZ(0))

-        with pytest.raises(qml.QuantumFunctionError, match="Adjoint differentiation method does"):
+        with pytest.raises(
+            qml.QuantumFunctionError, match="Adjoint differentiation method does not"
+        ):
             dev.adjoint_jacobian(tape)

     def test_finite_shots_warns(self):
@@ -95,6 +97,15 @@ def test_finite_shots_warns(self):
         ):
             dev.adjoint_jacobian(tape)

+    def test_empty_measurements(self, tol, dev):
+        """Tests that an empty array is returned when the tape has no measurements."""
+
+        with qml.tape.QuantumTape() as tape:
+            qml.RX(0.4, wires=[0])
+
+        jac = dev.adjoint_jacobian(tape)
+        assert len(jac) == 0
+
     @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
     def test_unsupported_op(self, dev):
         """Test if a QuantumFunctionError is raised for an unsupported operation, i.e.,
@@ -130,28 +141,6 @@ def test_proj_unsupported(self, dev):
         ):
             dev.adjoint_jacobian(tape)

-    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
-    def test_unsupported_hermitian_expectation(self, dev):
-        obs = np.array([[1, 0], [0, -1]], dtype=np.complex128, requires_grad=False)
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RY(0.1, wires=(0,))
-            qml.expval(qml.Hermitian(obs, wires=(0,)))
-
-        with pytest.raises(
-            qml.QuantumFunctionError, match="Lightning adjoint differentiation method does not"
-        ):
-            dev.adjoint_jacobian(tape)
-
-        with qml.tape.QuantumTape() as tape:
-            qml.RY(0.1, wires=(0,))
-            qml.expval(qml.Hermitian(obs, wires=(0,)) @ qml.PauliZ(wires=1))
-
-        with pytest.raises(
-            qml.QuantumFunctionError, match="Lightning adjoint differentiation method does not"
-        ):
-            dev.adjoint_jacobian(tape)
-
     @pytest.mark.parametrize("theta", np.linspace(-2 * np.pi, 2 * np.pi, 7))
     @pytest.mark.parametrize("G", [qml.RX, qml.RY, qml.RZ])
     def test_pauli_rotation_gradient(self, G, theta, dev):
@@ -228,9 +217,8 @@ def test_rx_gradient(self, tol, dev):
         expected_jacobian = -np.sin(a)
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)

-    def test_multiple_rx_gradient(self, tol):
+    def test_multiple_rx_gradient_pauliz(self, tol, dev):
         """Tests that the gradient of multiple RX gates in a circuit yields the correct result."""
-        dev = qml.device("lightning.qubit", wires=3)
         params = np.array([np.pi, np.pi / 2, np.pi / 3])

         with qml.tape.QuantumTape() as tape:
@@ -246,6 +234,97 @@ def test_multiple_rx_gradient(self, tol):
         expected_jacobian = -np.diag(np.sin(params))
         assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)

+    def test_multiple_rx_gradient_hermitian(self, tol, dev):
+        """Tests that the gradient of multiple RX gates in a circuit yields the correct result
+        with a Hermitian observable.
+        """
+        params = np.array([np.pi, np.pi / 2, np.pi / 3])
+
+        with qml.tape.QuantumTape() as tape:
+            qml.RX(params[0], wires=0)
+            qml.RX(params[1], wires=1)
+            qml.RX(params[2], wires=2)
+
+            for idx in range(3):
+                qml.expval(qml.Hermitian([[1, 0], [0, -1]], wires=[idx]))
+
+        tape.trainable_params = {0, 1, 2}
+        # circuit jacobians
+        dev_jacobian = dev.adjoint_jacobian(tape)
+        expected_jacobian = -np.diag(np.sin(params))
+
+        assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
+
+    def test_multiple_rx_gradient_expval_hermitian(self, tol, dev):
+        """Tests that the gradient of multiple RX gates in a circuit yields the correct result
+        with a Hermitian observable acting on multiple wires.
+        """
+        params = np.array([np.pi / 3, np.pi / 4, np.pi / 5])
+
+        with qml.tape.QuantumTape() as tape:
+            qml.RX(params[0], wires=0)
+            qml.RX(params[1], wires=1)
+            qml.RX(params[2], wires=2)
+
+            qml.expval(
+                qml.Hermitian(
+                    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]], wires=[0, 2]
+                )
+            )
+
+        tape.trainable_params = {0, 1, 2}
+        dev_jacobian = dev.adjoint_jacobian(tape)
+        expected_jacobian = np.array(
+            [-np.sin(params[0]) * np.cos(params[2]), 0, -np.cos(params[0]) * np.sin(params[2])]
+        )
+
+        assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
+
+    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
+    def test_multiple_rx_gradient_expval_hamiltonian(self, tol, dev):
+        """Tests that the gradient of multiple RX gates in a circuit yields the correct result
+        with a Hamiltonian observable.
+        """
+        params = np.array([np.pi / 3, np.pi / 4, np.pi / 5])
+
+        ham = qml.Hamiltonian(
+            [1.0, 0.3, 0.3, 0.4],
+            [
+                qml.PauliX(0) @ qml.PauliX(1),
+                qml.PauliZ(0),
+                qml.PauliZ(1),
+                qml.Hermitian(
+                    [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, 1]], wires=[0, 2]
+                ),
+            ],
+        )
+
+        with qml.tape.QuantumTape() as tape:
+            qml.RX(params[0], wires=0)
+            qml.RX(params[1], wires=1)
+            qml.RX(params[2], wires=2)
+
+            qml.expval(ham)
+
+        tape.trainable_params = {0, 1, 2}
+        dev_jacobian = dev.adjoint_jacobian(tape)
+        expected_jacobian = (
+            0.3 * np.array([-np.sin(params[0]), 0, 0])
+            + 0.3 * np.array([0, -np.sin(params[1]), 0])
+            + 0.4
+            * np.array(
+                [-np.sin(params[0]) * np.cos(params[2]), 0, -np.cos(params[0]) * np.sin(params[2])]
+            )
+        )
+
+        assert np.allclose(dev_jacobian, expected_jacobian, atol=tol, rtol=0)
+
     qubit_ops = [getattr(qml, name) for name in qml.ops._qubit__ops__]
     ops = {qml.RX, qml.RY, qml.RZ, qml.PhaseShift, qml.CRX, qml.CRY, qml.CRZ, qml.Rot}
@@ -262,7 +341,7 @@ def test_multiple_rx_gradient(self, tol):
             qml.Rot(0.2, -0.1, 0.2, wires=0),
         ],
) - def test_gradients(self, op, obs, dev): + def test_gradients_pauliz(self, op, obs, dev): """Tests that the gradients of circuits match between the finite difference and device methods.""" @@ -282,7 +361,7 @@ def test_gradients(self, op, obs, dev): qml.expval(obs(wires=0)) qml.expval(qml.PauliZ(wires=1)) - dev.trainable_params = set(range(1, 1 + op.num_params)) + tape.trainable_params = set(range(1, 1 + op.num_params)) h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7 tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 @@ -294,7 +373,54 @@ def test_gradients(self, op, obs, dev): assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) - def test_gradient_gate_with_multiple_parameters(self, dev): + @pytest.mark.parametrize( + "op", + [ + qml.RX(0.4, wires=0), + qml.RY(0.6, wires=0), + qml.RZ(0.8, wires=0), + qml.CRX(1.0, wires=[0, 1]), + qml.CRY(2.0, wires=[0, 1]), + qml.CRZ(3.0, wires=[0, 1]), + qml.Rot(0.2, -0.1, 0.2, wires=0), + ], + ) + def test_gradients_hermitian(self, op, dev): + """Tests that the gradients of circuits match between the finite difference and device + methods.""" + + # op.num_wires and op.num_params must be initialized a priori + with qml.tape.QuantumTape() as tape: + qml.Hadamard(wires=0) + qml.RX(0.543, wires=0) + qml.CNOT(wires=[0, 1]) + + op.queue() + + qml.Rot(1.3, -2.3, 0.5, wires=[0]) + qml.RZ(-0.5, wires=0) + qml.RY(0.5, wires=1).inv() + qml.CNOT(wires=[0, 1]) + + qml.expval( + qml.Hermitian( + [[0, 0, 1, 1], [0, 1, 2, 1], [1, 2, 1, 0], [1, 1, 0, 0]], wires=[0, 1] + ) + ) + + tape.trainable_params = set(range(1, 1 + op.num_params)) + + h = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + grad_F = (lambda t, fn: fn(qml.execute(t, dev, None)))( + *qml.gradients.finite_diff(tape, h=h) + ) + grad_D = dev.adjoint_jacobian(tape) + + assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + + def test_gradient_gate_with_multiple_parameters_pauliz(self, dev): """Tests that gates with multiple free parameters yield correct gradients.""" x, y, z = [0.5, 0.3, -0.7] @@ -320,6 +446,61 @@ def test_gradient_gate_with_multiple_parameters(self, dev): # the different methods agree assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + def test_gradient_gate_with_multiple_parameters_hermitian(self, dev): + """Tests that gates with multiple free parameters yield correct gradients.""" + x, y, z = [0.5, 0.3, -0.7] + + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=[0]) + qml.Rot(x, y, z, wires=[0]) + qml.RY(-0.2, wires=[0]) + qml.expval(qml.Hermitian([[0, 1], [1, 1]], wires=0)) + + tape.trainable_params = {1, 2, 3} + + h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7 + tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7 + + grad_D = dev.adjoint_jacobian(tape) + tapes, fn = qml.gradients.finite_diff(tape, h=h) + grad_F = fn(qml.execute(tapes, dev, None)) + + # gradient has the correct shape and every element is nonzero + assert grad_D.shape == (1, 3) + assert np.count_nonzero(grad_D) == 3 + # the different methods agree + assert np.allclose(grad_D, grad_F, atol=tol, rtol=0) + + @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required") + def test_gradient_gate_with_multiple_parameters_hamiltonian(self, dev): + """Tests that gates with multiple free parameters yield correct gradients.""" + x, y, z = [0.5, 0.3, -0.7] + + ham = qml.Hamiltonian( + [1.0, 0.3, 0.3], [qml.PauliX(0) @ qml.PauliX(1), qml.PauliZ(0), qml.PauliZ(1)] + ) + + with qml.tape.QuantumTape() as tape: + qml.RX(0.4, wires=[0]) + qml.Rot(x, y, 
z, wires=[0])
+            qml.RY(-0.2, wires=[0])
+            qml.expval(ham)
+
+        tape.trainable_params = {1, 2, 3}
+
+        h = 2e-3 if dev.R_DTYPE == np.float32 else 1e-7
+        tol = 1e-3 if dev.R_DTYPE == np.float32 else 1e-7
+
+        grad_D = dev.adjoint_jacobian(tape)
+        tapes, fn = qml.gradients.finite_diff(tape, h=h)
+        grad_F = fn(qml.execute(tapes, dev, None))
+
+        # gradient has the correct shape and every element is nonzero
+        assert grad_D.shape == (1, 3)
+        assert np.count_nonzero(grad_D) == 3
+        # the different methods agree
+        assert np.allclose(grad_D, grad_F, atol=tol, rtol=0)
+
     def test_use_device_state(self, tol, dev):
         """Tests that when using the device state, the correct answer is still
         returned."""
@@ -359,6 +540,39 @@ def test_provide_starting_state(self, tol, dev):
         assert np.allclose(dM1, dM2, atol=tol, rtol=0)

+    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
+    def test_provide_wrong_starting_state(self, dev):
+        """Tests that an exception is raised when the provided starting state mismatches."""
+        x, y, z = [0.5, 0.3, -0.7]
+
+        with qml.tape.QuantumTape() as tape:
+            qml.RX(0.4, wires=[0])
+            qml.Rot(x, y, z, wires=[0])
+            qml.RY(-0.2, wires=[0])
+            qml.expval(qml.PauliZ(0))
+
+        tape.trainable_params = {1, 2, 3}
+
+        with pytest.raises(
+            qml.QuantumFunctionError,
+            match="The number of qubits of starting_state must be the same as",
+        ):
+            dev.adjoint_jacobian(tape, starting_state=np.ones(7))
+
+    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
+    def test_state_return_type(self, dev):
+        """Tests that an exception is raised when the return type is State."""
+        with qml.tape.QuantumTape() as tape:
+            qml.RX(0.4, wires=[0])
+            qml.state()
+
+        tape.trainable_params = {0}
+
+        with pytest.raises(
+            qml.QuantumFunctionError, match="This method does not support statevector return type."
+        ):
+            dev.adjoint_jacobian(tape)
+
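As a minimal usage sketch (not itself part of the patch) of the behaviour the adjoint-jacobian tests above pin down — Hamiltonian expectation values are now differentiable with the adjoint method on lightning.qubit — assuming a built lightning.qubit binary; the circuit and parameter values below are illustrative only:

```python
import pennylane as qml
from pennylane import numpy as np

dev = qml.device("lightning.qubit", wires=2)
ham = qml.Hamiltonian([1.0, 0.3], [qml.PauliX(0) @ qml.PauliX(1), qml.PauliZ(0)])

@qml.qnode(dev, diff_method="adjoint")
def circuit(params):
    # Hamiltonian expectations are now differentiable with the adjoint
    # method; previously only simple named observables were supported.
    qml.RX(params[0], wires=0)
    qml.RY(params[1], wires=1)
    return qml.expval(ham)

params = np.array([0.4, -0.2], requires_grad=True)
grad = qml.grad(circuit)(params)  # evaluated via dev.adjoint_jacobian
```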

 class TestAdjointJacobianQNode:
     """Test QNode integration with the adjoint_jacobian method"""
@@ -605,6 +819,44 @@ def f(params1, params2):
     assert np.allclose(grad_adjoint, grad_fd, atol=tol)

+@pytest.mark.parametrize(
+    "r_dtype,c_dtype", [[np.float32, np.complex64], [np.float64, np.complex128]]
+)
+def test_qchem_expvalcost_correct(r_dtype, c_dtype):
+    """ExpvalCost with a QChem Hamiltonian works correctly"""
+    from pennylane import qchem
+
+    symbols = ["Li", "H"]
+    geometry = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 2.969280527])
+    H, qubits = qchem.molecular_hamiltonian(
+        symbols, geometry, active_electrons=2, active_orbitals=5
+    )
+    active_electrons = 2
+    hf_state = qchem.hf_state(active_electrons, qubits)
+
+    def circuit_1(params, wires):
+        qml.BasisState(hf_state, wires=wires)
+        qml.RX(params[0], wires=0)
+        qml.RY(params[0], wires=1)
+        qml.RZ(params[0], wires=2)
+        qml.Hadamard(wires=1)
+
+    diff_method = "adjoint"
+    dev_lig = qml.device("lightning.qubit", wires=qubits, c_dtype=c_dtype)
+    cost_fn_lig = qml.ExpvalCost(circuit_1, H, dev_lig, optimize=False, diff_method=diff_method)
+    circuit_gradient_lig = qml.grad(cost_fn_lig, argnum=0)
+    params = np.array([0.123], requires_grad=True)
+    grads_lig = circuit_gradient_lig(params)
+
+    dev_def = qml.device("default.qubit", wires=qubits)
+    cost_fn_def = qml.ExpvalCost(circuit_1, H, dev_def, optimize=False, diff_method=diff_method)
+    circuit_gradient_def = qml.grad(cost_fn_def, argnum=0)
+    params = np.array([0.123], requires_grad=True)
+    grads_def = circuit_gradient_def(params)
+
+    assert np.allclose(grads_lig, grads_def)
+
+
 def circuit_ansatz(params, wires):
     """Circuit ansatz containing all the parametrized gates"""
     qml.QubitStateVector(unitary_group.rvs(2**4, random_state=0)[0], wires=wires)
@@ -639,6 +891,29 @@ def circuit_ansatz(params, wires):
     qml.RX(params[29], wires=wires[1])

+@pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
+def test_tape_qchem(tol):
+    """The circuit ansatz with a QChem Hamiltonian produces correct results"""
+
+    H, qubits = qml.qchem.molecular_hamiltonian(
+        ["H", "H"], np.array([0.0, 0.1, 0.0, 0.0, -0.1, 0.0])
+    )
+
+    def circuit(params):
+        circuit_ansatz(params, wires=range(4))
+        return qml.expval(H)
+
+    params = np.arange(30) * 0.111
+
+    dev_lq = qml.device("lightning.qubit", wires=4)
+    dev_dq = qml.device("default.qubit", wires=4)
+
+    circuit_lq = qml.QNode(circuit, dev_lq, diff_method="adjoint")
+    circuit_dq = qml.QNode(circuit, dev_dq, diff_method="parameter-shift")
+
+    assert np.allclose(qml.grad(circuit_lq)(params), qml.grad(circuit_dq)(params), tol)
+
+
 @pytest.mark.parametrize(
     "returns",
     [
@@ -651,9 +926,11 @@ def circuit_ansatz(params, wires):
         # qml.Projector([0, 0], wires=[2, 0])
         qml.PauliX(0) @ qml.PauliY(3),
         qml.PauliY(0) @ qml.PauliY(2) @ qml.PauliY(3),
-        # qml.Hermitian(np.kron(qml.PauliY.matrix, qml.PauliZ.matrix), wires=[3, 2]),
-        # qml.Hermitian(np.array([[0,1],[1,0]], requires_grad=False), wires=0),
-        # qml.Hermitian(np.array([[0,1],[1,0]], requires_grad=False), wires=0) @ qml.PauliZ(2),
+        qml.Hermitian(
+            np.kron(qml.PauliY.compute_matrix(), qml.PauliZ.compute_matrix()), wires=[3, 2]
+        ),
+        qml.Hermitian(np.array([[0, 1], [1, 0]], requires_grad=False), wires=0),
+        qml.Hermitian(np.array([[0, 1], [1, 0]], requires_grad=False), wires=0) @ qml.PauliZ(2),
     ],
 )
 def test_integration(returns):
@@ -719,9 +996,13 @@ def circuit(params):
         # qml.Projector([0, 0], wires=[custom_wires[2], custom_wires[0]])
         qml.PauliX(custom_wires[0]) @ qml.PauliY(custom_wires[3]),
         qml.PauliY(custom_wires[0]) @ qml.PauliY(custom_wires[2]) @ qml.PauliY(custom_wires[3]),
-        # qml.Hermitian(np.array([[0,1],[1,0]], requires_grad=False), wires=custom_wires[0]),
-        # qml.Hermitian(np.kron(qml.PauliY.matrix, qml.PauliZ.matrix), wires=[custom_wires[3], custom_wires[2]]),
-        # qml.Hermitian(np.array([[0,1],[1,0]], requires_grad=False), wires=custom_wires[0]) @ qml.PauliZ(custom_wires[2]),
+        qml.Hermitian(np.array([[0, 1], [1, 0]], requires_grad=False), wires=custom_wires[0]),
+        qml.Hermitian(
+            np.kron(qml.PauliY.compute_matrix(), qml.PauliZ.compute_matrix()),
+            wires=[custom_wires[3], custom_wires[2]],
+        ),
+        qml.Hermitian(np.array([[0, 1], [1, 0]], requires_grad=False), wires=custom_wires[0])
+        @ qml.PauliZ(custom_wires[2]),
     ],
 )
 def test_integration_custom_wires(returns):
diff --git a/tests/test_serialize.py b/tests/test_serialize.py
index 3f6dc85941..3dc121cde0 100644
--- a/tests/test_serialize.py
+++ b/tests/test_serialize.py
@@ -14,26 +14,36 @@
 """
 Unit tests for the serialization helper functions
 """
 import pennylane as qml
-from pennylane import numpy as np
+from pennylane import numpy as qnp
+import numpy as np
 import pennylane_lightning
 from pennylane_lightning._serialize import (
-    _serialize_obs,
+    _serialize_observables,
     _serialize_ops,
     _obs_has_kernel,
 )
 import pytest
 from unittest import mock

-try:
-    from pennylane_lightning.lightning_qubit_ops import (
-        ObsStructC64,
-        ObsStructC128,
-    )
-except (ImportError, ModuleNotFoundError):
+from pennylane_lightning.lightning_qubit import CPP_BINARY_AVAILABLE
+
+if not CPP_BINARY_AVAILABLE:
     pytest.skip("No binary module found. Skipping.", allow_module_level=True)

+from pennylane_lightning.lightning_qubit_ops.adjoint_diff import (
+    NamedObsC64,
+    NamedObsC128,
+    HermitianObsC64,
+    HermitianObsC128,
+    TensorProdObsC64,
+    TensorProdObsC128,
+    HamiltonianC64,
+    HamiltonianC128,
+)

 class TestObsHasKernel:
     """Tests for the _obs_has_kernel function"""
@@ -84,11 +94,7 @@ class TestSerializeObs:

     wires_dict = {i: i for i in range(10)}

-    @pytest.mark.skipif(
-        "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops),
-        reason="ObsStructC128 and ObsStructC64 are required",
-    )
-    @pytest.mark.parametrize("ObsFunc", [ObsStructC128, ObsStructC64])
+    @pytest.mark.parametrize("ObsFunc", [NamedObsC128, NamedObsC64])
     def test_basic_return(self, monkeypatch, ObsFunc):
         """Test expected serialization for a simple return"""
         with qml.tape.QuantumTape() as tape:
@@ -96,275 +102,280 @@ def test_basic_return(self, monkeypatch, ObsFunc):

         mock_obs = mock.MagicMock()

-        use_csingle = True if ObsFunc == ObsStructC64 else False
-        obs_str = "ObsStructC64" if ObsFunc == ObsStructC64 else "ObsStructC128"
+        use_csingle = True if ObsFunc == NamedObsC64 else False
+        obs_str = "NamedObsC64" if ObsFunc == NamedObsC64 else "NamedObsC128"

         with monkeypatch.context() as m:
             m.setattr(pennylane_lightning._serialize, obs_str, mock_obs)
-            _serialize_obs(tape, self.wires_dict, use_csingle=use_csingle)
+            _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle)

         s = mock_obs.call_args[0]
-        s_expected = (["PauliZ"], [], [[0]])
+        s_expected = ("PauliZ", [0])
         ObsFunc(*s_expected)

         assert s == s_expected

-    @pytest.mark.skipif(
-        "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops),
-        reason="ObsStructC128 and ObsStructC64 are required",
-    )
-    @pytest.mark.parametrize("ObsFunc", [ObsStructC128, ObsStructC64])
-    def 
test_tensor_return(self, monkeypatch, ObsFunc): + @pytest.mark.parametrize("use_csingle", [True, False]) + def test_tensor_return(self, monkeypatch, use_csingle): """Test expected serialization for a tensor product return""" with qml.tape.QuantumTape() as tape: qml.expval(qml.PauliZ(0) @ qml.PauliZ(1)) mock_obs = mock.MagicMock() - use_csingle = True if ObsFunc == ObsStructC64 else False - obs_str = "ObsStructC64" if ObsFunc == ObsStructC64 else "ObsStructC128" + ObsFunc = TensorProdObsC64 if use_csingle else TensorProdObsC128 + named_obs = NamedObsC64 if use_csingle else NamedObsC128 + obs_str = "TensorProdObsC64" if use_csingle else "TensorProdObsC128" with monkeypatch.context() as m: m.setattr(pennylane_lightning._serialize, obs_str, mock_obs) - _serialize_obs(tape, self.wires_dict, use_csingle=use_csingle) + _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle) s = mock_obs.call_args[0] - s_expected = (["PauliZ", "PauliZ"], [], [[0], [1]]) + s_expected = ([named_obs("PauliZ", [0]), named_obs("PauliZ", [1])],) ObsFunc(*s_expected) assert s == s_expected - @pytest.mark.skipif( - "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops), - reason="ObsStructC128 and ObsStructC64 are required", - ) - @pytest.mark.parametrize("ObsFunc", [ObsStructC128, ObsStructC64]) - def test_tensor_non_tensor_return(self, monkeypatch, ObsFunc): + @pytest.mark.parametrize("use_csingle", [True, False]) + def test_tensor_non_tensor_return(self, use_csingle): """Test expected serialization for a mixture of tensor product and non-tensor product return""" with qml.tape.QuantumTape() as tape: qml.expval(qml.PauliZ(0) @ qml.PauliX(1)) qml.expval(qml.Hadamard(1)) - mock_obs = mock.MagicMock() - - use_csingle = True if ObsFunc == ObsStructC64 else False - obs_str = "ObsStructC64" if ObsFunc == ObsStructC64 else "ObsStructC128" - - with monkeypatch.context() as m: - m.setattr(pennylane_lightning._serialize, obs_str, mock_obs) - _serialize_obs(tape, self.wires_dict, use_csingle=use_csingle) + tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128 + named_obs = NamedObsC64 if use_csingle else NamedObsC128 - s = mock_obs.call_args_list + s = _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle) s_expected = [ - (["PauliZ", "PauliX"], [], [[0], [1]]), - (["Hadamard"], [], [[1]]), + tensor_prod_obs([named_obs("PauliZ", [0]), named_obs("PauliX", [1])]), + named_obs("Hadamard", [1]), ] - [ObsFunc(*s_expected) for s_expected in s_expected] - assert s[0][0] == s_expected[0] - assert s[1][0] == s_expected[1] + assert s == s_expected - @pytest.mark.skipif( - "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops), - reason="ObsStructC128 and ObsStructC64 are required", - ) - @pytest.mark.parametrize("ObsFunc", [ObsStructC128, ObsStructC64]) - def test_hermitian_return(self, monkeypatch, ObsFunc): + @pytest.mark.parametrize("use_csingle", [True, False]) + def test_hermitian_return(self, use_csingle): """Test expected serialization for a Hermitian return""" with qml.tape.QuantumTape() as tape: qml.expval(qml.Hermitian(np.eye(4), wires=[0, 1])) - mock_obs = mock.MagicMock() - - use_csingle = True if ObsFunc == ObsStructC64 else False - obs_str = "ObsStructC64" if ObsFunc == ObsStructC64 else "ObsStructC128" - - with monkeypatch.context() as m: - m.setattr(pennylane_lightning._serialize, obs_str, mock_obs) - _serialize_obs(tape, self.wires_dict, use_csingle=use_csingle) + hermitian_obs = HermitianObsC64 if use_csingle 
else HermitianObsC128
+        c_dtype = np.complex64 if use_csingle else np.complex128

-        s = mock_obs.call_args[0]
-        s_expected = (["Hermitian"], [np.eye(4).ravel()], [[0, 1]])
-        ObsFunc(*s_expected)
-
-        assert s[0] == s_expected[0]
-        assert np.allclose(s[1], s_expected[1])
-        assert s[2] == s_expected[2]
+        s = _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle)
+        s_expected = hermitian_obs(
+            np.array(
+                [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0],
+                dtype=c_dtype,
+            ),
+            [0, 1],
+        )
+        assert s[0] == s_expected

-    @pytest.mark.skipif(
-        "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops),
-        reason="ObsStructC128 and ObsStructC64 are required",
-    )
-    @pytest.mark.parametrize("ObsFunc", [ObsStructC128, ObsStructC64])
-    def test_hermitian_tensor_return(self, monkeypatch, ObsFunc):
+    @pytest.mark.parametrize("use_csingle", [True, False])
+    def test_hermitian_tensor_return(self, use_csingle):
         """Test expected serialization for a Hermitian return"""
         with qml.tape.QuantumTape() as tape:
             qml.expval(qml.Hermitian(np.eye(4), wires=[0, 1]) @ qml.Hermitian(np.eye(2), wires=[2]))

-        mock_obs = mock.MagicMock()
-
-        use_csingle = True if ObsFunc == ObsStructC64 else False
-        obs_str = "ObsStructC64" if ObsFunc == ObsStructC64 else "ObsStructC128"
-
-        with monkeypatch.context() as m:
-            m.setattr(pennylane_lightning._serialize, obs_str, mock_obs)
-            _serialize_obs(tape, self.wires_dict, use_csingle=use_csingle)
+        c_dtype = np.complex64 if use_csingle else np.complex128
+        tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128
+        hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
+        s = _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle)

-        s = mock_obs.call_args[0]
-        s_expected = (
-            ["Hermitian", "Hermitian"],
-            [np.eye(4).ravel(), np.eye(2).ravel()],
-            [[0, 1], [2]],
+        s_expected = tensor_prod_obs(
+            [
+                hermitian_obs(np.eye(4, dtype=c_dtype).ravel(), [0, 1]),
+                hermitian_obs(np.eye(2, dtype=c_dtype).ravel(), [2]),
+            ]
         )
-        ObsFunc(*s_expected)

-        assert s[0] == s_expected[0]
-        assert np.allclose(s[1][0], s_expected[1][0])
-        assert np.allclose(s[1][1], s_expected[1][1])
-        assert s[2] == s_expected[2]
-
-    @pytest.mark.skipif(
-        "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops),
-        reason="ObsStructC128 and ObsStructC64 are required",
-    )
-    @pytest.mark.parametrize("ObsFunc", [ObsStructC128, ObsStructC64])
-    def test_mixed_tensor_return(self, monkeypatch, ObsFunc):
+        assert s[0] == s_expected
+
+    @pytest.mark.parametrize("use_csingle", [True, False])
+    def test_mixed_tensor_return(self, use_csingle):
         """Test expected serialization for a mixture of Hermitian and Pauli return"""

         with qml.tape.QuantumTape() as tape:
             qml.expval(qml.Hermitian(np.eye(4), wires=[0, 1]) @ qml.PauliY(2))

-        mock_obs = mock.MagicMock()
-
-        use_csingle = True if ObsFunc == ObsStructC64 else False
-        obs_str = "ObsStructC64" if ObsFunc == ObsStructC64 else "ObsStructC128"
+        c_dtype = np.complex64 if use_csingle else np.complex128
+        tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128
+        hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
+        named_obs = NamedObsC64 if use_csingle else NamedObsC128

-        with monkeypatch.context() as m:
-            m.setattr(pennylane_lightning._serialize, obs_str, mock_obs)
-            _serialize_obs(tape, self.wires_dict, use_csingle=use_csingle)
+        s = _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle)

-        s = 
mock_obs.call_args[0] - s_expected = (["Hermitian", "PauliY"], [np.eye(4).ravel()], [[0, 1], [2]]) - ObsFunc(*s_expected) + s_expected = tensor_prod_obs( + [hermitian_obs(np.eye(4, dtype=c_dtype).ravel(), [0, 1]), named_obs("PauliY", [2])] + ) - assert s[0] == s_expected[0] - assert np.allclose(s[1][0], s_expected[1][0]) - assert s[2] == s_expected[2] - - @pytest.mark.skipif( - "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops), - reason="ObsStructC128 and ObsStructC64 are required", - ) - def test_integration_c64(self, monkeypatch): - """Test for a comprehensive range of returns""" - wires_dict = {"a": 0, 1: 1, "b": 2, -1: 3, 3.141: 4, "five": 5, 6: 6, 77: 7, 9: 8} - I = np.eye(2).astype(np.complex64) - X = qml.PauliX.compute_matrix().astype(np.complex64) - Y = qml.PauliY.compute_matrix().astype(np.complex64) - Z = qml.PauliZ.compute_matrix().astype(np.complex64) + assert s[0] == s_expected - mock_obs = mock.MagicMock() + @pytest.mark.parametrize("use_csingle", [True, False]) + def test_hamiltonian_return(self, use_csingle): + """Test expected serialization for a Hamiltonian return""" - use_csingle = True + ham = qml.Hamiltonian( + [0.3, 0.5, 0.4], + [ + qml.Hermitian(np.eye(4), wires=[0, 1]) @ qml.PauliY(2), + qml.PauliX(0) @ qml.PauliY(2), + qml.Hermitian(np.ones((8, 8)), wires=range(3)), + ], + ) with qml.tape.QuantumTape() as tape: - qml.expval(qml.PauliZ("a") @ qml.PauliX("b")) - qml.expval(qml.Hermitian(I, wires=1)) - qml.expval(qml.PauliZ(-1) @ qml.Hermitian(X, wires=3.141) @ qml.Hadamard("five")) - # qml.expval(qml.Projector([1, 1], wires=[6, 77]) @ qml.Hermitian(Y, wires=9)) - qml.expval(qml.Hermitian(Z, wires="a") @ qml.Identity(1)) + qml.expval(ham) + + obs_str = "HamiltonianC64" if use_csingle else "HamiltonianC128" + hamiltonian_obs = HamiltonianC64 if use_csingle else HamiltonianC128 + named_obs = NamedObsC64 if use_csingle else NamedObsC128 + hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128 + tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128 + r_dtype = np.float32 if use_csingle else np.float64 + c_dtype = np.complex64 if use_csingle else np.complex128 + + s = _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle) + + s_expected = hamiltonian_obs( + np.array([0.3, 0.5, 0.4], dtype=r_dtype), + [ + tensor_prod_obs( + [ + hermitian_obs(np.eye(4, dtype=c_dtype).ravel(), [0, 1]), + named_obs("PauliY", [2]), + ] + ), + tensor_prod_obs([named_obs("PauliX", [0]), named_obs("PauliY", [2])]), + hermitian_obs(np.ones(64, dtype=c_dtype), [0, 1, 2]), + ], + ) - with monkeypatch.context() as m: - m.setattr(pennylane_lightning._serialize, "ObsStructC64", mock_obs) - _serialize_obs(tape, wires_dict, use_csingle=use_csingle) + assert s[0] == s_expected - s = mock_obs.call_args_list + @pytest.mark.parametrize("use_csingle", [True, False]) + def test_hamiltonian_tensor_return(self, use_csingle): + """Test expected serialization for a Hamiltonian return""" - s_expected = [ - (["PauliZ", "PauliX"], [], [[0], [2]]), - (["Hermitian"], [I.ravel()], [[1]]), - (["PauliZ", "Hermitian", "Hadamard"], [[], X.ravel(), []], [[3], [4], [5]]), - # (["Projector", "Hermitian"], [[],Y.ravel()], [[6, 7], [8]]), - (["Hermitian", "Identity"], [Z.ravel(), []], [[0], [1]]), - ] - [ObsStructC64(*s_expected) for s_expected in s_expected] - - assert all(s1[0][0] == s2[0] for s1, s2 in zip(s, s_expected)) - for s1, s2 in zip(s, s_expected): - for v1, v2 in zip(s1[0][1], s2[1]): - assert np.allclose(v1, v2) - assert 
all(s1[0][2] == s2[2] for s1, s2 in zip(s, s_expected))
-
-    @pytest.mark.skipif(
-        "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops),
-        reason="ObsStructC128 and ObsStructC64 are required",
-    )
-    @pytest.mark.parametrize("ObsFunc", [ObsStructC128, ObsStructC64])
-    def test_integration_c128(self, monkeypatch, ObsFunc):
-        """Test for a comprehensive range of returns"""
-        wires_dict = {"a": 0, 1: 1, "b": 2, -1: 3, 3.141: 4, "five": 5, 6: 6, 77: 7, 9: 8}
-        I = np.eye(2).astype(np.complex128)
-        X = qml.PauliX.compute_matrix().astype(np.complex128)
-        Y = qml.PauliY.compute_matrix().astype(np.complex128)
-        Z = qml.PauliZ.compute_matrix().astype(np.complex128)
+        with qml.tape.QuantumTape() as tape:
+            ham = qml.Hamiltonian(
+                [0.3, 0.5, 0.4],
+                [
+                    qml.Hermitian(np.eye(4), wires=[0, 1]) @ qml.PauliY(2),
+                    qml.PauliX(0) @ qml.PauliY(2),
+                    qml.Hermitian(np.ones((8, 8)), wires=range(3)),
+                ],
+            )
+            qml.expval(ham @ qml.PauliZ(3))
+
+        obs_str = "HamiltonianC64" if use_csingle else "HamiltonianC128"
+        hamiltonian_obs = HamiltonianC64 if use_csingle else HamiltonianC128
+        named_obs = NamedObsC64 if use_csingle else NamedObsC128
+        hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
+        tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128
+        r_dtype = np.float32 if use_csingle else np.float64
+        c_dtype = np.complex64 if use_csingle else np.complex128
+
+        s = _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle)
+
+        # Expression (ham @ obs) is converted internally by PennyLane
+        # where obs is appended to each term of the ham
+        s_expected = hamiltonian_obs(
+            np.array([0.3, 0.5, 0.4], dtype=r_dtype),
+            [
+                tensor_prod_obs(
+                    [
+                        hermitian_obs(np.eye(4, dtype=c_dtype).ravel(), [0, 1]),
+                        named_obs("PauliY", [2]),
+                        named_obs("PauliZ", [3]),
+                    ]
+                ),
+                tensor_prod_obs(
+                    [named_obs("PauliX", [0]), named_obs("PauliY", [2]), named_obs("PauliZ", [3])]
+                ),
+                tensor_prod_obs(
+                    [hermitian_obs(np.ones(64, dtype=c_dtype), [0, 1, 2]), named_obs("PauliZ", [3])]
+                ),
+            ],
+        )

-        mock_obs = mock.MagicMock()
+        assert s[0] == s_expected
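For reference, a brief sketch (not part of the patch) of the observable structure these serialization tests assert, using the double-precision classes introduced by this PR; the tape and wire map below are illustrative:

```python
import numpy as np
import pennylane as qml
from pennylane_lightning._serialize import _serialize_observables
from pennylane_lightning.lightning_qubit_ops.adjoint_diff import (
    HamiltonianC128,
    NamedObsC128,
    TensorProdObsC128,
)

# A Hamiltonian serializes to HamiltonianC128(coeffs, terms); each term is
# itself a NamedObs / HermitianObs / TensorProdObs object.
ham = qml.Hamiltonian([0.5, 0.2], [qml.PauliZ(0) @ qml.PauliX(1), qml.PauliY(0)])
with qml.tape.QuantumTape() as tape:
    qml.expval(ham)

(serialized,) = _serialize_observables(tape, {0: 0, 1: 1})
expected = HamiltonianC128(
    np.array([0.5, 0.2]),
    [
        TensorProdObsC128([NamedObsC128("PauliZ", [0]), NamedObsC128("PauliX", [1])]),
        NamedObsC128("PauliY", [0]),
    ],
)
assert serialized == expected
```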
-        use_csingle = False
+    @pytest.mark.parametrize("use_csingle", [True, False])
+    def test_hamiltonian_mix_return(self, use_csingle):
+        """Test expected serialization for multiple Hamiltonian returns"""

-        with qml.tape.QuantumTape() as tape:
-            qml.expval(qml.PauliZ("a") @ qml.PauliX("b"))
-            qml.expval(qml.Hermitian(I, wires=1))
-            qml.expval(qml.PauliZ(-1) @ qml.Hermitian(X, wires=3.141) @ qml.Hadamard("five"))
-            # qml.expval(qml.Projector([1, 1], wires=[6, 77]) @ qml.Hermitian(Y, wires=9))
-            qml.expval(qml.Hermitian(Z, wires="a") @ qml.Identity(1))
+        ham1 = qml.Hamiltonian(
+            [0.3, 0.5, 0.4],
+            [
+                qml.Hermitian(np.eye(4), wires=[0, 1]) @ qml.PauliY(2),
+                qml.PauliX(0) @ qml.PauliY(2),
+                qml.Hermitian(np.ones((8, 8)), wires=range(3)),
+            ],
+        )
+        ham2 = qml.Hamiltonian(
+            [0.7, 0.3],
+            [qml.PauliX(0) @ qml.Hermitian(np.eye(4), wires=[1, 2]), qml.PauliY(0) @ qml.PauliX(2)],
+        )

-        with monkeypatch.context() as m:
-            m.setattr(pennylane_lightning._serialize, "ObsStructC128", mock_obs)
-            _serialize_obs(tape, wires_dict, use_csingle=use_csingle)
+        with qml.tape.QuantumTape() as tape:
+            qml.expval(ham1)
+            qml.expval(ham2)
+
+        obs_str = "HamiltonianC64" if use_csingle else "HamiltonianC128"
+        hamiltonian_obs = HamiltonianC64 if use_csingle else HamiltonianC128
+        named_obs = NamedObsC64 if use_csingle else NamedObsC128
+        hermitian_obs = HermitianObsC64 if use_csingle else HermitianObsC128
+        tensor_prod_obs = TensorProdObsC64 if use_csingle else TensorProdObsC128
+        r_dtype = np.float32 if use_csingle else np.float64
+        c_dtype = np.complex64 if use_csingle else np.complex128
+
+        s = _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle)
+
+        s_expected1 = hamiltonian_obs(
+            np.array([0.3, 0.5, 0.4], dtype=r_dtype),
+            [
+                tensor_prod_obs(
+                    [
+                        hermitian_obs(np.eye(4, dtype=c_dtype).ravel(), [0, 1]),
+                        named_obs("PauliY", [2]),
+                    ]
+                ),
+                tensor_prod_obs([named_obs("PauliX", [0]), named_obs("PauliY", [2])]),
+                hermitian_obs(np.ones(64, dtype=c_dtype), [0, 1, 2]),
+            ],
+        )
+        s_expected2 = hamiltonian_obs(
+            np.array([0.7, 0.3], dtype=r_dtype),
+            [
+                tensor_prod_obs(
+                    [
+                        named_obs("PauliX", [0]),
+                        hermitian_obs(np.eye(4, dtype=c_dtype).ravel(), [1, 2]),
+                    ]
+                ),
+                tensor_prod_obs([named_obs("PauliY", [0]), named_obs("PauliX", [2])]),
+            ],
+        )

-        s = mock_obs.call_args_list
+        assert s[0] == s_expected1
+        assert s[1] == s_expected2

-        s_expected = [
-            (["PauliZ", "PauliX"], [], [[0], [2]]),
-            (["Hermitian"], [I.ravel()], [[1]]),
-            (["PauliZ", "Hermitian", "Hadamard"], [[], X.ravel(), []], [[3], [4], [5]]),
-            # (["Projector", "Hermitian"], [[],Y.ravel()], [[6, 7], [8]]),
-            (["Hermitian", "Identity"], [Z.ravel(), []], [[0], [1]]),
-        ]
-        [ObsStructC128(*s_expected) for s_expected in s_expected]
-
-        assert all(s1[0][0] == s2[0] for s1, s2 in zip(s, s_expected))
-        for s1, s2 in zip(s, s_expected):
-            for v1, v2 in zip(s1[0][1], s2[1]):
-                assert np.allclose(v1, v2)
-        assert all(s1[0][2] == s2[2] for s1, s2 in zip(s, s_expected))
-
-    @pytest.mark.skipif(
-        "ObsStructC128" and "ObsStructC64" not in dir(pennylane_lightning.lightning_qubit_ops),
-        reason="ObsStructC128 and ObsStructC64 are required",
-    )
-    @pytest.mark.parametrize("ObsFunc", [ObsStructC128, ObsStructC64])
+    @pytest.mark.parametrize("use_csingle", [True, False])
     @pytest.mark.parametrize("ObsChunk", list(range(1, 5)))
-    def test_chunk_obs(self, monkeypatch, ObsFunc, ObsChunk):
+    def test_chunk_obs(self, monkeypatch, use_csingle, ObsChunk):
         """Test chunking of observable array"""
         with qml.tape.QuantumTape() as tape:
             qml.expval(qml.PauliZ(0) @ qml.PauliX(1))
             qml.expval(qml.PauliY(wires=1))
             qml.expval(qml.PauliX(0) @ qml.Hermitian([[0, 1], [1, 0]], wires=3) @ qml.Hadamard(2))
-            qml.expval(qml.Hermitian(qml.PauliZ.compute_matrix(), wires=1) @ qml.Identity(1))
+            qml.expval(qml.Hermitian(qml.PauliZ.compute_matrix(), wires=0) @ qml.Identity(1))

-        mock_obs = mock.MagicMock()
-
-        use_csingle = True if ObsFunc == ObsStructC64 else False
-        obs_str = "ObsStructC64" if ObsFunc == ObsStructC64 else "ObsStructC128"
-
-        with monkeypatch.context() as m:
-            m.setattr(pennylane_lightning._serialize, obs_str, mock_obs)
-            _serialize_obs(tape, self.wires_dict, use_csingle=use_csingle)
-
-        s = mock_obs.call_args_list
+        s = _serialize_observables(tape, self.wires_dict, use_csingle=use_csingle)

         obtained_chunks = pennylane_lightning.lightning_qubit._chunk_iterable(s, ObsChunk)
         assert len(list(obtained_chunks)) == int(np.ceil(len(s) / ObsChunk))
@@ -386,13 +397,14 @@ def test_basic_circuit(self):
         s_expected = (
             (
                 ["RX", "RY", "CNOT"],
-                [[0.4], [0.6], []],
+                [np.array([0.4]), np.array([0.6]), []],
                 [[0], [1], [0, 1]],
                 [False, False, False],
                 [[], [], []],
             ),
             False,
         )
         assert s == s_expected

     def test_skips_prep_circuit(self):
@@ -470,8 +482,6 @@ def test_custom_wires_circuit(self):
             qml.SingleExcitationMinus(0.5, wires=["a", 3.2]).inv()

         s = _serialize_ops(tape, wires_dict)
-        print(s)
-        print()
         s_expected = (
             (
                 [
diff --git 
a/tests/test_vjp.py b/tests/test_vjp.py index 746aa5bfc5..24da05e601 100644 --- a/tests/test_vjp.py +++ b/tests/test_vjp.py @@ -16,126 +16,15 @@ """ from cmath import exp import pytest +import math import pennylane as qml from pennylane import numpy as np -try: - from pennylane_lightning.lightning_qubit_ops import ( - VectorJacobianProductC64, - VectorJacobianProductC128, - ) -except (ImportError, ModuleNotFoundError): - pytest.skip("No binary module found. Skipping.", allow_module_level=True) - - -class TestComputeVJP: - """Tests for the numeric computation of VJPs""" - - @pytest.fixture(params=[np.complex64, np.complex128]) - def dev(self, request): - return qml.device("lightning.qubit", wires=2, c_dtype=request.param) - - def test_computation(self, tol, dev): - """Test that the correct VJP is returned""" - dy = np.array([[1.0, 2.0], [3.0, 4.0]]) - jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]]) - - vjp = dev.compute_vjp(dy, jac) - expected = np.tensordot(dy, jac, axes=[[0, 1], [0, 1]]) - - assert vjp.shape == (3,) - assert vjp.dtype == dev.R_DTYPE - assert np.allclose(vjp, expected, atol=tol, rtol=0) - - def test_computation_num(self, tol, dev): - """Test that the correct VJP is returned""" - dy = np.array([[1.0, 2.0], [3.0, 4.0]]) - jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]]) +from pennylane_lightning.lightning_qubit import CPP_BINARY_AVAILABLE - vjp = dev.compute_vjp(dy, jac, num=4) - expected = np.tensordot(dy, jac, axes=[[0, 1], [0, 1]]) - - assert vjp.shape == (3,) - assert vjp.dtype == dev.R_DTYPE - assert np.allclose(vjp, expected, atol=tol, rtol=0) - - def test_computation_num_error(self, dev): - """Test that the correct VJP is returned""" - dev._state = dev._asarray(dev._state) - - dy = np.array([[1.0, 2.0], [3.0, 4.0]]) - jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]]) - - with pytest.raises(ValueError, match="Invalid size for the gradient-output vector"): - dev.compute_vjp(dy, jac, num=3) - - def test_jacobian_is_none(self, dev): - """A None Jacobian returns a None VJP""" - dev._state = dev._asarray(dev._state) - - dy = np.array([[1.0, 2.0], [3.0, 4.0]]) - jac = None - - vjp = dev.compute_vjp(dy, jac) - assert vjp is None - - def test_zero_dy(self, dev): - """A zero dy vector will return a zero matrix""" - dev._state = dev._asarray(dev._state) - - dy = np.zeros([2, 2]) - jac = np.array([[[1.0, 0.1, 0.2], [0.2, 0.6, 0.1]], [[0.4, -0.7, 1.2], [-0.5, -0.6, 0.7]]]) - - vjp = dev.compute_vjp(dy, jac) - assert np.all(vjp == np.zeros([3])) - - def test_array_dy(self, dev): - """Test vjp_compute using Python array""" - - dy = [1.0, 1.0, 1.0, 1.0] - jac = [dy, dy, dy, dy] - - expected = [4.0, 4.0, 4.0, 4.0] - vjp = dev.compute_vjp(dy, jac) - - assert np.all(vjp == expected) - - def test_torch_tensor_dy(self, dev): - """Test vjp_compute using the Torch interface""" - torch = pytest.importorskip("torch") - - if dev.R_DTYPE == np.float32: - torch_r_dtype = torch.float32 - else: - torch_r_dtype = torch.float64 - - dy = torch.ones(4, dtype=torch_r_dtype) - jac = torch.ones((4, 4), dtype=torch_r_dtype) - - expected = torch.tensor([4.0, 4.0, 4.0, 4.0], dtype=torch_r_dtype) - vjp = dev.compute_vjp(dy, jac) - - assert vjp.dtype == torch_r_dtype - assert torch.all(vjp == expected) - - def test_tf_tensor_dy(self, dev): - """Test vjp_compute using the Tensorflow interface""" - tf = pytest.importorskip("tensorflow") - - if dev.R_DTYPE == np.float32: - tf_r_dtype = 
tf.float32
-        else:
-            tf_r_dtype = tf.float64
-
-        dy = tf.ones(4, dtype=tf_r_dtype)
-        jac = tf.ones((4, 4), dtype=tf_r_dtype)
-
-        expected = tf.constant([4.0, 4.0, 4.0, 4.0], dtype=tf_r_dtype)
-        vjp = dev.compute_vjp(dy, jac)
-
-        assert vjp.dtype == dev.R_DTYPE  # ?
-        assert tf.reduce_all(vjp == expected)

 class TestVectorJacobianProduct:
@@ -159,11 +48,11 @@ def test_use_device_state(self, tol, dev):

         dy = np.array([1.0])

-        fn1 = dev.vjp(tape, dy)
+        fn1 = dev.vjp(tape.measurements, dy)
         vjp1 = fn1(tape)

         qml.execute([tape], dev, None)
-        fn2 = dev.vjp(tape, dy, use_device_state=True)
+        fn2 = dev.vjp(tape.measurements, dy, use_device_state=True)
         vjp2 = fn2(tape)

         assert np.allclose(vjp1, vjp2, atol=tol, rtol=0)
@@ -182,15 +71,70 @@ def test_provide_starting_state(self, tol, dev):

         dy = np.array([1.0])

-        fn1 = dev.vjp(tape, dy)
+        fn1 = dev.vjp(tape.measurements, dy)
         vjp1 = fn1(tape)

         qml.execute([tape], dev, None)
-        fn2 = dev.vjp(tape, dy, starting_state=dev._pre_rotated_state)
+        fn2 = dev.vjp(tape.measurements, dy, starting_state=dev._pre_rotated_state)
         vjp2 = fn2(tape)

         assert np.allclose(vjp1, vjp2, atol=tol, rtol=0)

+    def test_multiple_measurements(self, tol, dev):
+        """Tests that the correct result is returned when multiple measurements are provided."""
+        x, y, z = [0.5, 0.3, -0.7]
+
+        with qml.tape.QuantumTape() as tape1:
+            qml.RX(0.4, wires=[0])
+            qml.Rot(x, y, z, wires=[0])
+            qml.RY(-0.2, wires=[0])
+            qml.expval(qml.PauliX(0))
+            qml.expval(qml.PauliY(1))
+            qml.expval(qml.PauliZ(1))
+
+        dy = np.array([1.0, 2.0, 3.0])
+        tape1.trainable_params = {1, 2, 3}
+
+        with qml.tape.QuantumTape() as tape2:
+            ham = qml.Hamiltonian(dy, [qml.PauliX(0), qml.PauliY(1), qml.PauliZ(1)])
+            qml.RX(0.4, wires=[0])
+            qml.Rot(x, y, z, wires=[0])
+            qml.RY(-0.2, wires=[0])
+            qml.expval(ham)
+
+        tape2.trainable_params = {1, 2, 3}
+
+        fn1 = dev.vjp(tape1.measurements, dy)
+        vjp1 = fn1(tape1)
+
+        vjp2 = dev.adjoint_jacobian(tape2)
+
+        assert np.allclose(vjp1, vjp2, atol=tol, rtol=0)
+
+    def test_wrong_dy_expval(self, tol, dev):
+        """Tests that an exception is raised when dy is incorrect."""
+        x, y, z = [0.5, 0.3, -0.7]
+
+        with qml.tape.QuantumTape() as tape1:
+            qml.RX(0.4, wires=[0])
+            qml.Rot(x, y, z, wires=[0])
+            qml.RY(-0.2, wires=[0])
+            qml.expval(qml.PauliX(0))
+            qml.expval(qml.PauliY(1))
+            qml.expval(qml.PauliZ(1))
+
+        dy1 = np.array([1.0, 2.0])
+        tape1.trainable_params = {1, 2, 3}
+
+        with pytest.raises(
+            ValueError, match="Number of observables in the tape must be the same as"
+        ):
+            dev.vjp(tape1.measurements, dy1)
+
+        dy2 = np.array([1.0 + 3.0j, 0.3 + 2.0j, 0.5 + 0.1j])
+        with pytest.raises(ValueError, match="The vjp method only works with a real-valued dy"):
+            dev.vjp(tape1.measurements, dy2)
+
     def test_not_expval(self, dev):
         """Test if a QuantumFunctionError is raised for a tape with measurements that are not
         expectation values"""
@@ -200,8 +144,10 @@ def test_not_expval(self, dev):

         dy = np.array([1.0])

-        with pytest.raises(qml.QuantumFunctionError, match="Adjoint differentiation method does"):
-            dev.vjp(tape, dy)(tape)
+        with pytest.raises(
+            qml.QuantumFunctionError, match="Adjoint differentiation method does not"
+        ):
+            dev.vjp(tape.measurements, dy)(tape)

     def test_finite_shots_warns(self):
         """Tests warning raised when finite shots specified"""
@@ -216,11 +162,8 @@ def test_finite_shots_warns(self):
         with pytest.warns(
             UserWarning, match="Requested adjoint differentiation to be computed with finite shots."
        ):
-            dev.vjp(tape, dy)(tape)
-
-    from pennylane_lightning import LightningQubit as lq
+            dev.vjp(tape.measurements, dy)(tape)

-    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
     def test_unsupported_op(self, dev):
         """Test if a QuantumFunctionError is raised for an unsupported operation, i.e.,
         multi-parameter operations that are not qml.Rot"""
@@ -234,9 +177,8 @@ def test_unsupported_op(self, dev):
         with pytest.raises(
             qml.QuantumFunctionError, match="The CRot operation is not supported using the"
         ):
-            dev.vjp(tape, dy)(tape)
+            dev.vjp(tape.measurements, dy)(tape)

-    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
     def test_proj_unsupported(self, dev):
         """Test if a QuantumFunctionError is raised for a Projector observable"""
@@ -249,7 +191,7 @@ def test_proj_unsupported(self, dev):
         with pytest.raises(
             qml.QuantumFunctionError, match="differentiation method does not support the Projector"
         ):
-            dev.vjp(tape, dy)(tape)
+            dev.vjp(tape.measurements, dy)(tape)

         with qml.tape.QuantumTape() as tape:
             qml.CRX(0.1, wires=[0, 1])
@@ -258,31 +200,116 @@ def test_proj_unsupported(self, dev):
         with pytest.raises(
             qml.QuantumFunctionError, match="differentiation method does not support the Projector"
         ):
-            dev.vjp(tape, dy)(tape)
-
-    @pytest.mark.skipif(not lq._CPP_BINARY_AVAILABLE, reason="Lightning binary required")
-    def test_unsupported_hermitian_expectation(self, dev):
-        obs = np.array([[1, 0], [0, -1]], dtype=np.complex128, requires_grad=False)
+            dev.vjp(tape.measurements, dy)(tape)
+
+    def test_hermitian_expectation(self, dev, tol):
+        obs = np.array([[1, 0], [0, -1]], dtype=dev.C_DTYPE, requires_grad=False)
+        dy = np.array([0.8])
+
+        fn = dev.vjp([qml.expval(qml.Hermitian(obs, wires=(0,)))], dy)
+
+        for x in np.linspace(-2 * math.pi, 2 * math.pi, 7):
+            with qml.tape.QuantumTape() as tape:
+                qml.RY(x, wires=(0,))
+            vjp = fn(tape)
+            assert np.allclose(vjp[0], -0.8 * np.sin(x), atol=tol)
+
+    def test_hermitian_tensor_expectation(self, dev, tol):
+        obs = np.array([[1, 0], [0, -1]], dtype=dev.C_DTYPE, requires_grad=False)
+        dy = np.array([0.8])
+
+        fn = dev.vjp([qml.expval(qml.Hermitian(obs, wires=(0,)) @ qml.PauliZ(wires=1))], dy)
+
+        for x in np.linspace(-2 * math.pi, 2 * math.pi, 7):
+            with qml.tape.QuantumTape() as tape:
+                qml.RY(x, wires=(0,))
+            assert np.allclose(fn(tape), -0.8 * np.sin(x), atol=tol)
+
+    def test_statevector_ry(self, dev, tol):
+        dy = np.array(
+            [[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]]
+        )
+        fn0 = dev.vjp([qml.state()], dy[0, :])
+        fn1 = dev.vjp([qml.state()], dy[1, :])
+        fn2 = dev.vjp([qml.state()], dy[2, :])
+        fn3 = dev.vjp([qml.state()], dy[3, :])
+
+        for x in np.linspace(-2 * math.pi, 2 * math.pi, 7):
+            with qml.tape.QuantumTape() as tape:
+                qml.RY(x, wires=(0,))
+            assert np.allclose(fn0(tape), -np.sin(x / 2) / 2, atol=tol)
+            assert np.allclose(fn1(tape), np.cos(x / 2) / 2, atol=tol)
+            assert np.allclose(fn2(tape), 0.0, atol=tol)
+            assert np.allclose(fn3(tape), 0.0, atol=tol)
+
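A condensed sketch (not part of the patch) of the statevector VJP that test_statevector_ry checks, reduced to a single wire for readability; dy picks out the first amplitude of RY(x)|0> = (cos(x/2), sin(x/2)), and the values are illustrative:

```python
import math
import numpy as np
import pennylane as qml

dev = qml.device("lightning.qubit", wires=1)

# With a qml.state() measurement, vjp contracts dy with the Jacobian of
# the statevector itself; dy must match the statevector size and dtype.
dy = np.array([1.0, 0.0], dtype=np.complex128)
fn = dev.vjp([qml.state()], dy)

x = 0.6
with qml.tape.QuantumTape() as tape:
    qml.RY(x, wires=0)

# d/dx cos(x/2) = -sin(x/2) / 2 is the only trainable-parameter entry
assert np.allclose(fn(tape), -math.sin(x / 2) / 2)
```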
match="Lightning adjoint differentiation method does not" + ValueError, match="Size of the provided vector dy must be the same as the size of" ): - dev.vjp(tape, dy)(tape) + dev.vjp(tape.measurements, dy1) - with qml.tape.QuantumTape() as tape: - qml.RY(0.1, wires=(0,)) - qml.expval(qml.Hermitian(obs, wires=(0,)) @ qml.PauliZ(wires=1)) + dy2 = np.ones(4, dtype=dev.R_DTYPE) - with pytest.raises( - qml.QuantumFunctionError, match="Lightning adjoint differentiation method does not" - ): - dev.vjp(tape, dy)(tape) + with pytest.warns(UserWarning, match="The vjp method only works with complex-valued dy"): + dev.vjp(tape.measurements, dy2) + + def test_statevector_complex_circuit(self, dev, tol): + dy = np.array( + [[1.0, 0.0, 0.0, 0.0], [0.0, 0.0, 1.0, 0.0], [0.0, 1.0, 0.0, 0.0], [0.0, 0.0, 0.0, 1.0]] + ) + fn0 = dev.vjp([qml.state()], dy[0, :]) + fn1 = dev.vjp([qml.state()], dy[1, :]) + fn2 = dev.vjp([qml.state()], dy[2, :]) + fn3 = dev.vjp([qml.state()], dy[3, :]) + + params = [math.pi / 7, 6 * math.pi / 7] + + with qml.tape.QuantumTape() as tape: + qml.QubitStateVector(np.array([1.0] * 4) / 2, wires=range(2)) + qml.RY(params[0], wires=0) + qml.RZ(params[1], wires=1) + qml.CZ(wires=[0, 1]) + + tape.trainable_params = {2} # RZ + + psi_00_diff = ( + (math.cos(params[0] / 2) - math.sin(params[0] / 2)) + * (-math.sin(params[1] / 2) - 1j * math.cos(params[1] / 2)) + / 4 + ) + psi_01_diff = ( + (math.cos(params[0] / 2) + math.sin(params[0] / 2)) + * (-math.sin(params[1] / 2) - 1j * math.cos(params[1] / 2)) + / 4 + ) + psi_10_diff = ( + (math.cos(params[0] / 2) - math.sin(params[0] / 2)) + * (-math.sin(params[1] / 2) + 1j * math.cos(params[1] / 2)) + / 4 + ) + psi_11_diff = ( + -(math.cos(params[0] / 2) + math.sin(params[0] / 2)) + * (-math.sin(params[1] / 2) + 1j * math.cos(params[1] / 2)) + / 4 + ) + + assert np.allclose(fn0(tape), psi_00_diff, atol=tol) + assert np.allclose(fn1(tape), psi_01_diff, atol=tol) + assert np.allclose(fn2(tape), psi_10_diff, atol=tol) + assert np.allclose(fn3(tape), psi_11_diff, atol=tol) def test_no_trainable_parameters(self, dev): """A tape with no trainable parameters will simply return None""" @@ -296,10 +323,10 @@ def test_no_trainable_parameters(self, dev): tape.trainable_params = {} dy = np.array([1.0]) - fn = dev.vjp(tape, dy) + fn = dev.vjp(tape.measurements, dy) vjp = fn(tape) - assert vjp is None + assert len(vjp) == 0 def test_no_trainable_parameters_NEW(self, dev): """A tape with no trainable parameters will simply return None""" @@ -314,12 +341,12 @@ def test_no_trainable_parameters_NEW(self, dev): tape.trainable_params = {} dy = np.array([1.0]) - fn = dev.vjp(tape, dy) + fn = dev.vjp(tape.measurements, dy) vjp = fn(tape) - assert vjp is None + assert len(vjp) == 0 - def test_no_trainable_parameters_(self, dev): + def test_no_trainable_parameters(self, dev): """A tape with no trainable parameters will simply return None""" x = 0.4 @@ -331,10 +358,10 @@ def test_no_trainable_parameters_(self, dev): tape.trainable_params = {} dy = np.array([1.0]) - fn = dev.vjp(tape, dy) + fn = dev.vjp(tape.measurements, dy) vjp = fn(tape) - assert vjp is None + assert len(vjp) == 0 def test_zero_dy(self, dev): """A zero dy vector will return no tapes and a zero matrix""" @@ -350,7 +377,7 @@ def test_zero_dy(self, dev): tape.trainable_params = {0, 1} dy = np.array([0.0]) - fn = dev.vjp(tape, dy) + fn = dev.vjp(tape.measurements, dy) vjp = fn(tape) assert np.all(vjp == np.zeros([len(tape.trainable_params)])) @@ -370,7 +397,7 @@ def test_single_expectation_value(self, tol, 
dev): tape.trainable_params = {0, 1} dy = np.array([1.0]) - fn = dev.vjp(tape, dy) + fn = dev.vjp(tape.measurements, dy) vjp = fn(tape) expected = np.array([-np.sin(y) * np.sin(x), np.cos(y) * np.cos(x)]) @@ -392,7 +419,7 @@ def test_multiple_expectation_values(self, tol, dev): tape.trainable_params = {0, 1} dy = np.array([1.0, 2.0]) - fn = dev.vjp(tape, dy) + fn = dev.vjp(tape.measurements, dy) vjp = fn(tape) expected = np.array([-np.sin(x), 2 * np.cos(y)]) @@ -416,8 +443,10 @@ def test_prob_expectation_values(self, dev): tape.trainable_params = {0, 1} dy = np.array([1.0, 2.0, 3.0, 4.0, 5.0]) - with pytest.raises(qml.QuantumFunctionError, match="Adjoint differentiation method does"): - dev.vjp(tape, dy)(tape) + with pytest.raises( + qml.QuantumFunctionError, match="Adjoint differentiation method does not support" + ): + dev.vjp(tape.measurements, dy)(tape) class TestBatchVectorJacobianProduct: @@ -427,7 +456,7 @@ class TestBatchVectorJacobianProduct: def dev(self, request): return qml.device("lightning.qubit", wires=2, c_dtype=request.param) - def test_one_tape_no_trainable_parameters(self, dev): + def test_one_tape_no_trainable_parameters_1(self, dev): """A tape with no trainable parameters will simply return None""" with qml.tape.QuantumTape() as tape1: qml.RX(0.4, wires=0) @@ -449,10 +478,10 @@ def test_one_tape_no_trainable_parameters(self, dev): fn = dev.batch_vjp(tapes, dys) vjps = fn(tapes) - assert vjps[0] is None + assert len(vjps[0]) == 0 assert vjps[1] is not None - def test_all_tapes_no_trainable_parameters(self, dev): + def test_all_tapes_no_trainable_parameters_2(self, dev): """If all tapes have no trainable parameters all outputs will be None""" with qml.tape.QuantumTape() as tape1: qml.RX(0.4, wires=0) @@ -474,8 +503,8 @@ def test_all_tapes_no_trainable_parameters(self, dev): fn = dev.batch_vjp(tapes, dys) vjps = fn(tapes) - assert vjps[0] is None - assert vjps[1] is None + assert len(vjps[0]) == 0 + assert len(vjps[1]) == 0 def test_zero_dy(self, dev): """A zero dy vector will return no tapes and a zero matrix"""
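Finally, a sketch (not part of the patch) of the identity that test_multiple_measurements above relies on: contracting a list of expectation values with weights dy is equivalent to the adjoint Jacobian of the single Hamiltonian sum_i dy[i] * obs_i. The circuit and values are illustrative:

```python
import numpy as np
import pennylane as qml

dev = qml.device("lightning.qubit", wires=2)
dy = np.array([1.0, 2.0])
obs = [qml.PauliX(0), qml.PauliZ(1)]

with qml.tape.QuantumTape() as tape:
    qml.RX(0.4, wires=0)
    qml.RY(0.6, wires=1)
    for o in obs:
        qml.expval(o)

# VJP of the measurement list, weighted by dy ...
vjp = dev.vjp(tape.measurements, dy)(tape)

# ... equals the adjoint Jacobian of one Hamiltonian with coefficients dy
with qml.tape.QuantumTape() as tape2:
    qml.RX(0.4, wires=0)
    qml.RY(0.6, wires=1)
    qml.expval(qml.Hamiltonian(dy, obs))

assert np.allclose(vjp, dev.adjoint_jacobian(tape2))
```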