Chunk Hamiltonian, PauliSentence, LinearCombination [sc-65680] #873

Merged: 37 commits merged on Sep 5, 2024
Changes from 16 commits
Commits (37)
e92539e
WIP
vincentmr Aug 26, 2024
891c165
WIP
vincentmr Aug 26, 2024
e285aaf
Simplify
vincentmr Aug 27, 2024
0556f34
Auto update version from '0.38.0-dev49' to '0.38.0-dev52'
ringo-but-quantum Aug 27, 2024
6e19fc8
Merge branch 'master' into chunck_hamiltonian
vincentmr Aug 27, 2024
352cfca
Apply suggestions from code review [skip ci]
vincentmr Aug 28, 2024
742f115
Fix split_obs [skip ci].
vincentmr Aug 28, 2024
db789b2
Remove obsolete _chunk_iterable [skip ci].
vincentmr Aug 28, 2024
099a641
Fix pylint warnings [skip ci].
vincentmr Aug 28, 2024
070fabd
Fix docstring [skip ci].
vincentmr Aug 28, 2024
21091b1
Remove obsolete unreachable branches. [skip ci]
vincentmr Aug 28, 2024
f000217
Simplify tests [skip ci]
vincentmr Aug 28, 2024
5bca9b9
trigger ci
vincentmr Aug 28, 2024
308dbb3
Remove property.
vincentmr Aug 28, 2024
f0d7f42
Update changelog
vincentmr Aug 28, 2024
3593d43
Update jac shape.
vincentmr Aug 28, 2024
e8d6682
Merge branch 'master' into chunck_hamiltonian
vincentmr Aug 29, 2024
a88f2aa
Auto update version from '0.38.0-dev52' to '0.38.0-dev53'
ringo-but-quantum Aug 29, 2024
c94d55e
Optimize applyInPlace hamiltonian.
vincentmr Aug 29, 2024
d5f434d
Remove obsolete function.
vincentmr Aug 29, 2024
cbdf0e9
Fix import
vincentmr Aug 29, 2024
c926b55
Fix TestSerializeObs
vincentmr Aug 29, 2024
24a33b5
Trigger CIs
AmintorDusko Aug 30, 2024
f4adcda
Remove LAPACK=ON from LGPU C++ tests.
vincentmr Aug 30, 2024
5b1ac6c
Merge branch 'master' into chunck_hamiltonian
vincentmr Aug 30, 2024
607ba06
Auto update version from '0.38.0-dev53' to '0.38.0-dev54'
ringo-but-quantum Aug 30, 2024
f77262c
trigger CIs
AmintorDusko Sep 3, 2024
7a56ee5
trigger ci
vincentmr Sep 3, 2024
5713ece
Merge remote-tracking branch 'origin/master' into chunck_hamiltonian
vincentmr Sep 3, 2024
47e2ef2
Revert pyproject
vincentmr Sep 3, 2024
5715a1c
Update version.
vincentmr Sep 3, 2024
3687d1e
Auto update version from '0.39.0-dev0' to '0.39.0-dev1'
ringo-but-quantum Sep 3, 2024
82af6ec
Merge remote-tracking branch 'origin/master' into chunck_hamiltonian
vincentmr Sep 5, 2024
3b8600e
Auto update version from '0.39.0-dev2' to '0.39.0-dev3'
ringo-but-quantum Sep 5, 2024
66ff9b4
Merge branch 'master' into chunck_hamiltonian
vincentmr Sep 5, 2024
1570216
Auto update version from '0.39.0-dev3' to '0.39.0-dev4'
ringo-but-quantum Sep 5, 2024
f4b8425
Import getenv only
vincentmr Sep 5, 2024
3 changes: 3 additions & 0 deletions .github/CHANGELOG.md
@@ -36,6 +36,9 @@

### Improvements

* Smarter defaults for the `split_obs` argument in the serializer. The serializer now splits linear combinations into a fixed number of chunks instead of into all of their individual terms (illustrated in the sketch below).
[(#873)](https://github.com/PennyLaneAI/pennylane-lightning/pull/873/)

* Updated calls of ``size_t`` to ``std::size_t`` everywhere.
[(#816)](https://github.com/PennyLaneAI/pennylane-lightning/pull/816/)

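To make the new behaviour concrete, here is a standalone sketch that mirrors the `_chunk_iterable` and `_chunk_ham_terms` helpers added in `pennylane_lightning/core/_serialize.py` below; it is illustrative only, not the exact library code.

```python
from itertools import islice


def _chunk_iterable(iteration, num_chunks):
    # Lazily yield tuples of at most `num_chunks` elements
    # (pattern from https://stackoverflow.com/a/22045226).
    iteration = iter(iteration)
    return iter(lambda: tuple(islice(iteration, num_chunks)), ())


def _chunk_ham_terms(coeffs, ops, split_num=1):
    # Split a (coeffs, ops) pair into `split_num` roughly equal sub-Hamiltonians.
    num_terms = len(coeffs)
    step_size = num_terms // split_num + bool(num_terms % split_num)
    return list(_chunk_iterable(coeffs, step_size)), list(_chunk_iterable(ops, step_size))


coeffs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]
ops = ["Z0", "Z1", "X0@X1", "Y0", "Y1", "Z0@Z1", "X0"]
c_chunks, o_chunks = _chunk_ham_terms(coeffs, ops, split_num=3)
print(c_chunks)  # [(0.1, 0.2, 0.3), (0.4, 0.5, 0.6), (0.7,)]
print(o_chunks)  # [('Z0', 'Z1', 'X0@X1'), ('Y0', 'Y1', 'Z0@Z1'), ('X0',)]
```

With an integer `split_obs` (here 3), a seven-term linear combination is serialized as three sub-Hamiltonians rather than as seven single-term observables, which is what `split_obs=True` used to produce.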
49 changes: 38 additions & 11 deletions pennylane_lightning/core/_serialize.py
@@ -14,6 +14,7 @@
r"""
Helper functions for serializing quantum tapes.
"""
from itertools import islice
from typing import List, Sequence, Tuple

import numpy as np
@@ -47,14 +48,20 @@
}


def _chunk_iterable(iteration, num_chunks):
"""Lazy-evaluated chunking of given iterable from https://stackoverflow.com/a/22045226"""
iteration = iter(iteration)
return iter(lambda: tuple(islice(iteration, num_chunks)), ())


class QuantumScriptSerializer:
"""Serializer class for `pennylane.tape.QuantumScript` data.

Args:
device_name: device shortname.
use_csingle (bool): whether to use np.complex64 instead of np.complex128
use_mpi (bool, optional): If using MPI to accelerate calculation. Defaults to False.
split_obs (bool, optional): If splitting the observables in a list. Defaults to False.
split_obs (Union[bool, int], optional): Whether to split the observables into a list, or the number of chunks to split them into. Defaults to False.

"""

@@ -214,17 +221,34 @@
obs = observable.obs if isinstance(observable, Tensor) else observable.operands
return self.tensor_obs([self._ob(o, wires_map) for o in obs])

def _chunk_ham_terms(self, coeffs, ops, split_num: int = 1) -> List:
"Create split_num sub-Hamiltonians from a single high term-count Hamiltonian"
num_terms = len(coeffs)
step_size = num_terms // split_num + bool(num_terms % split_num)
c_coeffs = list(_chunk_iterable(coeffs, step_size))
c_ops = list(_chunk_iterable(ops, step_size))
return c_coeffs, c_ops

def _hamiltonian(self, observable, wires_map: dict = None):
coeffs, ops = observable.terms()
coeffs = np.array(unwrap(coeffs)).astype(self.rtype)
if self.split_obs:
ops_l = []
for t in ops:
term_cpp = self._ob(t, wires_map)
if isinstance(term_cpp, Sequence):
ops_l.extend(term_cpp)
else:
ops_l.append(term_cpp)
c, o = self._chunk_ham_terms(coeffs, ops_l, self.split_obs)
hams = [self.hamiltonian_obs(c_coeffs, c_obs) for (c_coeffs, c_obs) in zip(c, o)]
return hams

terms = [self._ob(t, wires_map) for t in ops]
# TODO: This is in case `_hamiltonian` is called recursively which would cause a list
# to be passed where `_ob` expects an observable.
terms = [t[0] if isinstance(t, Sequence) and len(t) == 1 else t for t in terms]

if self.split_obs:
return [self.hamiltonian_obs([c], [t]) for (c, t) in zip(coeffs, terms)]

return self.hamiltonian_obs(coeffs, terms)

def _sparse_hamiltonian(self, observable, wires_map: dict = None):
@@ -282,11 +306,14 @@
terms = [self._pauli_word(pw, wires_map) for pw in pwords]
coeffs = np.array(coeffs).astype(self.rtype)

if self.split_obs:
c, o = self._chunk_ham_terms(coeffs, terms, self.split_obs)
psentences = [self.hamiltonian_obs(c_coeffs, c_obs) for (c_coeffs, c_obs) in zip(c, o)]
return psentences

if len(terms) == 1 and coeffs[0] == 1.0:
return terms[0]

if self.split_obs:
return [self.hamiltonian_obs([c], [t]) for (c, t) in zip(coeffs, terms)]
return self.hamiltonian_obs(coeffs, terms)

# pylint: disable=protected-access, too-many-return-statements
@@ -326,17 +353,17 @@
"""

serialized_obs = []
offset_indices = [0]
obs_indices = []

for observable in tape.observables:
for i, observable in enumerate(tape.observables):
ser_ob = self._ob(observable, wires_map)
if isinstance(ser_ob, list):
serialized_obs.extend(ser_ob)
offset_indices.append(offset_indices[-1] + len(ser_ob))
obs_indices.extend([i] * len(ser_ob))
else:
serialized_obs.append(ser_ob)
offset_indices.append(offset_indices[-1] + 1)
return serialized_obs, offset_indices
obs_indices.append(i)
return serialized_obs, obs_indices

def serialize_ops(self, tape: QuantumTape, wires_map: dict = None) -> Tuple[
List[List[str]],
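A hedged usage sketch of the updated serializer interface follows. It assumes a local `pennylane-lightning` build with compiled bindings; the device name, circuit, and wire map are placeholder choices for illustration, not taken from this PR.

```python
import pennylane as qml

from pennylane_lightning.core._serialize import QuantumScriptSerializer

# A four-term Hamiltonian and a single-term observable on two wires.
ham = qml.Hamiltonian(
    [0.3, 0.5, 0.2, 0.4],
    [qml.PauliZ(0), qml.PauliX(1), qml.PauliY(0), qml.PauliZ(1)],
)
tape = qml.tape.QuantumScript(
    [qml.RX(0.1, wires=0)], [qml.expval(ham), qml.expval(qml.PauliZ(0))]
)

# An integer split_obs asks the serializer to split the Hamiltonian into that
# many chunks instead of into its individual terms.
serializer = QuantumScriptSerializer(
    "lightning.qubit", use_csingle=False, use_mpi=False, split_obs=2
)
wires_map = {0: 0, 1: 1}
serialized_obs, obs_indices = serializer.serialize_observables(tape, wires_map)
# obs_indices maps each serialized (sub-)observable back to its tape observable,
# e.g. [0, 0, 1]: two chunks of `ham` followed by the PauliZ(0) expectation.
```

The second return value replaces the former `offset_indices`, as shown in `lightning_base.py` below.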
2 changes: 1 addition & 1 deletion pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
Version number (major.minor.patch[-label])
"""

__version__ = "0.38.0-dev51"
__version__ = "0.38.0-dev52"
21 changes: 10 additions & 11 deletions pennylane_lightning/core/lightning_base.py
@@ -16,8 +16,8 @@
This module contains the base class for all PennyLane Lightning simulator devices,
and interfaces with C++ for improved performance.
"""
from itertools import islice, product
from typing import List
from itertools import product
from typing import List, Union

import numpy as np
import pennylane as qml
@@ -31,12 +31,6 @@
from ._version import __version__


def _chunk_iterable(iteration, num_chunks):
"Lazy-evaluated chunking of given iterable from https://stackoverflow.com/a/22045226"
iteration = iter(iteration)
return iter(lambda: tuple(islice(iteration, num_chunks)), ())


class LightningBase(QubitDevice):
"""PennyLane Lightning Base device.

@@ -262,11 +256,16 @@ def _get_basis_state_index(self, state, wires):

# pylint: disable=too-many-function-args, assignment-from-no-return, too-many-arguments
def _process_jacobian_tape(
self, tape, starting_state, use_device_state, use_mpi: bool = False, split_obs: bool = False
self,
tape,
starting_state,
use_device_state,
use_mpi: bool = False,
split_obs: Union[bool, int] = False,
):
state_vector = self._init_process_jacobian_tape(tape, starting_state, use_device_state)

obs_serialized, obs_idx_offsets = QuantumScriptSerializer(
obs_serialized, obs_indices = QuantumScriptSerializer(
self.short_name, self.use_csingle, use_mpi, split_obs
).serialize_observables(tape, self.wire_map)

@@ -309,7 +308,7 @@ def _process_jacobian_tape(
"tp_shift": tp_shift,
"record_tp_rows": record_tp_rows,
"all_params": all_params,
"obs_idx_offsets": obs_idx_offsets,
"obs_indices": obs_indices,
}

@staticmethod
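The switch from cumulative `offset_indices` to per-entry `obs_indices` in `_process_jacobian_tape` can be summarised with a small standalone example (the values are illustrative, not taken from a real tape):

```python
# Suppose tape.observables = [H, Z] and the serializer splits H into three chunks.

# Old bookkeeping: cumulative offsets into the flat list of serialized observables;
# observable i occupied slots offset_indices[i]:offset_indices[i + 1].
offset_indices = [0, 3, 4]

# New bookkeeping: one entry per serialized (sub-)observable, pointing back to
# the tape observable it came from.
obs_indices = [0, 0, 0, 1]

# Rows of the split Jacobian belonging to observable 0 are now found by a simple
# membership test instead of slicing between consecutive offsets.
rows_for_obs0 = [k for k, idx in enumerate(obs_indices) if idx == 0]
print(rows_for_obs0)  # [0, 1, 2]
```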
80 changes: 24 additions & 56 deletions pennylane_lightning/lightning_gpu/lightning_gpu.py
@@ -30,6 +30,7 @@
from pennylane.measurements import Expectation, State
from pennylane.ops.op_math import Adjoint
from pennylane.wires import Wires
from scipy.sparse import csr_matrix

from pennylane_lightning.core._serialize import QuantumScriptSerializer, global_phase_diagonal
from pennylane_lightning.core._version import __version__
@@ -633,8 +634,12 @@
# Check adjoint diff support
self._check_adjdiff_supported_operations(tape.operations)

if self._mpi:
split_obs = False # with MPI, batched mode computes the Jacobian one observable at a time, so there is no point in splitting linear combinations
else:
split_obs = self._dp.getTotalDevices() if self._batch_obs else False
processed_data = self._process_jacobian_tape(
tape, starting_state, use_device_state, self._mpi, self._batch_obs
tape, starting_state, use_device_state, self._mpi, split_obs
)

if not processed_data: # training_params is empty
@@ -653,68 +658,31 @@
adjoint_jacobian = _adj_dtype(self.use_csingle, self._mpi)()

if self._batch_obs: # Batching of Measurements
if not self._mpi: # Single-node path, controlled batching over available GPUs
num_obs = len(processed_data["obs_serialized"])
batch_size = (
num_obs
if isinstance(self._batch_obs, bool)
else self._batch_obs * self._dp.getTotalDevices()
)
jac = []
for chunk in range(0, num_obs, batch_size):
obs_chunk = processed_data["obs_serialized"][chunk : chunk + batch_size]
jac_chunk = adjoint_jacobian.batched(
self._gpu_state,
obs_chunk,
processed_data["ops_serialized"],
trainable_params,
)
jac.extend(jac_chunk)
else: # MPI path, restrict memory per known GPUs
jac = adjoint_jacobian.batched(
self._gpu_state,
processed_data["obs_serialized"],
processed_data["ops_serialized"],
trainable_params,
)

jac = adjoint_jacobian.batched(
self._gpu_state,
processed_data["obs_serialized"],
processed_data["ops_serialized"],
trainable_params,
)
else:
jac = adjoint_jacobian(
self._gpu_state,
processed_data["obs_serialized"],
processed_data["ops_serialized"],
trainable_params,
)

jac = np.array(jac) # only for parameters differentiable with the adjoint method
jac = jac.reshape(-1, len(trainable_params))
jac_r = np.zeros((len(tape.observables), processed_data["all_params"]))
if not self._batch_obs:
jac_r[:, processed_data["record_tp_rows"]] = jac
else:
# Reduce over decomposed expval(H), if required.
for idx in range(len(processed_data["obs_idx_offsets"][0:-1])):
if (
processed_data["obs_idx_offsets"][idx + 1]
- processed_data["obs_idx_offsets"][idx]
) > 1:
jac_r[idx, :] = np.sum(
jac[
processed_data["obs_idx_offsets"][idx] : processed_data[
"obs_idx_offsets"
][idx + 1],
:,
],
axis=0,
)
else:
jac_r[idx, :] = jac[
processed_data["obs_idx_offsets"][idx] : processed_data["obs_idx_offsets"][
idx + 1
],
:,
]

jac = np.array(jac)
has_shape0 = bool(len(jac))

num_obs = len(np.unique(processed_data["obs_indices"]))
rows = processed_data["obs_indices"]
cols = np.arange(len(rows), dtype=int)
data = np.ones(len(rows))
red_mat = csr_matrix((data, (rows, cols)), shape=(num_obs, len(rows)))
jac = red_mat @ jac.reshape((len(rows), -1))
jac = jac.reshape(-1, len(trainable_params)) if has_shape0 else jac
jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))
jac_r[:, processed_data["record_tp_rows"]] = jac
return self._adjoint_jacobian_processing(jac_r)

# pylint: disable=inconsistent-return-statements, line-too-long, missing-function-docstring
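The `csr_matrix` contraction introduced above replaces the explicit offset-based summation loop. The sketch below reproduces the same reduction on toy numbers (shapes and values are illustrative only):

```python
import numpy as np
from scipy.sparse import csr_matrix

# Four serialized (sub-)observables mapping back to two tape observables, and a
# fake adjoint Jacobian with 3 trainable parameters, flattened as the C++ kernel
# returns it.
obs_indices = [0, 0, 0, 1]
jac = np.arange(12, dtype=float)

num_obs = len(np.unique(obs_indices))  # 2
rows = obs_indices
cols = np.arange(len(rows), dtype=int)
data = np.ones(len(rows))
red_mat = csr_matrix((data, (rows, cols)), shape=(num_obs, len(rows)))

# Summing the rows that share a tape-observable index recombines the chunks of a
# split Hamiltonian into a single Jacobian row per observable.
reduced = red_mat @ jac.reshape((len(rows), -1))
print(reduced.shape)  # (2, 3)
print(reduced[0])     # [ 9. 12. 15.], the sum of the first three sub-rows
```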
36 changes: 9 additions & 27 deletions pennylane_lightning/lightning_kokkos/lightning_kokkos.py
@@ -19,7 +19,6 @@

import os
import sys
from os import getenv
from pathlib import Path
from typing import List
from warnings import warn
@@ -34,7 +33,7 @@

from pennylane_lightning.core._serialize import QuantumScriptSerializer, global_phase_diagonal
from pennylane_lightning.core._version import __version__
from pennylane_lightning.core.lightning_base import LightningBase, _chunk_iterable
from pennylane_lightning.core.lightning_base import LightningBase

try:
# pylint: disable=import-error, no-name-in-module
@@ -194,7 +193,7 @@
batch_obs=False,
kokkos_args=None,
): # pylint: disable=unused-argument, too-many-arguments
super().__init__(wires, shots=shots, c_dtype=c_dtype)
super().__init__(wires, shots=shots, c_dtype=c_dtype, batch_obs=batch_obs)

if kokkos_args is None:
self._kokkos_state = _kokkos_dtype(c_dtype)(self.num_wires)
@@ -717,33 +716,16 @@

trainable_params = processed_data["tp_shift"]

# If requested batching over observables, chunk into OMP_NUM_THREADS sized chunks.
# This will allow use of Lightning with adjoint for large-qubit numbers AND large
# numbers of observables, enabling choice between compute time and memory use.
requested_threads = int(getenv("OMP_NUM_THREADS", "1"))

adjoint_jacobian = AdjointJacobianC64() if self.use_csingle else AdjointJacobianC128()

if self._batch_obs and requested_threads > 1: # pragma: no cover
obs_partitions = _chunk_iterable(processed_data["obs_serialized"], requested_threads)
jac = []
for obs_chunk in obs_partitions:
jac_local = adjoint_jacobian(
processed_data["state_vector"],
obs_chunk,
processed_data["ops_serialized"],
trainable_params,
)
jac.extend(jac_local)
else:
jac = adjoint_jacobian(
processed_data["state_vector"],
processed_data["obs_serialized"],
processed_data["ops_serialized"],
trainable_params,
)
jac = adjoint_jacobian(
processed_data["state_vector"],
processed_data["obs_serialized"],
processed_data["ops_serialized"],
trainable_params,
)
jac = np.array(jac)
jac = jac.reshape(-1, len(trainable_params))
jac = jac.reshape(-1, len(trainable_params)) if len(jac) else jac
jac_r = np.zeros((jac.shape[0], processed_data["all_params"]))
jac_r[:, processed_data["record_tp_rows"]] = jac
if hasattr(qml, "active_return"): # pragma: no cover