basf · JochenSiegWork · Jun 27, 2024 · Aug 13, 2024 · Aug 13, 2024 · Aug 13, 2024
diff --git a/molpipeline/experimental/explainability/explainer.py b/molpipeline/experimental/explainability/explainer.py
@@ -3,15 +3,19 @@
 from __future__ import annotations
 
 import abc
-from typing import Any, TypeAlias
+from typing import Any, Callable
 
 import numpy as np
 import numpy.typing as npt
 import pandas as pd
 import shap
 from scipy.sparse import issparse, spmatrix
 from sklearn.base import BaseEstimator
-from typing_extensions import override
+
+try:
+    from typing import override  # type: ignore[attr-defined]
+except ImportError:
+    from typing_extensions import override
 
 from molpipeline import Pipeline
 from molpipeline.abstract_pipeline_elements.core import OptionalMol
@@ -51,27 +55,9 @@ def _to_dense(
     return feature_matrix
 
 
-def _convert_to_array(value: Any) -> npt.NDArray[np.float64]:
-    """Convert a value to a numpy array.
-
-    Parameters
-    ----------
-    value : Any
-        The value to convert.
-
-    Returns
-    -------
-    npt.NDArray[np.float64]
-        The value as a numpy array.
-    """
-    if isinstance(value, np.ndarray):
-        return value
-    if np.isscalar(value):
-        return np.array([value])
-    raise ValueError("Value is not a scalar or numpy array.")
-
-
-def _get_prediction_function(pipeline: Pipeline | BaseEstimator) -> Any:
+def _get_prediction_function(
+    pipeline: Pipeline | BaseEstimator,
+) -> Callable[[npt.ArrayLike], npt.ArrayLike]:
     """Get the prediction function of a model.
 
     Parameters
@@ -166,18 +152,13 @@ def _convert_shap_feature_weights_to_atom_weights(
     return atom_weights
 
 
-ShapExplanation: TypeAlias = list[
-    SHAPFeatureExplanation | SHAPFeatureAndAtomExplanation
-]
-
-
 class AbstractSHAPExplainer(abc.ABC):  # pylint: disable=too-few-public-methods
     """Abstract class for SHAP explainer objects."""
 
     @abc.abstractmethod
     def explain(
         self, X: Any, **kwargs: Any  # pylint: disable=invalid-name,unused-argument
-    ) -> ShapExplanation:
+    ) -> list[SHAPFeatureExplanation | SHAPFeatureAndAtomExplanation]:
         """Explain the predictions for the input data.
 
         Parameters
@@ -199,6 +180,9 @@ class SHAPExplainerAdapter(
 ):  # pylint: disable=too-few-public-methods
     """Adapter for SHAP explainer wrappers for handling molecules and pipelines."""
 
+    # used for dynamically defining the return type of the explain method
+    return_element_type_: type[SHAPFeatureExplanation | SHAPFeatureAndAtomExplanation]
+
     def __init__(
         self,
         pipeline: Pipeline,
@@ -234,9 +218,6 @@ def __init__(
 
         # determine type of returned explanation
         featurization_element = self.featurization_subpipeline.steps[-1][1]  # type: ignore[union-attr]
-        self.return_element_type_: type[
-            SHAPFeatureExplanation | SHAPFeatureAndAtomExplanation
-        ]
         if isinstance(featurization_element, MolToMorganFP):
             self.return_element_type_ = SHAPFeatureAndAtomExplanation
         else:
@@ -271,7 +252,7 @@ def _prediction_is_valid(prediction: Any) -> bool:
     @override
     def explain(
         self, X: Any, **kwargs: Any  # pylint: disable=invalid-name,unused-argument
-    ) -> ShapExplanation:
+    ) -> list[SHAPFeatureExplanation | SHAPFeatureAndAtomExplanation]:
         """Explain the predictions for the input data.
 
         If the calculation of the SHAP values for an input sample fails, the explanation will be invalid.
@@ -291,7 +272,9 @@ def explain(
         """
         featurization_element = self.featurization_subpipeline.steps[-1][1]  # type: ignore[union-attr]
 
-        explanation_results: ShapExplanation = []
+        explanation_results: list[
+            SHAPFeatureExplanation | SHAPFeatureAndAtomExplanation
+        ] = []
         for input_sample in X:
 
             input_sample = [input_sample]
@@ -359,7 +342,7 @@ def explain(
             if issubclass(self.return_element_type_, BondExplanationMixin):
                 explanation_data["bond_weights"] = bond_weights
             if issubclass(self.return_element_type_, SHAPExplanationMixin):
-                explanation_data["expected_value"] = _convert_to_array(
+                explanation_data["expected_value"] = np.atleast_1d(
                     self.explainer.expected_value
                 )
 

diff --git a/molpipeline/experimental/explainability/visualization/heatmaps.py b/molpipeline/experimental/explainability/visualization/heatmaps.py
@@ -50,17 +50,19 @@ def __init__(
         self.y_lim = y_lim
         self.x_res = x_res
         self.y_res = y_res
+        self._dx = (max(self.x_lim) - min(self.x_lim)) / self.x_res
+        self._dy = (max(self.y_lim) - min(self.y_lim)) / self.y_res
         self.values = np.zeros((self.x_res, self.y_res))
 
     @property
     def dx(self) -> float:
         """Length of cell in x-direction."""
-        return (max(self.x_lim) - min(self.x_lim)) / self.x_res
+        return self._dx
 
     @property
     def dy(self) -> float:
         """Length of cell in y-direction."""
-        return (max(self.y_lim) - min(self.y_lim)) / self.y_res
+        return self._dy
 
     def grid_field_center(self, x_idx: int, y_idx: int) -> tuple[float, float]:
         """Center of cell specified by index along x and y.
@@ -149,6 +151,9 @@ def __init__(
         y_lim: Sequence[float],
         x_res: int,
         y_res: int,
+        function_list: (
+            list[Callable[[npt.NDArray[np.float64]], npt.NDArray[np.float64]]] | None
+        ) = None,
     ):
         """Initialize the ValueGrid with limits and resolution of the axes.
 
@@ -162,11 +167,16 @@ def __init__(
             Resolution (number of cells) along x-axis.
         y_res: int
             Resolution (number of cells) along y-axis.
+        function_list: list[Callable[[npt.NDArray[np.float64]], npt.NDArray[np.float64]]], optional
+            List of functions to be evaluated for each cell, by default None.
         """
         super().__init__(x_lim, y_lim, x_res, y_res)
-        self.function_list: list[
-            Callable[[npt.NDArray[np.float64]], npt.NDArray[np.float64]]
-        ] = []
+        if function_list is not None:
+            self.function_list: list[
+                Callable[[npt.NDArray[np.float64]], npt.NDArray[np.float64]]
+            ] = function_list
+        else:
+            self.function_list = []
         self.values = np.zeros((self.x_res, self.y_res))
 
     def add_function(

diff --git a/molpipeline/experimental/explainability/visualization/visualization.py b/molpipeline/experimental/explainability/visualization/visualization.py
@@ -14,6 +14,7 @@
 from matplotlib import colors
 from matplotlib import pyplot as plt
 from matplotlib.colors import Colormap
+from matplotlib.figure import Figure
 from PIL import Image
 from rdkit import Chem
 from rdkit.Chem import Draw
@@ -176,7 +177,7 @@ def _add_gaussians_for_bonds(
         bond_center = (a1_coords + a2_coords) / 2
 
         func = GaussFunctor2D(
-            center=bond_center,
+            center=bond_center,  # type: ignore
             std1=bond_width,
             std2=bond_length,
             scale=bond_weights[i],
@@ -265,6 +266,127 @@ def make_sum_of_gaussians_grid(
     return value_grid
 
 
+def _add_shap_present_absent_features_text(
+    fig: Figure,
+    explanation: SHAPFeatureAndAtomExplanation,
+    sum_present_shap: float,
+    sum_absent_shap: float,
+) -> None:
+    """Annotate the figure with the additive composition of the SHAP prediction.
+
+    Two rows of text are written onto the figure. The upper row holds the numbers:
+    the prediction followed by the expected value, the sum of the SHAP values for
+    present features and the sum of the SHAP values for absent features, each
+    split into a sign and an absolute value. The lower row holds the matching
+    labels. For the prediction and the expected value the last array element is
+    used.
+
+    The displayed composition reads::
+
+        prediction = expected value + features present + features absent
+
+    Parameters
+    ----------
+    fig: Figure
+        The figure the text is added to.
+    explanation: SHAPFeatureAndAtomExplanation
+        The SHAP explanation providing the prediction and the expected value.
+    sum_present_shap: float
+        The sum of the SHAP values for present features.
+    sum_absent_shap: float
+        The sum of the SHAP values for absent features.
+
+    Raises
+    ------
+    AssertionError
+        If the prediction or the expected value of the explanation is None.
+
+    Notes
+    -----
+    All text positions are hard-coded. They are interpreted in figure
+    coordinates, the default coordinate system of ``Figure.text``.
+    """
+    if explanation.prediction is None:
+        raise AssertionError("Prediction value is None.")
+    if explanation.expected_value is None:
+        raise AssertionError("Expected value is None.")
+
+    # one color per term of the sum; the prediction keeps the default text color
+    expected_color = "black"
+    present_color = "green"
+    absent_color = "darkorchid"
+
+    # only the last array element of prediction and expected value is displayed
+    prediction = explanation.prediction[-1]
+    expected = explanation.expected_value[-1]
+
+    # The expected value is the first term after the equals sign, so it carries
+    # no sign when it is non-negative. The two SHAP sums always carry one.
+    expected_sign = "" if expected >= 0 else "-"
+    present_sign = "+" if sum_present_shap >= 0 else "-"
+    absent_sign = "+" if sum_absent_shap >= 0 else "-"
+
+    # Upper row: the numbers as (text, color). Every entry occupies its own
+    # horizontal slot; signs and absolute values are placed separately.
+    number_row: list[tuple[str, str | None]] = [
+        (f"{prediction:.2f} =", None),
+        (f" {expected_sign}", expected_color),
+        (f" {abs(expected):.2f}", expected_color),
+        (f" {present_sign}", present_color),
+        (f" {abs(sum_present_shap):.2f}", present_color),
+        (f" {absent_sign}", absent_color),
+        (f" {abs(sum_absent_shap):.2f}", absent_color),
+    ]
+
+    # slot k of the upper row is centred at number_offset + k * number_delta
+    number_fontsize = 11
+    number_offset = 0.375
+    number_delta = 0.04
+    number_y = 0.18
+    for slot, (text, color) in enumerate(number_row, start=1):
+        # the color is only passed on when one is set for the entry
+        color_kwargs: dict[str, str] = {}
+        if color is not None:
+            color_kwargs["color"] = color
+        fig.text(
+            number_offset + slot * number_delta,
+            number_y,
+            text,
+            ha="center",
+            fontsize=number_fontsize,
+            **color_kwargs,
+        )
+
+    # Lower row: the labels as (slot, y, text, color). Slot 1 stays empty and the
+    # two-line labels are placed slightly lower than the single-line ones.
+    label_row: list[tuple[int, float, str, str | None]] = [
+        (0, 0.13, "prediction =", None),
+        (2, 0.12, "expected\nvalue", expected_color),
+        (3, 0.13, " + ", present_color),
+        (4, 0.12, "features\npresent", present_color),
+        (5, 0.13, " + ", absent_color),
+        (6, 0.12, "features\nabsent", absent_color),
+    ]
+
+    # slot k of the lower row is centred at label_offset + k * label_delta
+    label_fontsize = 10
+    label_offset = number_offset + 0.0165
+    label_delta = 0.05
+    for slot, label_y, text, color in label_row:
+        # the color is only passed on when one is set for the entry
+        label_kwargs: dict[str, str] = {}
+        if color is not None:
+            label_kwargs["color"] = color
+        fig.text(
+            label_offset + slot * label_delta,
+            label_y,
+            text,
+            ha="center",
+            fontsize=label_fontsize,
+            **label_kwargs,
+        )
+
+
 def _structure_heatmap(
     mol: RDKitMol,
     atom_weights: npt.NDArray[np.float64],
@@ -462,16 +584,9 @@ def structure_heatmap_shap(  # pylint: disable=too-many-branches, too-many-local
 
         fig.colorbar(im, ax=ax, orientation="vertical", fraction=0.015, pad=0.0)
 
-        # note: the prediction/expected value of the last array element is used
-        text = (
-            f"$Prediction = {explanation.prediction[-1]:.2f}$ ="
-            "\n"
-            "\n"
-            f"  $expected \ value={explanation.expected_value[-1]:.2f}$   +   "  # noqa: W605 # pylint: disable=anomalous-backslash-in-string
-            f"$features_{{present}}= {sum_present_shap:.2f}$   +   "
-            f"$features_{{absent}}={sum_absent_shap:.2f}$"
+        _add_shap_present_absent_features_text(
+            fig, explanation, sum_present_shap, sum_absent_shap
         )
-        fig.text(0.5, 0.18, text, ha="center")
 
         image = plt_to_pil(fig)
         # clear the figure and memory