From 72d546068c44422622cd6f17e7e40e3861a9b0c8 Mon Sep 17 00:00:00 2001
From: JochenSiegWork <135010976+JochenSiegWork@users.noreply.github.com>
Date: Fri, 5 Jul 2024 11:30:46 +0200
Subject: [PATCH] utils/mol2concatinated_vector: changes for xai (#33)

* utils/mol2concatinated_vector: changes for xai

    - Add helper class SubpipelineExtractor to get certain
      parts of an existing Pipeline.
    - Add property to mol2concatinated_vector to extract
      total number of features.

* utils: code review subpipeline
---
 .../mol2any/mol2concatinated_vector.py        |  15 +
 molpipeline/utils/subpipeline.py              | 387 ++++++++++++++++++
 .../test_mol2any/test_mol2concatenated.py     |  48 ++-
 tests/test_utils/test_subpipeline.py          | 336 +++++++++++++++
 4 files changed, 780 insertions(+), 6 deletions(-)
 create mode 100644 molpipeline/utils/subpipeline.py
 create mode 100644 tests/test_utils/test_subpipeline.py

diff --git a/molpipeline/mol2any/mol2concatinated_vector.py b/molpipeline/mol2any/mol2concatinated_vector.py
index bf85256e..490baadc 100644
--- a/molpipeline/mol2any/mol2concatinated_vector.py
+++ b/molpipeline/mol2any/mol2concatinated_vector.py
@@ -72,6 +72,21 @@ def element_list(self) -> list[tuple[str, MolToAnyPipelineElement]]:
         """Get pipeline elements."""
         return self._element_list
 
+    @property
+    def n_features(self) -> int:
+        """Return the summed n_features (or n_bits) of all contained elements."""
+        feature_count = 0
+        for _, element in self._element_list:
+            if hasattr(element, "n_features"):
+                feature_count += element.n_features
+            elif hasattr(element, "n_bits"):
+                feature_count += element.n_bits
+            else:
+                raise AssertionError(
+                    f"Element {element} does not have n_features or n_bits."
+                )
+        return feature_count
+
     def get_params(self, deep: bool = True) -> dict[str, Any]:
         """Return all parameters defining the object.
 
diff --git a/molpipeline/utils/subpipeline.py b/molpipeline/utils/subpipeline.py
new file mode 100644
index 00000000..a55aa1c4
--- /dev/null
+++ b/molpipeline/utils/subpipeline.py
@@ -0,0 +1,387 @@
+"""Helper functions to extract subpipelines from a pipeline."""
+
+from __future__ import annotations
+
+from typing import Any, Callable
+
+from sklearn.base import BaseEstimator
+
+from molpipeline import FilterReinserter, Pipeline, PostPredictionWrapper
+from molpipeline.abstract_pipeline_elements.core import (
+    AnyToMolPipelineElement,
+    MolToAnyPipelineElement,
+)
+
+
+def _get_molecule_reading_position_from_pipeline(pipeline: Pipeline) -> int | None:
+    """Heuristic to select the position of the central molecule reading element in a pipeline.
+
+    This function searches for the last AnyToMolPipelineElement in the pipeline. We select the
+    last AnyToMolPipelineElement in the pipeline because we have some standardization pipelines
+    that write the molecule to SMILES and read it back in to ensure it is readable.
+
+    Parameters
+    ----------
+    pipeline: Pipeline
+        The pipeline to search for the molecule reading element.
+
+    Returns
+    -------
+    int | None
+        The position of the last AnyToMolPipelineElement or None if the pipeline contains none.
+    """
+    for i, step in enumerate(reversed(pipeline.steps)):
+        if isinstance(step[1], AnyToMolPipelineElement):
+            return len(pipeline.steps) - i - 1
+    return None
+
+
+def _get_model_element_position_from_pipeline(pipeline: Pipeline) -> int | None:
+    """Heuristic to select the position of the machine learning estimator model in a pipeline.
+
+    The returned element should be the element returning the pipeline's predictions. So, the
+    pipeline up to this element can be executed to obtain the predictions.
+
+    Parameters
+    ----------
+    pipeline: Pipeline
+        The pipeline to search for the model element.
+
+    Returns
+    -------
+    int | None
+        The position of the last BaseEstimator that is not a PostPredictionWrapper, or None.
+    """
+    for i, step in enumerate(reversed(pipeline.steps)):
+        if isinstance(step[1], BaseEstimator):
+            if isinstance(step[1], PostPredictionWrapper):
+                # PostPredictionWrappers are never selected as the model element.
+                continue
+            return len(pipeline.steps) - i - 1
+    return None
+
+
+def _get_featurization_element_position_from_pipeline(pipeline: Pipeline) -> int | None:
+    """Heuristic to select the position of the featurization element in a pipeline.
+
+    Parameters
+    ----------
+    pipeline: Pipeline
+        The pipeline to search for the featurization element.
+
+    Returns
+    -------
+    int | None
+        The position of the last MolToAnyPipelineElement or None if the pipeline contains none.
+
+    """
+    for i, step in enumerate(reversed(pipeline.steps)):
+        if isinstance(step[1], MolToAnyPipelineElement):
+            return len(pipeline.steps) - i - 1
+    return None
+
+
+class SubpipelineExtractor:
+    """A helper class to extract parts of a pipeline."""
+
+    def __init__(self, pipeline: Pipeline) -> None:
+        """Initialize the SubpipelineExtractor.
+
+        Parameters
+        ----------
+        pipeline : Pipeline
+            The pipeline to extract from; it is stored by reference and not copied.
+        """
+        self.pipeline = pipeline
+
+    def _get_index_of_element_by_id(self, element: Any) -> int | None:
+        """Get the index of an element by identity (``is`` comparison, not equality).
+
+        Parameters
+        ----------
+        element : Any
+            The element object to look up among the pipeline steps.
+
+        Returns
+        -------
+        int | None
+            The index of the first step holding this object or None if it was not found.
+        """
+        for i, (_, pipeline_element) in enumerate(self.pipeline.steps):
+            if pipeline_element is element:
+                return i
+        return None
+
+    def _get_index_of_element_by_name(self, element_name: str) -> int | None:
+        """Get the index of an element by its step name.
+
+        Parameters
+        ----------
+        element_name : str
+            The step name to look up among the pipeline steps.
+
+        Returns
+        -------
+        int | None
+            The index of the first step with this name or None if no step has this name.
+        """
+        for i, (name, _) in enumerate(self.pipeline.steps):
+            if name == element_name:
+                return i
+        return None
+
+    def _extract_single_element_index(
+        self,
+        element_name: str | None,
+        get_index_function: Callable[[Pipeline], int | None],
+    ) -> int | None:
+        """Extract the index of a single element from the pipeline.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The step name to look up. If None, get_index_function is used instead.
+        get_index_function : Callable[[Pipeline], int | None]
+            Fallback returning the element's index; only called if element_name is None.
+
+        Returns
+        -------
+        int | None
+            The index of the extracted element or None if the element was not found.
+        """
+        if element_name is not None:
+            return self._get_index_of_element_by_name(element_name)
+        return get_index_function(self.pipeline)
+
+    def _extract_single_element(
+        self,
+        element_name: str | None,
+        get_index_function: Callable[[Pipeline], int | None],
+    ) -> Any | None:
+        """Extract a single element from the pipeline.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The step name to look up. If None, get_index_function is used instead.
+        get_index_function : Callable[[Pipeline], int | None]
+            Fallback returning the element's index; only called if element_name is None.
+
+        Returns
+        -------
+        Any | None
+            The extracted element or None if get_index_function did not find one.
+        """
+        if element_name is not None:
+            # a name takes precedence; NOTE(review): an unknown name presumably raises here
+            return self.pipeline.named_steps[element_name]
+        element_index = self._extract_single_element_index(None, get_index_function)
+        if element_index is None:
+            return None
+        return self.pipeline.steps[element_index][1]
+
+    def get_molecule_reader_element(
+        self, element_name: str | None = None
+    ) -> AnyToMolPipelineElement | None:
+        """Get the molecule reader element from the pipeline, e.g. a SmilesToMol element.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The name of the element to extract. If None, the reader is selected heuristically.
+
+        Returns
+        -------
+        AnyToMolPipelineElement | None
+            The extracted molecule reader element or None if the heuristic found no element.
+        """
+        return self._extract_single_element(
+            element_name,
+            _get_molecule_reading_position_from_pipeline,
+        )
+
+    def get_featurization_element(
+        self, element_name: str | None = None
+    ) -> BaseEstimator | None:
+        """Get the featurization element from the pipeline, e.g., a MolToMorganFP element.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The name of the element to extract. If None, it is selected heuristically.
+
+        Returns
+        -------
+        BaseEstimator | None
+            The extracted featurization element or None if the heuristic found no element.
+        """
+        return self._extract_single_element(
+            element_name, _get_featurization_element_position_from_pipeline
+        )
+
+    def get_model_element(
+        self, element_name: str | None = None
+    ) -> BaseEstimator | None:
+        """Get the machine learning model element from the pipeline, e.g. a RandomForestClassifier.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The name of the element to extract. If None, the model is selected heuristically.
+
+        Returns
+        -------
+        BaseEstimator | None
+            The extracted model element or None if the heuristic found no element.
+        """
+        return self._extract_single_element(
+            element_name, _get_model_element_position_from_pipeline
+        )
+
+    def _get_subpipline_from_start(
+        self,
+        element_name: str | None,
+        start_get_index_function: Callable[[Pipeline], int | None],
+    ) -> Pipeline | None:
+        """Get a subpipeline from the first element of the original pipeline up to a specific element.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The step name of the last element. If None, start_get_index_function is used.
+        start_get_index_function : Callable[[Pipeline], int | None]
+            A function that returns the index of the subpipeline's last element.
+
+        Returns
+        -------
+        Pipeline | None
+            A new Pipeline ending with the found element (inclusive) or None if it was not found.
+        """
+        element_index = self._extract_single_element_index(
+            element_name, start_get_index_function
+        )
+        if element_index is None:
+            return None
+        return Pipeline(steps=self.pipeline.steps[: element_index + 1])
+
+    def get_molecule_reader_subpipeline(
+        self, element_name: str | None = None
+    ) -> Pipeline | None:
+        """Get a subpipeline from the first step up to and including the molecule reading element.
+
+        Note that standardization steps, like salt removal, are not guaranteed to be included.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The name of the last element in the subpipeline. If None, it is found heuristically.
+
+        Returns
+        -------
+        Pipeline | None
+            The extracted subpipeline or None if the corresponding last element was not found.
+        """
+        return self._get_subpipline_from_start(
+            element_name, _get_molecule_reading_position_from_pipeline
+        )
+
+    def get_featurization_subpipeline(
+        self, element_name: str | None = None
+    ) -> Pipeline | None:
+        """Get a subpipeline from the first step up to and including the featurization element.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The name of the last element in the subpipeline. If None, it is found heuristically.
+
+        Returns
+        -------
+        Pipeline | None
+            The extracted subpipeline or None if the corresponding last element was not found.
+        """
+        return self._get_subpipline_from_start(
+            element_name, _get_featurization_element_position_from_pipeline
+        )
+
+    def get_model_subpipeline(self, element_name: str | None = None) -> Pipeline | None:
+        """Get a subpipeline from the first step up to and including the model element.
+
+        Parameters
+        ----------
+        element_name : str | None
+            The name of the last element in the subpipeline. If None, it is found heuristically.
+
+        Returns
+        -------
+        Pipeline | None
+            The extracted subpipeline or None if the corresponding last element was not found.
+        """
+        return self._get_subpipline_from_start(
+            element_name, _get_model_element_position_from_pipeline
+        )
+
+    def get_subpipeline(
+        self,
+        first_element: Any,
+        second_element: Any,
+        first_offset: int = 0,
+        second_offset: int = 0,
+    ) -> Pipeline | None:
+        """Get a subpipeline between two elements (both end points included).
+
+        The elements are located by object identity (``is``), not by name or equality.
+        If the elements are not found or the second element is before the first element, a ValueError is raised.
+
+        Parameters
+        ----------
+        first_element : Any
+            The element object marking the start of the subpipeline.
+        second_element : Any
+            The element object marking the end of the subpipeline.
+        first_offset : int
+            The offset added to the index of the first element.
+        second_offset : int
+            The offset added to the index of the second element.
+
+        Returns
+        -------
+        Pipeline | None
+            The extracted subpipeline. None is never returned; failures raise a ValueError.
+        """
+        first_element_index = self._get_index_of_element_by_id(first_element)
+        if first_element_index is None:
+            raise ValueError(f"Element {first_element} not found in pipeline.")
+        second_element_index = self._get_index_of_element_by_id(second_element)
+        if second_element_index is None:
+            raise ValueError(f"Element {second_element} not found in pipeline.")
+
+        # apply user-defined offsets; NOTE(review): shifted indices are not bounds-checked
+        first_element_index += first_offset
+        second_element_index += second_offset
+
+        if second_element_index < first_element_index:
+            raise ValueError(
+                f"Element {second_element} must be after element {first_element}."
+            )
+        return Pipeline(
+            steps=self.pipeline.steps[first_element_index : second_element_index + 1]
+        )
+
+    def get_all_filter_reinserter_fill_values(self) -> list[Any]:
+        """Get the fill values of all (possibly wrapped) FilterReinserter elements in the pipeline.
+
+        Returns
+        -------
+        list[Any]
+            The distinct fill values, collected via a set, so their order is arbitrary.
+        """
+        fill_values = set()
+        for _, step in self.pipeline.steps:
+            if isinstance(step, FilterReinserter):
+                fill_values.add(step.fill_value)
+            if isinstance(step, PostPredictionWrapper) and isinstance(
+                step.wrapped_estimator, FilterReinserter
+            ):
+                fill_values.add(step.wrapped_estimator.fill_value)
+        return list(fill_values)
diff --git a/tests/test_elements/test_mol2any/test_mol2concatenated.py b/tests/test_elements/test_mol2any/test_mol2concatenated.py
index 420794ba..5bb57742 100644
--- a/tests/test_elements/test_mol2any/test_mol2concatenated.py
+++ b/tests/test_elements/test_mol2any/test_mol2concatenated.py
@@ -14,6 +14,7 @@
 from molpipeline.mol2any import (
     MolToConcatenatedVector,
     MolToMorganFP,
+    MolToNetCharge,
     MolToRDKitPhysChem,
 )
 from tests.utils.fingerprints import fingerprints_to_numpy
@@ -23,12 +24,7 @@ class TestConcatenatedFingerprint(unittest.TestCase):
     """Unittest for MolToConcatenatedVector, which calculates concatenated fingerprints."""
 
     def test_generation(self) -> None:
-        """Test if the feature concatenation works as expected.
-
-        Returns
-        -------
-        None
-        """
+        """Test if the feature concatenation works as expected."""
         fingerprint_morgan_output_types: tuple[Any, ...] = get_args(
             Literal[
                 "sparse",
@@ -95,6 +91,46 @@ def test_generation(self) -> None:
             self.assertTrue(np.allclose(output, output2))
             self.assertTrue(np.allclose(output, output3))
 
+    def test_n_features(self) -> None:
+        """Test that n_features is the sum of the feature counts of the concatenated elements."""
+
+        physchem_elem = (
+            "RDKitPhysChem",
+            MolToRDKitPhysChem(),
+        )
+        morgan_elem = (
+            "MorganFP",
+            MolToMorganFP(n_bits=16),
+        )
+        net_charge_elem = ("NetCharge", MolToNetCharge())
+
+        self.assertEqual(
+            MolToConcatenatedVector([physchem_elem]).n_features,
+            physchem_elem[1].n_features,
+        )
+        self.assertEqual(
+            MolToConcatenatedVector([morgan_elem]).n_features,
+            16,
+        )
+        self.assertEqual(
+            MolToConcatenatedVector([net_charge_elem]).n_features,
+            net_charge_elem[1].n_features,
+        )
+        self.assertEqual(
+            MolToConcatenatedVector([physchem_elem, morgan_elem]).n_features,
+            physchem_elem[1].n_features + 16,
+        )
+        self.assertEqual(
+            MolToConcatenatedVector([net_charge_elem, morgan_elem]).n_features,
+            net_charge_elem[1].n_features + 16,
+        )
+        self.assertEqual(
+            MolToConcatenatedVector(
+                [net_charge_elem, morgan_elem, physchem_elem]
+            ).n_features,
+            net_charge_elem[1].n_features + 16 + physchem_elem[1].n_features,
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/tests/test_utils/test_subpipeline.py b/tests/test_utils/test_subpipeline.py
new file mode 100644
index 00000000..672f203e
--- /dev/null
+++ b/tests/test_utils/test_subpipeline.py
@@ -0,0 +1,336 @@
+"""Test SubpipelineExtractor."""
+
+import unittest
+
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+
+from molpipeline import ErrorFilter, FilterReinserter, Pipeline, PostPredictionWrapper
+from molpipeline.any2mol import SmilesToMol
+from molpipeline.mol2any import MolToMorganFP, MolToSmiles
+from molpipeline.mol2mol import SaltRemover
+from molpipeline.utils.subpipeline import SubpipelineExtractor
+
+
+class TestSubpipelineExtractor(unittest.TestCase):
+    """Test SubpipelineExtractor."""
+
+    def test_get_molecule_reader_element(self) -> None:
+        """Test extracting the molecule reader element from pipelines."""
+
+        # basic example: the only reader is the first step
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        self.assertIs(extractor.get_molecule_reader_element(), pipeline.steps[0][1])
+
+        # with multiple molecule readers, the last one is expected
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("mol2smi", MolToSmiles()),
+                ("smi2mol2", SmilesToMol()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        self.assertIs(extractor.get_molecule_reader_element(), pipeline.steps[2][1])
+
+    def test_get_featurization_element(self) -> None:
+        """Test extracting the featurization element from pipelines."""
+
+        # basic example: the featurizer is the second step
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        self.assertIs(extractor.get_featurization_element(), pipeline.steps[1][1])
+
+        # with an error filter and a PostPredictionWrapper around the featurizer and model
+        error_filter = ErrorFilter()
+        error_reinserter = PostPredictionWrapper(
+            FilterReinserter.from_error_filter(error_filter, None)
+        )
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("error_filter", error_filter),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+                (
+                    "error_reinserter",
+                    error_reinserter,
+                ),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        self.assertIs(extractor.get_featurization_element(), pipeline.steps[2][1])
+
+    def test_get_model_element(self) -> None:
+        """Test extracting the model element from pipelines."""
+
+        # basic example: the model is the last step
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        self.assertIs(extractor.get_model_element(), pipeline.steps[2][1])
+
+        # a trailing PostPredictionWrapper must not be returned as the model
+        error_filter = ErrorFilter()
+        error_reinserter = PostPredictionWrapper(
+            FilterReinserter.from_error_filter(error_filter, None)
+        )
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("error_filter", error_filter),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+                (
+                    "error_reinserter",
+                    error_reinserter,
+                ),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        self.assertIs(extractor.get_model_element(), pipeline.steps[3][1])
+
+    def test_get_molecule_reader_subpipeline(self) -> None:
+        """Test extracting the subpipeline up to the molecule reader element from pipelines."""
+
+        # basic example: the subpipeline consists of the reader only
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        subpipeline = extractor.get_molecule_reader_subpipeline()
+        self.assertIsInstance(subpipeline, Pipeline)
+        self.assertEqual(len(subpipeline.steps), 1)  # type: ignore[union-attr]
+        self.assertIs(subpipeline.steps[0], pipeline.steps[0])  # type: ignore[union-attr]
+
+        # with multiple molecule readers, the subpipeline ends at the last one
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("mol2smi", MolToSmiles()),
+                ("smi2mol2", SmilesToMol()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        subpipeline = extractor.get_molecule_reader_subpipeline()
+        self.assertIsInstance(subpipeline, Pipeline)
+        self.assertEqual(len(subpipeline.steps), 3)  # type: ignore[union-attr]
+        for i, subpipe_step in enumerate(subpipeline.steps):  # type: ignore[union-attr]
+            self.assertIs(subpipe_step, pipeline.steps[i])
+
+    def test_get_featurization_subpipeline(self) -> None:
+        """Test extracting the subpipeline up to the featurization element from pipelines."""
+
+        # basic example: reader and featurizer are expected, the model is not
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        subpipeline = extractor.get_featurization_subpipeline()
+        self.assertIsInstance(subpipeline, Pipeline)
+        self.assertEqual(len(subpipeline.steps), 2)  # type: ignore[union-attr]
+        for i, subpipe_step in enumerate(subpipeline.steps):  # type: ignore[union-attr]
+            self.assertIs(subpipe_step, pipeline.steps[i])
+
+        # with an error filter and PostPredictionWrapper: the first three steps are expected
+        error_filter = ErrorFilter()
+        error_reinserter = PostPredictionWrapper(
+            FilterReinserter.from_error_filter(error_filter, None)
+        )
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("error_filter", error_filter),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+                (
+                    "error_reinserter",
+                    error_reinserter,
+                ),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        subpipeline = extractor.get_featurization_subpipeline()
+        self.assertIsInstance(subpipeline, Pipeline)
+        self.assertEqual(len(subpipeline.steps), 3)  # type: ignore[union-attr]
+        for i, subpipe_step in enumerate(subpipeline.steps):  # type: ignore[union-attr]
+            self.assertIs(subpipe_step, pipeline.steps[i])
+
+    def test_get_model_subpipeline(self) -> None:
+        """Test extracting the subpipeline up to the model element from pipelines."""
+
+        # basic example: the subpipeline equals the whole pipeline
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        subpipeline = extractor.get_model_subpipeline()
+        self.assertIsInstance(subpipeline, Pipeline)
+        self.assertEqual(len(subpipeline.steps), 3)  # type: ignore[union-attr]
+        for i, subpipe_step in enumerate(subpipeline.steps):  # type: ignore[union-attr]
+            self.assertIs(subpipe_step, pipeline.steps[i])
+
+        # the trailing PostPredictionWrapper must be excluded from the subpipeline
+        error_filter = ErrorFilter()
+        error_reinserter = PostPredictionWrapper(
+            FilterReinserter.from_error_filter(error_filter, None)
+        )
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("error_filter", error_filter),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+                (
+                    "error_reinserter",
+                    error_reinserter,
+                ),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        subpipeline = extractor.get_model_subpipeline()
+        self.assertIsInstance(subpipeline, Pipeline)
+        self.assertEqual(len(subpipeline.steps), 4)  # type: ignore[union-attr]
+        for i, subpipe_step in enumerate(subpipeline.steps):  # type: ignore[union-attr]
+            self.assertIs(subpipe_step, pipeline.steps[i])
+
+    def test_get_subpipeline(self) -> None:
+        """Test extracting a subpipeline as an interval between two elements of the pipeline."""
+
+        pipeline = Pipeline(
+            [
+                ("smi2mol", SmilesToMol()),
+                ("salt_remover", SaltRemover()),
+                ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                ("model", RandomForestClassifier()),
+            ]
+        )
+        extractor = SubpipelineExtractor(pipeline)
+        reader_element = extractor.get_molecule_reader_element("smi2mol")
+        self.assertIs(reader_element, pipeline.steps[0][1])
+        feature_element = extractor.get_featurization_element("morgan")
+        self.assertIs(feature_element, pipeline.steps[2][1])
+        model_element = extractor.get_model_element("model")
+        self.assertIs(model_element, pipeline.steps[3][1])
+
+        # smi2mol to morgan: both end points and the step in between are expected
+        subpipeline_reader_feature = extractor.get_subpipeline(
+            reader_element, feature_element
+        )
+        self.assertIsInstance(subpipeline_reader_feature, Pipeline)
+        self.assertEqual(len(subpipeline_reader_feature.steps), 3)  # type: ignore[union-attr]
+        self.assertIs(subpipeline_reader_feature.steps[0], pipeline.steps[0])  # type: ignore[union-attr]
+        self.assertIs(subpipeline_reader_feature.steps[1], pipeline.steps[1])  # type: ignore[union-attr]
+        self.assertIs(subpipeline_reader_feature.steps[2], pipeline.steps[2])  # type: ignore[union-attr]
+
+        # smi2mol to model: the whole pipeline is expected
+        subpipeline_reader_model = extractor.get_subpipeline(
+            reader_element, model_element
+        )
+        self.assertIsInstance(subpipeline_reader_model, Pipeline)
+        self.assertEqual(len(subpipeline_reader_model.steps), 4)  # type: ignore[union-attr]
+        self.assertIs(subpipeline_reader_model.steps[0], pipeline.steps[0])  # type: ignore[union-attr]
+        self.assertIs(subpipeline_reader_model.steps[1], pipeline.steps[1])  # type: ignore[union-attr]
+        self.assertIs(subpipeline_reader_model.steps[2], pipeline.steps[2])  # type: ignore[union-attr]
+        self.assertIs(subpipeline_reader_model.steps[3], pipeline.steps[3])  # type: ignore[union-attr]
+
+        # morgan to model: the last two steps are expected
+        subpipeline_feature_model = extractor.get_subpipeline(
+            feature_element, model_element
+        )
+        self.assertIsInstance(subpipeline_feature_model, Pipeline)
+        self.assertEqual(len(subpipeline_feature_model.steps), 2)  # type: ignore[union-attr]
+        self.assertIs(subpipeline_feature_model.steps[0], pipeline.steps[2])  # type: ignore[union-attr]
+        self.assertIs(subpipeline_feature_model.steps[1], pipeline.steps[3])  # type: ignore[union-attr]
+
+        # morgan to morgan: a single-step subpipeline is expected
+        subpipeline_feature_feature = extractor.get_subpipeline(
+            feature_element, feature_element
+        )
+        self.assertIsInstance(subpipeline_feature_feature, Pipeline)
+        self.assertEqual(len(subpipeline_feature_feature.steps), 1)  # type: ignore[union-attr]
+        self.assertIs(subpipeline_feature_feature.steps[0], pipeline.steps[2])  # type: ignore[union-attr]
+
+        # a first element located after the second element raises a ValueError
+        self.assertRaises(
+            ValueError,
+            extractor.get_subpipeline,
+            feature_element,
+            reader_element,
+        )
+
+        element_not_in_pipeline = SmilesToMol()
+
+        # an element that is not part of the pipeline raises a ValueError
+        self.assertRaises(
+            ValueError,
+            extractor.get_subpipeline,
+            element_not_in_pipeline,
+            feature_element,
+        )
+        self.assertRaises(
+            ValueError,
+            extractor.get_subpipeline,
+            reader_element,
+            element_not_in_pipeline,
+        )
+
+    def test_get_all_filter_reinserter_fill_values(self) -> None:
+        """Test extracting the fill value (None or NaN) of a wrapped FilterReinserter."""
+
+        test_fill_values = [None, np.nan]
+
+        for test_fill_value in test_fill_values:
+            error_filter = ErrorFilter()
+            error_reinserter = PostPredictionWrapper(
+                FilterReinserter.from_error_filter(error_filter, test_fill_value)
+            )
+            pipeline = Pipeline(
+                [
+                    ("smi2mol", SmilesToMol()),
+                    ("error_filter", error_filter),
+                    ("morgan", MolToMorganFP(radius=1, n_bits=64)),
+                    ("model", RandomForestClassifier()),
+                    (
+                        "error_reinserter",
+                        error_reinserter,
+                    ),
+                ]
+            )
+            extractor = SubpipelineExtractor(pipeline)
+            fill_values = extractor.get_all_filter_reinserter_fill_values()
+            self.assertEqual(fill_values, [test_fill_value])