diff --git a/molpipeline/mol2any/mol2rdkit_phys_chem.py b/molpipeline/mol2any/mol2rdkit_phys_chem.py
index 559bc35b..15a79313 100644
--- a/molpipeline/mol2any/mol2rdkit_phys_chem.py
+++ b/molpipeline/mol2any/mol2rdkit_phys_chem.py
@@ -16,8 +16,7 @@
 import numpy as np
 import numpy.typing as npt
 from loguru import logger
-from rdkit import rdBase
-from rdkit import Chem
+from rdkit import Chem, rdBase
 from rdkit.Chem import Descriptors
 from sklearn.preprocessing import StandardScaler
 
diff --git a/molpipeline/pipeline/_skl_pipeline.py b/molpipeline/pipeline/_skl_pipeline.py
index bdbc5455..80e451e3 100644
--- a/molpipeline/pipeline/_skl_pipeline.py
+++ b/molpipeline/pipeline/_skl_pipeline.py
@@ -82,7 +82,10 @@ def __init__(
         """
         super().__init__(steps, memory=memory, verbose=verbose)
         self.n_jobs = n_jobs
+        self._set_error_resinserter()
 
+    def _set_error_resinserter(self) -> None:
+        """Connect the error resinserters with the error filters."""
         error_replacer_list = [
             e_filler
             for _, e_filler in self.steps
@@ -288,6 +291,9 @@ def _fit(
                     self.steps[idx_i] = (name_i, ele_i)
                 if y is not None:
                     y = fitted_transformer.co_transform(y)
+                for idx_i, name_i, ele_i in zip(step_idx, name, ele_list):
+                    self.steps[idx_i] = (name_i, ele_i)
+                self._set_error_resinserter()
             elif isinstance(name, list) or isinstance(step_idx, list):
                 raise AssertionError()
             else:
diff --git a/test_extras/test_chemprop/test_chemprop_pipeline.py b/test_extras/test_chemprop/test_chemprop_pipeline.py
index eb7ff6e3..c5f66fa0 100644
--- a/test_extras/test_chemprop/test_chemprop_pipeline.py
+++ b/test_extras/test_chemprop/test_chemprop_pipeline.py
@@ -28,6 +28,7 @@
 from molpipeline.pipeline import Pipeline
 from molpipeline.post_prediction import PostPredictionWrapper
 from test_extras.test_chemprop.chemprop_test_utils.compare_models import compare_params
+from tests import TEST_DATA_DIR
 
 # pylint: disable=duplicate-code
 
@@ -256,8 +257,8 @@ def test_prediction(self) -> None:
         """Test the prediction of the regression model."""
         molecule_net_logd_df = pd.read_csv(
-            "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/Lipophilicity.csv"
-        ).head(1000)
+            TEST_DATA_DIR / "molecule_net_logd.tsv.gz", sep="\t", nrows=100
+        )
         regression_model = get_regression_pipeline()
         regression_model.fit(
             molecule_net_logd_df["smiles"].tolist(),
@@ -279,8 +280,8 @@ def test_prediction(self) -> None:
         """Test the prediction of the classification model."""
         molecule_net_bbbp_df = pd.read_csv(
-            "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv"
-        ).head(1000)
+            TEST_DATA_DIR / "molecule_net_bbbp.tsv.gz", sep="\t", nrows=100
+        )
         classification_model = get_classification_pipeline()
         classification_model.fit(
             molecule_net_bbbp_df["smiles"].tolist(),
diff --git a/tests/test_data/molecule_net_bbbp.tsv.gz b/tests/test_data/molecule_net_bbbp.tsv.gz
new file mode 100644
index 00000000..4858c334
Binary files /dev/null and b/tests/test_data/molecule_net_bbbp.tsv.gz differ
diff --git a/tests/test_data/molecule_net_logd.tsv.gz b/tests/test_data/molecule_net_logd.tsv.gz
new file mode 100644
index 00000000..069d31f9
Binary files /dev/null and b/tests/test_data/molecule_net_logd.tsv.gz differ
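A note on why `_set_error_resinserter` is called again at the end of `_fit`: once a `memory` location is set, scikit-learn's pipeline clones every transformer before fitting and writes the fitted clone back into `self.steps`, so any object that kept a reference to the original step instance now points at a stale, unfitted copy. The reinserter-to-filter links therefore have to be re-established against the new step objects. A minimal sketch of the cloning behaviour with plain scikit-learn (not molpipeline):

```python
import tempfile

from joblib import Memory
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
with tempfile.TemporaryDirectory() as tmp:
    pipe = Pipeline(
        [("scale", scaler), ("reg", LinearRegression())],
        # A real cache location makes Pipeline._fit clone each transformer.
        memory=Memory(location=tmp, verbose=0),
    )
    pipe.fit([[0.0], [1.0], [2.0]], [0.0, 1.0, 2.0])
    # The fitted step is a clone; the original reference is now stale.
    print(pipe.named_steps["scale"] is scaler)  # -> False when memory is set
```

With `memory=None` no cloning happens and the identity check would print `True`, which is why the bug only surfaces once caching is enabled.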
diff --git a/tests/test_pipeline.py b/tests/test_pipeline.py
index 85adf8ae..566dc208 100644
--- a/tests/test_pipeline.py
+++ b/tests/test_pipeline.py
@@ -2,11 +2,17 @@
 
 from __future__ import annotations
 
+import tempfile
 import unittest
+from itertools import combinations
+from pathlib import Path
 from typing import Any
 
+import numpy as np
+import pandas as pd
+from joblib import Memory
 from sklearn.base import BaseEstimator
-from sklearn.ensemble import RandomForestClassifier
+from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
 from sklearn.model_selection import GridSearchCV
 from sklearn.tree import DecisionTreeClassifier
 
@@ -21,6 +27,8 @@
 )
 from molpipeline.utils.json_operations import recursive_from_json, recursive_to_json
 from molpipeline.utils.matrices import are_equal
+from tests import TEST_DATA_DIR
+from tests.utils.execution_count import get_exec_counted_rf_regressor
 from tests.utils.fingerprints import make_sparse_fp
 
 TEST_SMILES = ["CC", "CCO", "COC", "CCCCC", "CCC(-O)O", "CCCN"]
@@ -275,6 +283,96 @@ def test_gridsearchcv(self) -> None:
         for k, value in param_grid.items():
             self.assertIn(grid_search_cv.best_params_[k], value)
 
+    def test_caching(self) -> None:
+        """Test that caching yields identical predictions and skips repeated transformations."""
+        molecule_net_logd_df = pd.read_csv(
+            TEST_DATA_DIR / "molecule_net_logd.tsv.gz", sep="\t", nrows=20
+        )
+        prediction_list = []
+        for cache_activated in [False, True]:
+            pipeline = get_exec_counted_rf_regressor(_RANDOM_STATE)
+            with tempfile.TemporaryDirectory() as temp_dir:
+                if cache_activated:
+                    cache_dir = Path(temp_dir) / ".cache"
+                    mem = Memory(location=cache_dir, verbose=0)
+                else:
+                    mem = Memory(location=None, verbose=0)
+                pipeline.memory = mem
+                # First fit
+                pipeline.fit(
+                    molecule_net_logd_df["smiles"].tolist(),
+                    molecule_net_logd_df["exp"].tolist(),
+                )
+                # Get predictions
+                prediction = pipeline.predict(molecule_net_logd_df["smiles"].tolist())
+                prediction_list.append(prediction)
+
+                # Reset the last step with an untrained model
+                pipeline.steps[-1] = (
+                    "rf",
+                    RandomForestRegressor(random_state=_RANDOM_STATE, n_jobs=1),
+                )
+
+                # Second fit
+                pipeline.fit(
+                    molecule_net_logd_df["smiles"].tolist(),
+                    molecule_net_logd_df["exp"].tolist(),
+                )
+                # Get predictions
+                prediction = pipeline.predict(molecule_net_logd_df["smiles"].tolist())
+                prediction_list.append(prediction)
+
+                n_transformations = pipeline.named_steps["mol2concat"].n_transformations
+                if cache_activated:
+                    # Fit is called twice, but fit_transform runs only once:
+                    # the second run is served from the cache.
+                    self.assertEqual(n_transformations, 1)
+                else:
+                    self.assertEqual(n_transformations, 2)
+
+                mem.clear(warn=False)
+        for pred1, pred2 in combinations(prediction_list, 2):
+            self.assertTrue(np.allclose(pred1, pred2))
+
+    def test_gridsearch_cache(self) -> None:
+        """Run a short GridSearchCV and check that caching and not caching give the same results."""
+        h_params = {
+            "rf__n_estimators": [1, 2],
+        }
+        data_df = pd.read_csv(
+            TEST_DATA_DIR / "molecule_net_logd.tsv.gz", sep="\t", nrows=20
+        )
+        best_param_dict = {}
+        prediction_dict = {}
+        # Run the identical search once with and once without caching.
+        for cache_activated in [True, False]:
+            pipeline = get_exec_counted_rf_regressor(_RANDOM_STATE)
+            with tempfile.TemporaryDirectory() as temp_dir:
+                cache_dir = Path(temp_dir) / ".cache"
+                if cache_activated:
+                    mem = Memory(location=cache_dir, verbose=0)
+                else:
+                    mem = Memory(location=None, verbose=0)
+                pipeline.memory = mem
+                grid_search_cv = GridSearchCV(
+                    estimator=pipeline,
+                    param_grid=h_params,
+                    cv=2,
+                    scoring="neg_mean_squared_error",
+                    n_jobs=1,
+                    error_score="raise",
+                    refit=True,
+                    pre_dispatch=1,
+                )
+                grid_search_cv.fit(data_df["smiles"].tolist(), data_df["exp"].tolist())
+                best_param_dict[cache_activated] = grid_search_cv.best_params_
+                prediction_dict[cache_activated] = grid_search_cv.predict(
+                    data_df["smiles"].tolist()
+                )
+                mem.clear(warn=False)
+        self.assertEqual(best_param_dict[True], best_param_dict[False])
+        self.assertTrue(np.allclose(prediction_dict[True], prediction_dict[False]))
+
 
 if __name__ == "__main__":
     unittest.main()
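The two tests above hinge on joblib's memoization semantics: a `Memory` with a real `location` caches results on disk and replays them for identical inputs, while `Memory(location=None)` is a no-op. A self-contained sketch of exactly that behaviour (the function name below is made up for illustration):

```python
import tempfile

from joblib import Memory

calls = {"n": 0}


def double(x: int) -> int:
    """Toy stand-in for an expensive transform; counts its executions."""
    calls["n"] += 1
    return 2 * x


with tempfile.TemporaryDirectory() as tmp:
    mem = Memory(location=tmp, verbose=0)
    cached_double = mem.cache(double)
    assert cached_double(21) == 42
    assert cached_double(21) == 42  # replayed from disk, double() not re-run
    print(calls["n"])  # -> 1
    mem.clear(warn=False)

no_cache = Memory(location=None, verbose=0).cache(double)
no_cache(21)
no_cache(21)
print(calls["n"])  # -> 3, every call executed
```

This is why `test_caching` expects `n_transformations == 1` with caching and `2` without: the counter lives inside the cached `fit_transform`, which only executes on a cache miss.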
diff --git a/tests/utils/execution_count.py b/tests/utils/execution_count.py
new file mode 100644
index 00000000..abf37d6a
--- /dev/null
+++ b/tests/utils/execution_count.py
@@ -0,0 +1,155 @@
+"""Functions for counting the number of times a function is executed."""
+
+from __future__ import annotations
+
+from typing import Any
+
+try:
+    from typing import Self  # type: ignore[attr-defined]
+except ImportError:
+    from typing_extensions import Self
+
+from sklearn.base import BaseEstimator
+from sklearn.ensemble import RandomForestRegressor
+
+from molpipeline import Pipeline
+from molpipeline.abstract_pipeline_elements.core import ABCPipelineElement
+from molpipeline.any2mol import SmilesToMol
+from molpipeline.mol2any import MolToMorganFP
+
+
+class CountingTransformerWrapper(BaseEstimator):
+    """A transformer that counts the number of transformations."""
+
+    def __init__(self, element: ABCPipelineElement):
+        """Initialize the wrapper.
+
+        Parameters
+        ----------
+        element : ABCPipelineElement
+            The element to wrap.
+        """
+        self.element = element
+        self.n_transformations = 0
+
+    def fit(self, X: Any, y: Any) -> Self:  # pylint: disable=invalid-name
+        """Fit the wrapped element.
+
+        Parameters
+        ----------
+        X : Any
+            The input data.
+        y : Any
+            The target data.
+
+        Returns
+        -------
+        Self
+            The fitted transformer.
+        """
+        self.element.fit(X, y)
+        return self
+
+    def transform(self, X: Any) -> Any:  # pylint: disable=invalid-name
+        """Transform the data.
+
+        Transform is called during prediction, which is never cached.
+        Hence, the counter is not increased here.
+
+        Parameters
+        ----------
+        X : Any
+            The input data.
+
+        Returns
+        -------
+        Any
+            The transformed data.
+        """
+        return self.element.transform(X)
+
+    def fit_transform(self, X: Any, y: Any) -> Any:  # pylint: disable=invalid-name
+        """Fit and transform the data, increasing the execution counter.
+
+        Parameters
+        ----------
+        X : Any
+            The input data.
+        y : Any
+            The target data.
+
+        Returns
+        -------
+        Any
+            The transformed data.
+        """
+        self.n_transformations += 1
+        return self.element.fit_transform(X, y)
+
+    def get_params(self, deep: bool = True) -> dict[str, Any]:
+        """Get the parameters of the transformer.
+
+        Parameters
+        ----------
+        deep : bool
+            If True, the parameters of the wrapped element are also returned.
+
+        Returns
+        -------
+        dict[str, Any]
+            The parameters of the transformer.
+        """
+        params = {
+            "element": self.element,
+        }
+        if deep:
+            params.update(self.element.get_params(deep))
+        return params
+
+    def set_params(self, **params: Any) -> Self:
+        """Set the parameters of the transformer.
+
+        Parameters
+        ----------
+        **params
+            The parameters to set.
+
+        Returns
+        -------
+        Self
+            The transformer with the updated parameters.
+        """
+        element = params.pop("element", None)
+        if element is not None:
+            self.element = element
+        self.element.set_params(**params)
+        return self
+
+
+def get_exec_counted_rf_regressor(random_state: int) -> Pipeline:
+    """Get a Morgan fingerprint + random forest pipeline that counts its fit_transform calls.
+
+    Parameters
+    ----------
+    random_state : int
+        The random state to use.
+
+    Returns
+    -------
+    Pipeline
+        A pipeline of SmilesToMol, a counted Morgan fingerprint transformer, and a
+        random forest regressor.
+    """
+    smi2mol = SmilesToMol()
+    mol2concat = CountingTransformerWrapper(
+        MolToMorganFP(radius=2, n_bits=2048),
+    )
+    rf = RandomForestRegressor(random_state=random_state, n_jobs=1)
+    return Pipeline(
+        [
+            ("smi2mol", smi2mol),
+            ("mol2concat", mol2concat),
+            ("rf", rf),
+        ],
+        n_jobs=1,
+    )
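For orientation, a condensed version of what `test_caching` exercises with the helper above (the SMILES and target values here are made-up toy data):

```python
import tempfile
from pathlib import Path

from joblib import Memory

from tests.utils.execution_count import get_exec_counted_rf_regressor

pipeline = get_exec_counted_rf_regressor(random_state=0)
smiles = ["CC", "CCO", "COC", "CCCCC", "CCC(-O)O", "CCCN"]  # toy data
targets = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]

with tempfile.TemporaryDirectory() as tmp:
    pipeline.memory = Memory(location=Path(tmp) / ".cache", verbose=0)
    pipeline.fit(smiles, targets)
    pipeline.fit(smiles, targets)  # identical input -> cache hit
    # fit_transform of the wrapped MolToMorganFP executed only once:
    print(pipeline.named_steps["mol2concat"].n_transformations)  # -> 1
```

Note that the counter is read from `named_steps` after fitting: with caching enabled, the steps hold the fitted wrapper returned by joblib, whose counter reflects how often the transformation actually ran.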