From 5a9b758d4bfdc2fb5f3a6b3e9f1f3a69b697bd51 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Mon, 17 Jun 2024 15:59:39 +0200 Subject: [PATCH 01/24] install numpy<2.0.0 (until rdkit is updated) --- .github/workflows/linting.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 6c195a15..cb01b627 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -13,6 +13,7 @@ jobs: python-version: "3.11" - name: Install dependencies run: | + pip install "numpy<2.0.0" python -m pip install --upgrade pip pip install $(find . -name "requirement*" -type f -printf ' -r %p') pip install pylint From c6e0c1c0404db5bff17f3dbf808a9eae05364725 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Mon, 17 Jun 2024 16:03:28 +0200 Subject: [PATCH 02/24] enable counted fingerprint --- molpipeline/mol2any/mol2morgan_fingerprint.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/molpipeline/mol2any/mol2morgan_fingerprint.py b/molpipeline/mol2any/mol2morgan_fingerprint.py index dd3d20c3..6e38c893 100644 --- a/molpipeline/mol2any/mol2morgan_fingerprint.py +++ b/molpipeline/mol2any/mol2morgan_fingerprint.py @@ -35,6 +35,7 @@ def __init__( radius: int = 2, use_features: bool = False, n_bits: int = 2048, + counted: bool = False, return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse", name: str = "MolToMorganFP", n_jobs: int = 1, @@ -50,6 +51,9 @@ def __init__( Instead of atoms, features are encoded in the fingerprint. [2] n_bits: int, optional (default=2048) Size of fingerprint. + counted: bool, optional (default=False) + If True, the fingerprint will be counted. + If False, the fingerprint will be binary. return_as: Literal["sparse", "dense", "explicit_bit_vect"] Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix holding a sparse representation of the bit vectors. With "dense" a numpy matrix will be returned. @@ -76,6 +80,7 @@ def __init__( n_jobs=n_jobs, uuid=uuid, ) + self.counted = counted if isinstance(n_bits, int) and n_bits >= 0: self._n_bits = n_bits else: @@ -145,16 +150,20 @@ def pretransform_single( radius=self.radius, fpSize=self._n_bits, ) - if self._return_as == "explicit_bit_vect": + if self.counted: + return fingerprint_generator.GetCountFingerprint(value) return fingerprint_generator.GetFingerprint(value) + + if self.counted: + fingerprint = fingerprint_generator.GetCountFingerprintAsNumPy(value) + else: + fingerprint = fingerprint_generator.GetFingerprintAsNumPy(value) + if self._return_as == "dense": - return fingerprint_generator.GetFingerprintAsNumPy(value) - # sparse return type - return { - bit_idx: 1 - for bit_idx in fingerprint_generator.GetFingerprint(value).GetOnBits() - } + return fingerprint + + return {pos: count for pos, count in enumerate(fingerprint) if count > 0} def _explain_rdmol(self, mol_obj: RDKitMol) -> dict[int, list[tuple[int, int]]]: """Get central atom and radius of all features in molecule. From b32ef83131fa36e5b4bc8e81ab9da2e8f8607677 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Mon, 17 Jun 2024 16:05:44 +0200 Subject: [PATCH 03/24] set numpy < 2.0.0 in requirements.txt --- .github/workflows/linting.yml | 1 - requirements.txt | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index cb01b627..6c195a15 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -13,7 +13,6 @@ jobs: python-version: "3.11" - name: Install dependencies run: | - pip install "numpy<2.0.0" python -m pip install --upgrade pip pip install $(find . -name "requirement*" -type f -printf ' -r %p') pip install pylint diff --git a/requirements.txt b/requirements.txt index c6fab9f9..9b597696 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ joblib >= 1.3.0 loguru -numpy +numpy < 2.0.0 pandas rdkit >= 2023.9.1 scipy From 7d114cfe3c10f08cb46399f6fc2cf55379d70fe2 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Mon, 17 Jun 2024 16:08:13 +0200 Subject: [PATCH 04/24] set numpy < 2.0.0 in mypy --- .github/workflows/linting.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 6c195a15..575b11ab 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -30,6 +30,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install "numpy<2.0.0" pip install mypy mypy . || exit_code=$? mypy --install-types --non-interactive From 59a2f1586b7d743912c404a61a68e7a35853bdac Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 10:37:52 +0200 Subject: [PATCH 05/24] implement Mol2PathFP --- .../mol2any/mol2bitvector.py | 120 +++++++++- molpipeline/mol2any/mol2morgan_fingerprint.py | 38 +-- molpipeline/mol2any/mol2path_fingerprint.py | 220 ++++++++++++++++++ 3 files changed, 345 insertions(+), 33 deletions(-) create mode 100644 molpipeline/mol2any/mol2path_fingerprint.py diff --git a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py index 4e4533ab..779f272e 100644 --- a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py +++ b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py @@ -13,6 +13,7 @@ import numpy as np import numpy.typing as npt +from rdkit.Chem import rdFingerprintGenerator from rdkit.DataStructs import ExplicitBitVect from scipy import sparse @@ -196,7 +197,120 @@ def pretransform_single( """ -class ABCMorganFingerprintPipelineElement(MolToFingerprintPipelineElement, abc.ABC): +class MolToRDKitGenFPElement(MolToFingerprintPipelineElement, abc.ABC): + """Abstract class for PipelineElements using the FingeprintGenerator64.""" + + def __init__( + self, + counted: bool = False, + return_as: OutputDatatype = "sparse", + name: str = "MolToRDKitGenFin", + n_jobs: int = 1, + uuid: Optional[str] = None, + ): + """Initialize abstract class. + + Parameters + ---------- + counted: bool + Whether to count the bits or not. + return_as: Literal["sparse", "dense", "explicit_bit_vect"] + Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix + name: str + Name of PipelineElement. + n_jobs: + Number of jobs. + uuid: Optional[str] + Unique identifier. + """ + super().__init__( + return_as=return_as, + name=name, + n_jobs=n_jobs, + uuid=uuid, + ) + self.counted = counted + + @abc.abstractmethod + def _get_fp_generator(self) -> rdFingerprintGenerator.FingerprintGenerator: + """Get fingerprint generator.""" + + def pretransform_single( + self, value: RDKitMol + ) -> ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int]: + """Transform a single compound to a dictionary. + + Keys denote the feature position, values the count. Here always 1. + + Parameters + ---------- + value: RDKitMol + Molecule for which the fingerprint is generated. + + Returns + ------- + dict[int, int] + Dictionary with feature-position as key and count as value. + """ + fingerprint_generator = self._get_fp_generator() + if self._return_as == "explicit_bit_vect": + if self.counted: + return fingerprint_generator.GetCountFingerprint(value) + return fingerprint_generator.GetFingerprint(value) + + if self.counted: + fingerprint = fingerprint_generator.GetCountFingerprintAsNumPy(value) + else: + fingerprint = fingerprint_generator.GetFingerprintAsNumPy(value) + + if self._return_as == "dense": + return fingerprint + + return {pos: count for pos, count in enumerate(fingerprint) if count > 0} + + def get_params(self, deep: bool = True) -> dict[str, Any]: + """Get object parameters relevant for copying the class. + + Parameters + ---------- + deep: bool + If True get a deep copy of the parameters. + + Returns + ------- + dict[str, Any] + Dictionary of parameter names and values. + """ + parameters = super().get_params(deep) + if deep: + parameters["counted"] = bool(self.counted) + else: + parameters["counted"] = self.counted + + return parameters + + def set_params(self, **parameters: dict[str, Any]) -> Self: + """Set object parameters relevant for copying the class. + + Parameters + ---------- + parameters: dict[str, Any] + Dictionary of parameter names and values. + + Returns + ------- + Self + Copied object with updated parameters. + """ + parameter_dict_copy = dict(parameters) + counted = parameter_dict_copy.pop("counted", None) + if counted is not None: + self.counted = bool(counted) + super().set_params(**parameter_dict_copy) + return self + + +class ABCMorganFingerprintPipelineElement(MolToRDKitGenFPElement, abc.ABC): """Abstract Class for Morgan fingerprints.""" # pylint: disable=R0913 @@ -204,6 +318,7 @@ def __init__( self, radius: int = 2, use_features: bool = False, + counted: bool = False, return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse", name: str = "AbstractMorgan", n_jobs: int = 1, @@ -217,6 +332,8 @@ def __init__( Radius of fingerprint. use_features: bool Whether to represent atoms by element or category (donor, acceptor. etc.) + counted: bool + Whether to count the bits or not. return_as: Literal["sparse", "dense", "explicit_bit_vect"] Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix holding a sparse representation of the bit vectors. With "dense" a numpy matrix will be returned. @@ -232,6 +349,7 @@ def __init__( # pylint: disable=R0801 super().__init__( return_as=return_as, + counted=counted, name=name, n_jobs=n_jobs, uuid=uuid, diff --git a/molpipeline/mol2any/mol2morgan_fingerprint.py b/molpipeline/mol2any/mol2morgan_fingerprint.py index 6e38c893..d7a0e10b 100644 --- a/molpipeline/mol2any/mol2morgan_fingerprint.py +++ b/molpipeline/mol2any/mol2morgan_fingerprint.py @@ -11,10 +11,7 @@ import copy -import numpy as np -import numpy.typing as npt from rdkit.Chem import AllChem, rdFingerprintGenerator -from rdkit.DataStructs import ExplicitBitVect from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import ( ABCMorganFingerprintPipelineElement, @@ -75,12 +72,12 @@ def __init__( super().__init__( radius=radius, use_features=use_features, + counted=counted, return_as=return_as, name=name, n_jobs=n_jobs, uuid=uuid, ) - self.counted = counted if isinstance(n_bits, int) and n_bits >= 0: self._n_bits = n_bits else: @@ -129,41 +126,18 @@ def set_params(self, **parameters: dict[str, Any]) -> Self: return self - def pretransform_single( - self, value: RDKitMol - ) -> ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int]: - """Transform a single compound to a dictionary. - - Keys denote the feature position, values the count. Here always 1. - - Parameters - ---------- - value: RDKitMol - Molecule for which the fingerprint is generated. + def _get_fp_generator(self) -> rdFingerprintGenerator.FingerprintGenerator: + """Get the fingerprint generator. Returns ------- - dict[int, int] - Dictionary with feature-position as key and count as value. + rdFingerprintGenerator.FingerprintGenerator + RDKit fingerprint generator. """ - fingerprint_generator = rdFingerprintGenerator.GetMorganGenerator( + return rdFingerprintGenerator.GetMorganGenerator( radius=self.radius, fpSize=self._n_bits, ) - if self._return_as == "explicit_bit_vect": - if self.counted: - return fingerprint_generator.GetCountFingerprint(value) - return fingerprint_generator.GetFingerprint(value) - - if self.counted: - fingerprint = fingerprint_generator.GetCountFingerprintAsNumPy(value) - else: - fingerprint = fingerprint_generator.GetFingerprintAsNumPy(value) - - if self._return_as == "dense": - return fingerprint - - return {pos: count for pos, count in enumerate(fingerprint) if count > 0} def _explain_rdmol(self, mol_obj: RDKitMol) -> dict[int, list[tuple[int, int]]]: """Get central atom and radius of all features in molecule. diff --git a/molpipeline/mol2any/mol2path_fingerprint.py b/molpipeline/mol2any/mol2path_fingerprint.py new file mode 100644 index 00000000..7a0478ce --- /dev/null +++ b/molpipeline/mol2any/mol2path_fingerprint.py @@ -0,0 +1,220 @@ +"""Implementations for the Morgan fingerprint.""" + +from __future__ import annotations # for all the python 3.8 users out there. + +from typing import Any, Literal, Optional + +try: + from typing import Self # type: ignore[attr-defined] +except ImportError: + from typing_extensions import Self + +import copy + +from rdkit.Chem import rdFingerprintGenerator + +from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import ( + MolToRDKitGenFPElement, +) + + +class Mol2PathFP( + MolToRDKitGenFPElement +): # pylint: disable=too-many-instance-attributes + """Folded Morgan Fingerprint. + + Feature-mapping to vector-positions is arbitrary. + + """ + + # pylint: disable=too-many-arguments,too-many-locals + def __init__( + self, + min_path: int = 1, + max_path: int = 7, + use_hs: bool = True, + branched_paths: bool = True, + use_bond_order: bool = True, + count_simulation: bool = False, + count_bonds: bool = False, + n_bits: int = 2048, + num_bits_per_feature: int = 2, + atom_invariants_generator: Any = None, + counted: bool = False, + return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse", + name: str = "Mol2PathFP", + n_jobs: int = 1, + uuid: Optional[str] = None, + ) -> None: + """Initialize Mol2PathFP. + + Parameters + ---------- + min_path: int, optional (default=1) + Minimum path length. + max_path: int, optional (default=7) + Maximum path length. + use_hs: bool, optional (default=True) + Include hydrogens (If explicit hydrogens are present in the molecule). + branched_paths: bool, optional (default=True) + Include branched paths. + use_bond_order: bool, optional (default=True) + Include bond order in path. + count_simulation: bool, optional (default=False) + Count simulation. + count_bonds: bool, optional (default=False) + Count bonds. + n_bits: int, optional (default=2048) + Size of fingerprint. + num_bits_per_feature: int, optional (default=2) + Number of bits per feature. + atom_invariants_generator: Any, optional (default=None) + Atom invariants generator. + counted: bool, optional (default=False) + If True, the fingerprint will be counted. + If False, the fingerprint will be binary. + return_as: Literal["sparse", "dense", "explicit_bit_vect"] + Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix + holding a sparse representation of the bit vectors. With "dense" a numpy matrix will be returned. + With "explicit_bit_vect" the fingerprints will be returned as a list of RDKit's + rdkit.DataStructs.cDataStructs.ExplicitBitVect. + name: str, optional (default="MolToMorganFP") + + + References + ---------- + [1] https://www.rdkit.org/docs/source/rdkit.Chem.rdFingerprintGenerator.html#rdkit.Chem.rdFingerprintGenerator.GetRDKitFPGenerator + """ + # pylint: disable=R0801 + super().__init__( + counted=counted, + return_as=return_as, + name=name, + n_jobs=n_jobs, + uuid=uuid, + ) + if isinstance(n_bits, int) and n_bits >= 0: + self._n_bits = n_bits + else: + raise ValueError( + f"Number of bits has to be a positive integer! (Received: {n_bits})" + ) + self._min_path = min_path + self._max_path = max_path + self._use_hs = use_hs + self._branched_paths = branched_paths + self._use_bond_order = use_bond_order + self._count_simulation = count_simulation + self._count_bonds = count_bonds + self._num_bits_per_feature = num_bits_per_feature + self._atom_invariants_generator = atom_invariants_generator + + def get_params(self, deep: bool = True) -> dict[str, Any]: + """Return all parameters defining the object. + + Parameters + ---------- + deep: bool + If True get a deep copy of the parameters. + + Returns + ------- + dict[str, Any] + Dictionary of parameters. + """ + parameters = super().get_params(deep) + if deep: + parameters["min_path"] = int(self._min_path) + parameters["max_path"] = int(self._max_path) + parameters["use_hs"] = bool(self._use_hs) + parameters["branched_paths"] = bool(self._branched_paths) + parameters["use_bond_order"] = bool(self._use_bond_order) + parameters["count_simulation"] = bool(self._count_simulation) + parameters["count_bonds"] = bool(self._count_bonds) + parameters["num_bits_per_feature"] = int(self._num_bits_per_feature) + parameters["atom_invariants_generator"] = copy.copy( + self._atom_invariants_generator + ) + parameters["n_bits"] = int(self._n_bits) + else: + parameters["min_path"] = self._min_path + parameters["max_path"] = self._max_path + parameters["use_hs"] = self._use_hs + parameters["branched_paths"] = self._branched_paths + parameters["use_bond_order"] = self._use_bond_order + parameters["count_simulation"] = self._count_simulation + parameters["count_bonds"] = self._count_bonds + parameters["num_bits_per_feature"] = self._num_bits_per_feature + parameters["atom_invariants_generator"] = self._atom_invariants_generator + parameters["n_bits"] = self._n_bits + return parameters + + def set_params(self, **parameters: dict[str, Any]) -> Self: + """Set parameters. + + Parameters + ---------- + parameters: dict[str, Any] + Dictionary of parameter names and values. + + Returns + ------- + Self + MolToMorganFP pipeline element with updated parameters. + """ + parameter_copy = dict(parameters) + min_path = parameter_copy.pop("min_path", None) + if min_path is not None: + self._min_path = min_path # type: ignore + max_path = parameter_copy.pop("max_path", None) + if max_path is not None: + self._max_path = max_path # type: ignore + use_hs = parameter_copy.pop("use_hs", None) + if use_hs is not None: + self._use_hs = use_hs # type: ignore + branched_paths = parameter_copy.pop("branched_paths", None) + if branched_paths is not None: + self._branched_paths = branched_paths # type: ignore + use_bond_order = parameter_copy.pop("use_bond_order", None) + if use_bond_order is not None: + self._use_bond_order = use_bond_order # type: ignore + count_simulation = parameter_copy.pop("count_simulation", None) + if count_simulation is not None: + self._count_simulation = count_simulation # type: ignore + count_bonds = parameter_copy.pop("count_bonds", None) + if count_bonds is not None: + self._count_bonds = count_bonds # type: ignore + num_bits_per_feature = parameter_copy.pop("num_bits_per_feature", None) + if num_bits_per_feature is not None: + self._num_bits_per_feature = num_bits_per_feature # type: ignore + atom_invariants_generator = parameter_copy.pop( + "atom_invariants_generator", None + ) + if atom_invariants_generator is not None: + self._atom_invariants_generator = atom_invariants_generator + n_bits = parameter_copy.pop("n_bits", None) # pylint: disable=duplicate-code + if n_bits is not None: + self._n_bits = n_bits # type: ignore + super().set_params(**parameter_copy) + return self + + def _get_fp_generator(self) -> rdFingerprintGenerator.GetRDKitFPGenerator: + """Get the fingerprint generator for the RDKit path fingerprint. + + Returns + ------- + rdFingerprintGenerator.GetRDKitFPGenerator + RDKit Path fingerprint generator. + """ + return rdFingerprintGenerator.GetRDKitFPGenerator( + minPath=self._min_path, + maxPath=self._max_path, + fpSize=self._n_bits, + useHs=self._use_hs, + branchedPaths=self._branched_paths, + useBondOrder=self._use_bond_order, + countSimulation=self._count_simulation, + countBonds=self._count_bonds, + numBitsPerFeature=self._num_bits_per_feature, + atomInvariantsGenerator=self._atom_invariants_generator, + ) From dfe28d2869f2df560ac64e0f2d278f3a72bd4f50 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 10:40:29 +0200 Subject: [PATCH 06/24] extend docstrings --- .../abstract_pipeline_elements/mol2any/mol2bitvector.py | 8 +++++++- molpipeline/mol2any/mol2path_fingerprint.py | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py index 779f272e..46707646 100644 --- a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py +++ b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py @@ -233,7 +233,13 @@ def __init__( @abc.abstractmethod def _get_fp_generator(self) -> rdFingerprintGenerator.FingerprintGenerator: - """Get fingerprint generator.""" + """Get fingerprint generator. + + Returns + ------- + rdFingerprintGenerator.FingerprintGenerator + Fingerprint generator. + """ def pretransform_single( self, value: RDKitMol diff --git a/molpipeline/mol2any/mol2path_fingerprint.py b/molpipeline/mol2any/mol2path_fingerprint.py index 7a0478ce..08bca4f6 100644 --- a/molpipeline/mol2any/mol2path_fingerprint.py +++ b/molpipeline/mol2any/mol2path_fingerprint.py @@ -79,7 +79,11 @@ def __init__( With "explicit_bit_vect" the fingerprints will be returned as a list of RDKit's rdkit.DataStructs.cDataStructs.ExplicitBitVect. name: str, optional (default="MolToMorganFP") - + Name of PipelineElement + n_jobs: int, optional (default=1) + Number of cores to use. + uuid: str | None, optional (default=None) + UUID of the PipelineElement. References ---------- From 9f55cafcbb468ac09fac37bfc08328d613d59bc2 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 10:55:39 +0200 Subject: [PATCH 07/24] fix type --- .../abstract_pipeline_elements/mol2any/mol2bitvector.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py index 46707646..bd07b5fd 100644 --- a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py +++ b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py @@ -232,12 +232,12 @@ def __init__( self.counted = counted @abc.abstractmethod - def _get_fp_generator(self) -> rdFingerprintGenerator.FingerprintGenerator: + def _get_fp_generator(self) -> rdFingerprintGenerator.FingeprintGenerator64: """Get fingerprint generator. Returns ------- - rdFingerprintGenerator.FingerprintGenerator + rdFingerprintGenerator.FingeprintGenerator64 Fingerprint generator. """ From f6b8c3a8c1c43b707b78911d270dc64f75b02a37 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 10:57:02 +0200 Subject: [PATCH 08/24] add to init --- molpipeline/mol2any/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/molpipeline/mol2any/__init__.py b/molpipeline/mol2any/__init__.py index da7bb760..3d36171c 100644 --- a/molpipeline/mol2any/__init__.py +++ b/molpipeline/mol2any/__init__.py @@ -5,6 +5,7 @@ from molpipeline.mol2any.mol2inchi import MolToInchi, MolToInchiKey from molpipeline.mol2any.mol2morgan_fingerprint import MolToMorganFP from molpipeline.mol2any.mol2net_charge import MolToNetCharge +from molpipeline.mol2any.mol2path_fingerprint import Mol2PathFP from molpipeline.mol2any.mol2rdkit_phys_chem import MolToRDKitPhysChem from molpipeline.mol2any.mol2smiles import MolToSmiles @@ -14,6 +15,7 @@ "MolToSmiles", "MolToMorganFP", "MolToNetCharge", + "Mol2PathFP", "MolToInchi", "MolToInchiKey", "MolToRDKitPhysChem", From 81c9c60cceab6f8cfd3e98322d6007d0e54b2c3e Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 11:46:50 +0200 Subject: [PATCH 09/24] count bonds -> count boUnds --- molpipeline/mol2any/mol2path_fingerprint.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/molpipeline/mol2any/mol2path_fingerprint.py b/molpipeline/mol2any/mol2path_fingerprint.py index 08bca4f6..b04c325f 100644 --- a/molpipeline/mol2any/mol2path_fingerprint.py +++ b/molpipeline/mol2any/mol2path_fingerprint.py @@ -36,7 +36,7 @@ def __init__( branched_paths: bool = True, use_bond_order: bool = True, count_simulation: bool = False, - count_bonds: bool = False, + count_bounds: Any = None, n_bits: int = 2048, num_bits_per_feature: int = 2, atom_invariants_generator: Any = None, @@ -62,8 +62,8 @@ def __init__( Include bond order in path. count_simulation: bool, optional (default=False) Count simulation. - count_bonds: bool, optional (default=False) - Count bonds. + count_bounds: Any, optional (default=None) + Set the bins for the bond count. n_bits: int, optional (default=2048) Size of fingerprint. num_bits_per_feature: int, optional (default=2) @@ -109,7 +109,7 @@ def __init__( self._branched_paths = branched_paths self._use_bond_order = use_bond_order self._count_simulation = count_simulation - self._count_bonds = count_bonds + self._count_bounds = count_bounds self._num_bits_per_feature = num_bits_per_feature self._atom_invariants_generator = atom_invariants_generator @@ -134,7 +134,7 @@ def get_params(self, deep: bool = True) -> dict[str, Any]: parameters["branched_paths"] = bool(self._branched_paths) parameters["use_bond_order"] = bool(self._use_bond_order) parameters["count_simulation"] = bool(self._count_simulation) - parameters["count_bonds"] = bool(self._count_bonds) + parameters["count_bounds"] = copy.copy(self._count_bounds) parameters["num_bits_per_feature"] = int(self._num_bits_per_feature) parameters["atom_invariants_generator"] = copy.copy( self._atom_invariants_generator @@ -147,7 +147,7 @@ def get_params(self, deep: bool = True) -> dict[str, Any]: parameters["branched_paths"] = self._branched_paths parameters["use_bond_order"] = self._use_bond_order parameters["count_simulation"] = self._count_simulation - parameters["count_bonds"] = self._count_bonds + parameters["count_bounds"] = self._count_bounds parameters["num_bits_per_feature"] = self._num_bits_per_feature parameters["atom_invariants_generator"] = self._atom_invariants_generator parameters["n_bits"] = self._n_bits @@ -185,9 +185,9 @@ def set_params(self, **parameters: dict[str, Any]) -> Self: count_simulation = parameter_copy.pop("count_simulation", None) if count_simulation is not None: self._count_simulation = count_simulation # type: ignore - count_bonds = parameter_copy.pop("count_bonds", None) - if count_bonds is not None: - self._count_bonds = count_bonds # type: ignore + count_bounds = parameter_copy.pop("count_bounds", None) + if count_bounds is not None: + self._count_bounds = count_bounds # type: ignore num_bits_per_feature = parameter_copy.pop("num_bits_per_feature", None) if num_bits_per_feature is not None: self._num_bits_per_feature = num_bits_per_feature # type: ignore @@ -218,7 +218,7 @@ def _get_fp_generator(self) -> rdFingerprintGenerator.GetRDKitFPGenerator: branchedPaths=self._branched_paths, useBondOrder=self._use_bond_order, countSimulation=self._count_simulation, - countBonds=self._count_bonds, + countBounds=self._count_bounds, numBitsPerFeature=self._num_bits_per_feature, atomInvariantsGenerator=self._atom_invariants_generator, ) From e8bcdbada510df76bf034cbe4b01a64e10366866 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 11:47:02 +0200 Subject: [PATCH 10/24] implement tests --- .../test_mol2any/test_mol2path_fingerprint.py | 151 ++++++++++++++++++ 1 file changed, 151 insertions(+) create mode 100644 tests/test_elements/test_mol2any/test_mol2path_fingerprint.py diff --git a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py new file mode 100644 index 00000000..13d724c9 --- /dev/null +++ b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py @@ -0,0 +1,151 @@ +"""Tests for the MolToPathFingerprint pipeline element.""" + +from __future__ import annotations + +import unittest +from typing import Any + +import numpy as np + +from molpipeline import Pipeline +from molpipeline.any2mol import SmilesToMol +from molpipeline.mol2any import Mol2PathFP + +test_smiles = [ + "c1ccccc1", + "c1ccccc1C", + "NCCOCCCC(=O)O", +] + + +class TestMol2PathFingerprint(unittest.TestCase): + """Unittest for Mol2PathFP, which calculates the RDKit Path Fingerprint.""" + + def test_can_be_constructed(self) -> None: + """Test if the MolToFoldedMorganFingerprint pipeline element can be constructed. + + Returns + ------- + None + """ + mol_fp = Mol2PathFP() + mol_fp_copy = mol_fp.copy() + self.assertTrue(mol_fp_copy is not mol_fp) + for key, value in mol_fp.get_params().items(): + self.assertEqual(value, mol_fp_copy.get_params()[key]) + mol_fp_recreated = Mol2PathFP(**mol_fp.get_params()) + for key, value in mol_fp.get_params().items(): + self.assertEqual(value, mol_fp_recreated.get_params()[key]) + + def test_sparse_dense_accordance(self) -> None: + """Test if the calculation of Morgan fingprints in dense and sparse are equal. + + Compared to precalculated values. + + Returns + ------- + None + """ + smi2mol = SmilesToMol() + sparse_path_fp = Mol2PathFP(n_bits=1024, return_as="sparse") + dense_path_fp = Mol2PathFP(n_bits=1024, return_as="dense") + sparse_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("sparse_path_fp", sparse_path_fp), + ], + ) + dense_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("dense_path_fp", dense_path_fp), + ], + ) + + sparse_output = sparse_pipeline.fit_transform(test_smiles) + dense_output = dense_pipeline.fit_transform(test_smiles) + + self.assertTrue(np.all(sparse_output.toarray() == dense_output)) + + def test_output_types(self) -> None: + """Test equality of different output_types.""" + + smi2mol = SmilesToMol() + sparse_path_fp = Mol2PathFP(n_bits=1024, return_as="sparse") + dense_path_fp = Mol2PathFP(n_bits=1024, return_as="dense") + explicit_bit_vect_path_fp = Mol2PathFP( + n_bits=1024, return_as="explicit_bit_vect" + ) + sparse_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("sparse_path_fp", sparse_path_fp), + ], + ) + dense_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("dense_path_fp", dense_path_fp), + ], + ) + explicit_bit_vect_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("explicit_bit_vect_path_fp", explicit_bit_vect_path_fp), + ], + ) + + sparse_output = sparse_pipeline.fit_transform(test_smiles) + dense_output = dense_pipeline.fit_transform(test_smiles) + explicit_bit_vect_path_fp_output = explicit_bit_vect_pipeline.fit_transform( + test_smiles + ) + + self.assertTrue(np.all(sparse_output.toarray() == dense_output)) + + self.assertTrue( # pylint: disable=duplicate-code + np.equal( + dense_output, + np.array(explicit_bit_vect_path_fp_output), + ).all() + ) + + def test_setter_getter(self) -> None: + """Test if the setters and getters work as expected.""" + mol_fp = Mol2PathFP() + params: dict[str, Any] = { + "min_path": 10, + "max_path": 12, + "use_hs": False, + "branched_paths": False, + "use_bond_order": False, + "count_simulation": True, + "num_bits_per_feature": 4, + "counted": True, + "n_bits": 1024, + } + mol_fp.set_params(**params) + self.assertEqual(mol_fp.get_params()["min_path"], 10) + self.assertEqual(mol_fp.get_params()["max_path"], 12) + self.assertEqual(mol_fp.get_params()["use_hs"], False) + self.assertEqual(mol_fp.get_params()["branched_paths"], False) + self.assertEqual(mol_fp.get_params()["use_bond_order"], False) + self.assertEqual(mol_fp.get_params()["count_simulation"], True) + self.assertEqual(mol_fp.get_params()["num_bits_per_feature"], 4) + self.assertEqual(mol_fp.get_params()["counted"], True) + self.assertEqual(mol_fp.get_params()["n_bits"], 1024) + + def test_setter_getter_error_handling(self) -> None: + """Test if the setters and getters work as expected when errors are encountered.""" + + mol_fp = Mol2PathFP() + params: dict[str, Any] = { + "min_path": 2, + "n_bits": 1024, + "return_as": "invalid-option", + } + self.assertRaises(ValueError, mol_fp.set_params, **params) + + +if __name__ == "__main__": + unittest.main() From 7326aea58c90474d0b4abb835c5dc0aea86d239a Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 12:21:39 +0200 Subject: [PATCH 11/24] implement maccs key fingerprint --- .../mol2any/mol2maccs_key_fingerprint.py | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 molpipeline/mol2any/mol2maccs_key_fingerprint.py diff --git a/molpipeline/mol2any/mol2maccs_key_fingerprint.py b/molpipeline/mol2any/mol2maccs_key_fingerprint.py new file mode 100644 index 00000000..701b5ef9 --- /dev/null +++ b/molpipeline/mol2any/mol2maccs_key_fingerprint.py @@ -0,0 +1,77 @@ +"""Implementation of MACCS key fingerprint.""" + +from typing import Literal + +import numpy as np +from numpy import typing as npt +from rdkit.Chem import MACCSkeys +from rdkit.DataStructs import ExplicitBitVect + +from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import ( + MolToFingerprintPipelineElement, +) +from molpipeline.utils.molpipeline_types import RDKitMol + + +class MolToMACCSFP(MolToFingerprintPipelineElement): + """MACCS key fingerprint. + + The MACCS keys are a set of 166 keys that encode the presence or absence of + particular substructures in a molecule. The MACCS keys are a subset of the + PubChem substructure keys. + + """ + + _n_bits = 167 # MACCS keys have 166 bits + 1 bit for an all-zero vector (bit 0) + + def __init__( + self, + return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse", + name: str = "MolToMACCS", + n_jobs: int = 1, + uuid: str | None = None, + ) -> None: + """Initialize MolToMACCS. + + Parameters + ---------- + return_as: Literal["sparse", "dense", "explicit_bit_vect"], optional (default="sparse") + Type of output. When "sparse" the fingerprints will be returned as a + scipy.sparse.csr_matrix holding a sparse representation of the bit vectors. + With "dense" a numpy matrix will be returned. + With "explicit_bit_vect" the fingerprints will be returned as a list of RDKit's + rdkit.DataStructs.cDataStructs.ExplicitBitVect. + name: str, optional (default="MolToMACCS") + Name of PipelineElement + n_jobs: int, optional (default=1) + Number of cores to use. + uuid: str | None, optional (default=None) + UUID of the PipelineElement. + + """ + super().__init__(return_as=return_as, name=name, n_jobs=n_jobs, uuid=uuid) + + def pretransform_single( + self, value: RDKitMol + ) -> dict[int, int] | npt.NDArray[np.int_] | ExplicitBitVect: + """Transform a single molecule to MACCS key fingerprint. + + Parameters + ---------- + value : RDKitMol + RDKit molecule. + + Returns + ------- + dict[int, int] | npt.NDArray[np.int_] | ExplicitBitVect + MACCS key fingerprint. + + """ + fingerprint = MACCSkeys.GenMACCSKeys(value) + if self._return_as == "explicit_bit_vect": + return fingerprint + if self._return_as == "dense": + return np.array(fingerprint) + if self._return_as == "sparse": + return {idx: 1 for idx in fingerprint.GetOnBits()} + raise ValueError(f"Unknown return_as value: {self._return_as}") From 42f5f792a67be636430ab201829e0df6fb785554 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 12:21:52 +0200 Subject: [PATCH 12/24] add maccs key to init --- molpipeline/mol2any/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/molpipeline/mol2any/__init__.py b/molpipeline/mol2any/__init__.py index 3d36171c..cf87f151 100644 --- a/molpipeline/mol2any/__init__.py +++ b/molpipeline/mol2any/__init__.py @@ -3,6 +3,7 @@ from molpipeline.mol2any.mol2bin import MolToBinary from molpipeline.mol2any.mol2concatinated_vector import MolToConcatenatedVector from molpipeline.mol2any.mol2inchi import MolToInchi, MolToInchiKey +from molpipeline.mol2any.mol2maccs_key_fingerprint import MolToMACCSFP from molpipeline.mol2any.mol2morgan_fingerprint import MolToMorganFP from molpipeline.mol2any.mol2net_charge import MolToNetCharge from molpipeline.mol2any.mol2path_fingerprint import Mol2PathFP @@ -13,6 +14,7 @@ "MolToBinary", "MolToConcatenatedVector", "MolToSmiles", + "MolToMACCSFP", "MolToMorganFP", "MolToNetCharge", "Mol2PathFP", From 413bb6a422fdcb3fab4c94ddd6748c83ad87d9bf Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 12:22:06 +0200 Subject: [PATCH 13/24] implement tests for maccs key fp --- .../test_mol2maccs_key_fingerprint.py | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py diff --git a/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py new file mode 100644 index 00000000..382840ed --- /dev/null +++ b/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py @@ -0,0 +1,134 @@ +"""Tests for the MolToFoldedMorganFingerprint pipeline element.""" + +from __future__ import annotations + +import unittest +from typing import Any + +import numpy as np + +from molpipeline import Pipeline +from molpipeline.any2mol import SmilesToMol +from molpipeline.mol2any import MolToMACCSFP + +# pylint: disable=duplicate-code +# Similar to test_mol2morgan_fingerprint.py and test_mol2path_fingerprint.py + +test_smiles = [ + "c1ccccc1", + "c1ccccc1C", + "NCCOCCCC(=O)O", +] + + +class TestMol2MorganFingerprint(unittest.TestCase): + """Unittest for MolToFoldedMorganFingerprint, which calculates folded Morgan Fingerprints.""" + + def test_can_be_constructed(self) -> None: + """Test if the MolToFoldedMorganFingerprint pipeline element can be constructed. + + Returns + ------- + None + """ + mol_fp = MolToMACCSFP() + mol_fp_copy = mol_fp.copy() + self.assertTrue(mol_fp_copy is not mol_fp) + for key, value in mol_fp.get_params().items(): + self.assertEqual(value, mol_fp_copy.get_params()[key]) + mol_fp_recreated = MolToMACCSFP(**mol_fp.get_params()) + for key, value in mol_fp.get_params().items(): + self.assertEqual(value, mol_fp_recreated.get_params()[key]) + + def test_sparse_dense_accordance(self) -> None: + """Test if the calculation of Morgan fingprints in dense and sparse are equal. + + Compared to precalculated values. + + Returns + ------- + None + """ + smi2mol = SmilesToMol() + sparse_maccs = MolToMACCSFP(return_as="sparse") + dense_maccs = MolToMACCSFP(return_as="dense") + sparse_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("sparse_maccs", sparse_maccs), + ], + ) + dense_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("dense_maccs", dense_maccs), + ], + ) + + sparse_output = sparse_pipeline.fit_transform(test_smiles) + dense_output = dense_pipeline.fit_transform(test_smiles) + + self.assertTrue(np.all(sparse_output.toarray() == dense_output)) + + def test_output_types(self) -> None: + """Test equality of different output_types.""" + + smi2mol = SmilesToMol() + sparse_maccs = MolToMACCSFP(return_as="sparse") + dense_maccs = MolToMACCSFP(return_as="dense") + explicit_bit_vect_maccs = MolToMACCSFP(return_as="explicit_bit_vect") + sparse_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("sparse_maccs", sparse_maccs), + ], + ) + dense_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("dense_maccs", dense_maccs), + ], + ) + explicit_bit_vect_pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("explicit_bit_vect_maccs", explicit_bit_vect_maccs), + ], + ) + + sparse_output = sparse_pipeline.fit_transform(test_smiles) + dense_output = dense_pipeline.fit_transform(test_smiles) + explicit_bit_vect_maccs_output = explicit_bit_vect_pipeline.fit_transform( + test_smiles + ) + + self.assertTrue(np.all(sparse_output.toarray() == dense_output)) + + self.assertTrue( + np.equal( + dense_output, + np.array(explicit_bit_vect_maccs_output), + ).all() + ) + + def test_setter_getter(self) -> None: + """Test if the setters and getters work as expected.""" + mol_fp = MolToMACCSFP() + params: dict[str, Any] = { + "return_as": "dense", + } + mol_fp.set_params(**params) + self.assertEqual(mol_fp.get_params()["return_as"], "dense") + + def test_setter_getter_error_handling(self) -> None: + """Test if the setters and getters work as expected when errors are encountered.""" + + mol_fp = MolToMACCSFP() + params: dict[str, Any] = { + "return_as": "invalid-option", + } + self.assertRaises(ValueError, mol_fp.set_params, **params) + + +if __name__ == "__main__": + unittest.main() From 62be9e844dae2d85453aad22fb8e83d02cca591c Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 12:23:37 +0200 Subject: [PATCH 14/24] Fix copy paste error --- .../test_mol2any/test_mol2maccs_key_fingerprint.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py index 382840ed..41fb2510 100644 --- a/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py @@ -1,4 +1,4 @@ -"""Tests for the MolToFoldedMorganFingerprint pipeline element.""" +"""Tests for the MolToMACCSFP pipeline element.""" from __future__ import annotations @@ -21,11 +21,11 @@ ] -class TestMol2MorganFingerprint(unittest.TestCase): - """Unittest for MolToFoldedMorganFingerprint, which calculates folded Morgan Fingerprints.""" +class TestMolToMACCSFP(unittest.TestCase): + """Unittest for MolToMACCSFP, which calculates MACCS Key Fingerprints.""" def test_can_be_constructed(self) -> None: - """Test if the MolToFoldedMorganFingerprint pipeline element can be constructed. + """Test if the MolToMACCSFP pipeline element can be constructed. Returns ------- @@ -41,7 +41,7 @@ def test_can_be_constructed(self) -> None: self.assertEqual(value, mol_fp_recreated.get_params()[key]) def test_sparse_dense_accordance(self) -> None: - """Test if the calculation of Morgan fingprints in dense and sparse are equal. + """Test if the calculation of MACCS Keys for dense and sparse are equal. Compared to precalculated values. From ae044adfc985c23186942b8709033d64f6252925 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 12:46:46 +0200 Subject: [PATCH 15/24] Add tests for counted fingerprint --- .../test_mol2morgan_fingerprint.py | 21 ++++++++++++++++ .../test_mol2any/test_mol2path_fingerprint.py | 25 ++++++++++++++++++- 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py index ce95b6c2..5899c1ca 100644 --- a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py @@ -37,6 +37,27 @@ def test_can_be_constructed(self) -> None: for key, value in mol_fp.get_params().items(): self.assertEqual(value, mol_fp_recreated.get_params()[key]) + def test_counted_bits(self) -> None: + """Test if the option counted bits works as expected. + + Returns + ------- + None + """ + mol_fp = MolToMorganFP(radius=2, n_bits=1024) + smi2mol = SmilesToMol() + pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("mol_fp", mol_fp), + ], + ) + output_binary = pipeline.fit_transform(test_smiles) + pipeline.set_params(mol_fp__counted=True) + output_counted = pipeline.fit_transform(test_smiles) + self.assertTrue(np.all(output_counted.toarray() >= output_binary.toarray())) + + def test_sparse_dense_accordance(self) -> None: """Test if the calculation of Morgan fingprints in dense and sparse are equal. diff --git a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py index 13d724c9..2aa0f78d 100644 --- a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py @@ -11,6 +11,9 @@ from molpipeline.any2mol import SmilesToMol from molpipeline.mol2any import Mol2PathFP + +# pylint: disable=duplicate-code + test_smiles = [ "c1ccccc1", "c1ccccc1C", @@ -103,13 +106,33 @@ def test_output_types(self) -> None: self.assertTrue(np.all(sparse_output.toarray() == dense_output)) - self.assertTrue( # pylint: disable=duplicate-code + self.assertTrue( np.equal( dense_output, np.array(explicit_bit_vect_path_fp_output), ).all() ) + def test_counted_bits(self) -> None: + """Test if the option counted bits works as expected. + + Returns + ------- + None + """ + mol_fp = Mol2PathFP(n_bits=1024) + smi2mol = SmilesToMol() + pipeline = Pipeline( + [ + ("smi2mol", smi2mol), + ("mol_fp", mol_fp), + ], + ) + output_binary = pipeline.fit_transform(test_smiles) + pipeline.set_params(mol_fp__counted=True) + output_counted = pipeline.fit_transform(test_smiles) + self.assertTrue(np.all(output_counted.toarray() >= output_binary.toarray())) + def test_setter_getter(self) -> None: """Test if the setters and getters work as expected.""" mol_fp = Mol2PathFP() From e063dea7a82b87ae610dc1c87caa4806a70b2013 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 12:47:14 +0200 Subject: [PATCH 16/24] black --- tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py | 1 - tests/test_elements/test_mol2any/test_mol2path_fingerprint.py | 1 - 2 files changed, 2 deletions(-) diff --git a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py index 5899c1ca..f17c53c0 100644 --- a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py @@ -57,7 +57,6 @@ def test_counted_bits(self) -> None: output_counted = pipeline.fit_transform(test_smiles) self.assertTrue(np.all(output_counted.toarray() >= output_binary.toarray())) - def test_sparse_dense_accordance(self) -> None: """Test if the calculation of Morgan fingprints in dense and sparse are equal. diff --git a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py index 2aa0f78d..68d391cb 100644 --- a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py @@ -11,7 +11,6 @@ from molpipeline.any2mol import SmilesToMol from molpipeline.mol2any import Mol2PathFP - # pylint: disable=duplicate-code test_smiles = [ From 1dd0f46abea837b5f83af649b0109fd0c9412335 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 15:56:22 +0200 Subject: [PATCH 17/24] Use rdkit structures to compile data for sparse fp --- .../mol2any/mol2bitvector.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py index bd07b5fd..6bbc3f10 100644 --- a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py +++ b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py @@ -259,20 +259,23 @@ def pretransform_single( Dictionary with feature-position as key and count as value. """ fingerprint_generator = self._get_fp_generator() - if self._return_as == "explicit_bit_vect": + if self._return_as == "dense": if self.counted: - return fingerprint_generator.GetCountFingerprint(value) - return fingerprint_generator.GetFingerprint(value) + return fingerprint_generator.GetCountFingerprintAsNumPy(value) + return fingerprint_generator.GetFingerprintAsNumPy(value) if self.counted: - fingerprint = fingerprint_generator.GetCountFingerprintAsNumPy(value) + fingerprint = fingerprint_generator.GetCountFingerprint(value) else: - fingerprint = fingerprint_generator.GetFingerprintAsNumPy(value) + fingerprint = fingerprint_generator.GetFingerprint(value) - if self._return_as == "dense": + if self._return_as == "explicit_bit_vect": return fingerprint - return {pos: count for pos, count in enumerate(fingerprint) if count > 0} + if self.counted: + return fingerprint.GetNonzeroElements() + + return {pos: 1 for pos in fingerprint.GetOnBits()} def get_params(self, deep: bool = True) -> dict[str, Any]: """Get object parameters relevant for copying the class. From e15c7c3f39ca6e65686ef532e5a2dd95a4c2267b Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 16:11:36 +0200 Subject: [PATCH 18/24] Update docstrings --- .../abstract_pipeline_elements/mol2any/mol2bitvector.py | 6 ++++-- molpipeline/mol2any/mol2path_fingerprint.py | 2 +- .../test_elements/test_mol2any/test_mol2path_fingerprint.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py index 6bbc3f10..66317966 100644 --- a/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py +++ b/molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py @@ -255,8 +255,10 @@ def pretransform_single( Returns ------- - dict[int, int] - Dictionary with feature-position as key and count as value. + ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int] + If return_as is "explicit_bit_vect" return ExplicitBitVect. + If return_as is "dense" return numpy array. + If return_as is "sparse" return dictionary with feature-position as key and count as value. """ fingerprint_generator = self._get_fp_generator() if self._return_as == "dense": diff --git a/molpipeline/mol2any/mol2path_fingerprint.py b/molpipeline/mol2any/mol2path_fingerprint.py index b04c325f..aab45014 100644 --- a/molpipeline/mol2any/mol2path_fingerprint.py +++ b/molpipeline/mol2any/mol2path_fingerprint.py @@ -1,4 +1,4 @@ -"""Implementations for the Morgan fingerprint.""" +"""Implementations for the RDKit Path Fingerprint.""" from __future__ import annotations # for all the python 3.8 users out there. diff --git a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py index 68d391cb..306325cd 100644 --- a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py @@ -24,7 +24,7 @@ class TestMol2PathFingerprint(unittest.TestCase): """Unittest for Mol2PathFP, which calculates the RDKit Path Fingerprint.""" def test_can_be_constructed(self) -> None: - """Test if the MolToFoldedMorganFingerprint pipeline element can be constructed. + """Test if the Mol2PathFP pipeline element can be constructed. Returns ------- @@ -40,7 +40,7 @@ def test_can_be_constructed(self) -> None: self.assertEqual(value, mol_fp_recreated.get_params()[key]) def test_sparse_dense_accordance(self) -> None: - """Test if the calculation of Morgan fingprints in dense and sparse are equal. + """Test if the calculation of path-fingprints in dense and sparse are equal. Compared to precalculated values. From c347c5f0df07e197eea63465b4530d75a2fadcf6 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 16:22:09 +0200 Subject: [PATCH 19/24] disallow fingerprints of length 0 --- molpipeline/mol2any/mol2morgan_fingerprint.py | 4 ++-- molpipeline/mol2any/mol2path_fingerprint.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/molpipeline/mol2any/mol2morgan_fingerprint.py b/molpipeline/mol2any/mol2morgan_fingerprint.py index d7a0e10b..7045c222 100644 --- a/molpipeline/mol2any/mol2morgan_fingerprint.py +++ b/molpipeline/mol2any/mol2morgan_fingerprint.py @@ -78,11 +78,11 @@ def __init__( n_jobs=n_jobs, uuid=uuid, ) - if isinstance(n_bits, int) and n_bits >= 0: + if isinstance(n_bits, int) and n_bits > 0: self._n_bits = n_bits else: raise ValueError( - f"Number of bits has to be a positive integer! (Received: {n_bits})" + f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})" ) def get_params(self, deep: bool = True) -> dict[str, Any]: diff --git a/molpipeline/mol2any/mol2path_fingerprint.py b/molpipeline/mol2any/mol2path_fingerprint.py index aab45014..7bbdb5e1 100644 --- a/molpipeline/mol2any/mol2path_fingerprint.py +++ b/molpipeline/mol2any/mol2path_fingerprint.py @@ -97,11 +97,11 @@ def __init__( n_jobs=n_jobs, uuid=uuid, ) - if isinstance(n_bits, int) and n_bits >= 0: + if isinstance(n_bits, int) and n_bits > 0: self._n_bits = n_bits else: raise ValueError( - f"Number of bits has to be a positive integer! (Received: {n_bits})" + f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})" ) self._min_path = min_path self._max_path = max_path From d53800c89fc289477b1e44aa6811038659c01e2a Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 16:25:21 +0200 Subject: [PATCH 20/24] switch variable check to guard format --- molpipeline/mol2any/mol2morgan_fingerprint.py | 5 ++--- molpipeline/mol2any/mol2path_fingerprint.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/molpipeline/mol2any/mol2morgan_fingerprint.py b/molpipeline/mol2any/mol2morgan_fingerprint.py index 7045c222..2f079e38 100644 --- a/molpipeline/mol2any/mol2morgan_fingerprint.py +++ b/molpipeline/mol2any/mol2morgan_fingerprint.py @@ -78,12 +78,11 @@ def __init__( n_jobs=n_jobs, uuid=uuid, ) - if isinstance(n_bits, int) and n_bits > 0: - self._n_bits = n_bits - else: + if not isinstance(n_bits, int) or n_bits < 1: raise ValueError( f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})" ) + self._n_bits = n_bits def get_params(self, deep: bool = True) -> dict[str, Any]: """Return all parameters defining the object. diff --git a/molpipeline/mol2any/mol2path_fingerprint.py b/molpipeline/mol2any/mol2path_fingerprint.py index 7bbdb5e1..368e8cb7 100644 --- a/molpipeline/mol2any/mol2path_fingerprint.py +++ b/molpipeline/mol2any/mol2path_fingerprint.py @@ -97,12 +97,11 @@ def __init__( n_jobs=n_jobs, uuid=uuid, ) - if isinstance(n_bits, int) and n_bits > 0: - self._n_bits = n_bits - else: + if not isinstance(n_bits, int) or n_bits < 1: raise ValueError( f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})" ) + self._n_bits = n_bits self._min_path = min_path self._max_path = max_path self._use_hs = use_hs From 98898e3010d997eeba96dfcab0b0e5538065601e Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 16:27:54 +0200 Subject: [PATCH 21/24] remove redundant tests --- .../test_mol2maccs_key_fingerprint.py | 30 ------------------- .../test_mol2morgan_fingerprint.py | 30 ------------------- .../test_mol2any/test_mol2path_fingerprint.py | 30 ------------------- 3 files changed, 90 deletions(-) diff --git a/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py index 41fb2510..553c28ec 100644 --- a/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2maccs_key_fingerprint.py @@ -40,36 +40,6 @@ def test_can_be_constructed(self) -> None: for key, value in mol_fp.get_params().items(): self.assertEqual(value, mol_fp_recreated.get_params()[key]) - def test_sparse_dense_accordance(self) -> None: - """Test if the calculation of MACCS Keys for dense and sparse are equal. - - Compared to precalculated values. - - Returns - ------- - None - """ - smi2mol = SmilesToMol() - sparse_maccs = MolToMACCSFP(return_as="sparse") - dense_maccs = MolToMACCSFP(return_as="dense") - sparse_pipeline = Pipeline( - [ - ("smi2mol", smi2mol), - ("sparse_maccs", sparse_maccs), - ], - ) - dense_pipeline = Pipeline( - [ - ("smi2mol", smi2mol), - ("dense_maccs", dense_maccs), - ], - ) - - sparse_output = sparse_pipeline.fit_transform(test_smiles) - dense_output = dense_pipeline.fit_transform(test_smiles) - - self.assertTrue(np.all(sparse_output.toarray() == dense_output)) - def test_output_types(self) -> None: """Test equality of different output_types.""" diff --git a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py index f17c53c0..3a1812a8 100644 --- a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py @@ -57,36 +57,6 @@ def test_counted_bits(self) -> None: output_counted = pipeline.fit_transform(test_smiles) self.assertTrue(np.all(output_counted.toarray() >= output_binary.toarray())) - def test_sparse_dense_accordance(self) -> None: - """Test if the calculation of Morgan fingprints in dense and sparse are equal. - - Compared to precalculated values. - - Returns - ------- - None - """ - smi2mol = SmilesToMol() - sparse_morgan = MolToMorganFP(radius=2, n_bits=1024, return_as="sparse") - dense_morgan = MolToMorganFP(radius=2, n_bits=1024, return_as="dense") - sparse_pipeline = Pipeline( - [ - ("smi2mol", smi2mol), - ("sparse_morgan", sparse_morgan), - ], - ) - dense_pipeline = Pipeline( - [ - ("smi2mol", smi2mol), - ("dense_morgan", dense_morgan), - ], - ) - - sparse_output = sparse_pipeline.fit_transform(test_smiles) - dense_output = dense_pipeline.fit_transform(test_smiles) - - self.assertTrue(np.all(sparse_output.toarray() == dense_output)) - def test_output_types(self) -> None: """Test equality of different output_types.""" diff --git a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py index 306325cd..2d0045e9 100644 --- a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py @@ -39,36 +39,6 @@ def test_can_be_constructed(self) -> None: for key, value in mol_fp.get_params().items(): self.assertEqual(value, mol_fp_recreated.get_params()[key]) - def test_sparse_dense_accordance(self) -> None: - """Test if the calculation of path-fingprints in dense and sparse are equal. - - Compared to precalculated values. - - Returns - ------- - None - """ - smi2mol = SmilesToMol() - sparse_path_fp = Mol2PathFP(n_bits=1024, return_as="sparse") - dense_path_fp = Mol2PathFP(n_bits=1024, return_as="dense") - sparse_pipeline = Pipeline( - [ - ("smi2mol", smi2mol), - ("sparse_path_fp", sparse_path_fp), - ], - ) - dense_pipeline = Pipeline( - [ - ("smi2mol", smi2mol), - ("dense_path_fp", dense_path_fp), - ], - ) - - sparse_output = sparse_pipeline.fit_transform(test_smiles) - dense_output = dense_pipeline.fit_transform(test_smiles) - - self.assertTrue(np.all(sparse_output.toarray() == dense_output)) - def test_output_types(self) -> None: """Test equality of different output_types.""" From 627af9a9faf4b29a412d9837191475352f121509 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 16:31:41 +0200 Subject: [PATCH 22/24] check if at least on bit is greater than the non_counted fp --- tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py | 1 + tests/test_elements/test_mol2any/test_mol2path_fingerprint.py | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py index 3a1812a8..22891ec4 100644 --- a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py @@ -56,6 +56,7 @@ def test_counted_bits(self) -> None: pipeline.set_params(mol_fp__counted=True) output_counted = pipeline.fit_transform(test_smiles) self.assertTrue(np.all(output_counted.toarray() >= output_binary.toarray())) + self.assertTrue(np.any(output_counted.toarray() > output_binary.toarray())) def test_output_types(self) -> None: """Test equality of different output_types.""" diff --git a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py index 2d0045e9..960c4dbc 100644 --- a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py @@ -101,6 +101,7 @@ def test_counted_bits(self) -> None: pipeline.set_params(mol_fp__counted=True) output_counted = pipeline.fit_transform(test_smiles) self.assertTrue(np.all(output_counted.toarray() >= output_binary.toarray())) + self.assertTrue(np.any(output_counted.toarray() > output_binary.toarray())) def test_setter_getter(self) -> None: """Test if the setters and getters work as expected.""" From ee2c00d09e8a434337da8c8b248f5ed4e2790b38 Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 16:36:13 +0200 Subject: [PATCH 23/24] switch to dense --- .../test_mol2any/test_mol2morgan_fingerprint.py | 6 +++--- .../test_elements/test_mol2any/test_mol2path_fingerprint.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py index 22891ec4..d4a65fb5 100644 --- a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py @@ -44,7 +44,7 @@ def test_counted_bits(self) -> None: ------- None """ - mol_fp = MolToMorganFP(radius=2, n_bits=1024) + mol_fp = MolToMorganFP(radius=2, n_bits=1024, return_as="dense") smi2mol = SmilesToMol() pipeline = Pipeline( [ @@ -55,8 +55,8 @@ def test_counted_bits(self) -> None: output_binary = pipeline.fit_transform(test_smiles) pipeline.set_params(mol_fp__counted=True) output_counted = pipeline.fit_transform(test_smiles) - self.assertTrue(np.all(output_counted.toarray() >= output_binary.toarray())) - self.assertTrue(np.any(output_counted.toarray() > output_binary.toarray())) + self.assertTrue(np.all(output_counted >= output_binary)) + self.assertTrue(np.any(output_counted > output_binary)) def test_output_types(self) -> None: """Test equality of different output_types.""" diff --git a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py index 960c4dbc..d3486782 100644 --- a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py @@ -89,7 +89,7 @@ def test_counted_bits(self) -> None: ------- None """ - mol_fp = Mol2PathFP(n_bits=1024) + mol_fp = Mol2PathFP(n_bits=1024, return_as="dense") smi2mol = SmilesToMol() pipeline = Pipeline( [ @@ -100,8 +100,8 @@ def test_counted_bits(self) -> None: output_binary = pipeline.fit_transform(test_smiles) pipeline.set_params(mol_fp__counted=True) output_counted = pipeline.fit_transform(test_smiles) - self.assertTrue(np.all(output_counted.toarray() >= output_binary.toarray())) - self.assertTrue(np.any(output_counted.toarray() > output_binary.toarray())) + self.assertTrue(np.all(output_counted >= output_binary)) + self.assertTrue(np.any(output_counted > output_binary)) def test_setter_getter(self) -> None: """Test if the setters and getters work as expected.""" From 5ac0809e0cd2c8cf8132dc098426537249d5a23c Mon Sep 17 00:00:00 2001 From: Christian Feldmann Date: Tue, 18 Jun 2024 16:39:18 +0200 Subject: [PATCH 24/24] check equal non-zero positions --- .../test_elements/test_mol2any/test_mol2morgan_fingerprint.py | 3 +++ tests/test_elements/test_mol2any/test_mol2path_fingerprint.py | 3 +++ 2 files changed, 6 insertions(+) diff --git a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py index d4a65fb5..3a5e94a9 100644 --- a/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2morgan_fingerprint.py @@ -55,6 +55,9 @@ def test_counted_bits(self) -> None: output_binary = pipeline.fit_transform(test_smiles) pipeline.set_params(mol_fp__counted=True) output_counted = pipeline.fit_transform(test_smiles) + self.assertTrue( + np.all(np.flatnonzero(output_counted) == np.flatnonzero(output_binary)) + ) self.assertTrue(np.all(output_counted >= output_binary)) self.assertTrue(np.any(output_counted > output_binary)) diff --git a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py index d3486782..691abfb9 100644 --- a/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py +++ b/tests/test_elements/test_mol2any/test_mol2path_fingerprint.py @@ -100,6 +100,9 @@ def test_counted_bits(self) -> None: output_binary = pipeline.fit_transform(test_smiles) pipeline.set_params(mol_fp__counted=True) output_counted = pipeline.fit_transform(test_smiles) + self.assertTrue( + np.all(np.flatnonzero(output_counted) == np.flatnonzero(output_binary)) + ) self.assertTrue(np.all(output_counted >= output_binary)) self.assertTrue(np.any(output_counted > output_binary))