Skip to content

Commit

Permalink
27 extend to more fingerprints (#29)
Browse files Browse the repository at this point in the history
Add counted fingerprints, RDKitPath FP, and MACCS Key Fingerprint
  • Loading branch information
c-w-feldmann authored and JochenSiegWork committed Jun 27, 2024
1 parent c874430 commit f7bee38
Show file tree
Hide file tree
Showing 10 changed files with 712 additions and 51 deletions.
1 change: 1 addition & 0 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install "numpy<2.0.0"
pip install mypy
mypy . || exit_code=$?
mypy --install-types --non-interactive
Expand Down
131 changes: 130 additions & 1 deletion molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import numpy as np
import numpy.typing as npt
from rdkit.Chem import rdFingerprintGenerator
from rdkit.DataStructs import ExplicitBitVect
from scipy import sparse

Expand Down Expand Up @@ -196,14 +197,139 @@ def pretransform_single(
"""


class ABCMorganFingerprintPipelineElement(MolToFingerprintPipelineElement, abc.ABC):
class MolToRDKitGenFPElement(MolToFingerprintPipelineElement, abc.ABC):
    """Abstract base for elements backed by an RDKit fingerprint generator.

    Subclasses provide the generator through ``_get_fp_generator``; this class
    turns the generator output into the representation selected by
    ``return_as`` and handles the ``counted`` parameter.

    Note: ``FingeprintGenerator64`` (sic) is the spelling used in the
    annotations below; it is the name referenced from ``rdFingerprintGenerator``.
    """

    def __init__(
        self,
        counted: bool = False,
        return_as: OutputDatatype = "sparse",
        name: str = "MolToRDKitGenFin",
        n_jobs: int = 1,
        uuid: Optional[str] = None,
    ):
        """Initialize abstract class.

        Parameters
        ----------
        counted: bool
            If True, count fingerprints are generated instead of bit fingerprints.
        return_as: Literal["sparse", "dense", "explicit_bit_vect"]
            Type of output. When "sparse" the fingerprints will be returned as a
            scipy.sparse.csr_matrix.
        name: str
            Name of PipelineElement.
        n_jobs: int
            Number of jobs.
        uuid: Optional[str]
            Unique identifier.
        """
        super().__init__(
            return_as=return_as,
            name=name,
            n_jobs=n_jobs,
            uuid=uuid,
        )
        self.counted = counted

    @abc.abstractmethod
    def _get_fp_generator(self) -> rdFingerprintGenerator.FingeprintGenerator64:
        """Create the fingerprint generator used by ``pretransform_single``.

        Returns
        -------
        rdFingerprintGenerator.FingeprintGenerator64
            Fingerprint generator.
        """

    def pretransform_single(
        self, value: RDKitMol
    ) -> ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int]:
        """Compute the fingerprint of a single molecule.

        Parameters
        ----------
        value: RDKitMol
            Molecule for which the fingerprint is generated.

        Returns
        -------
        ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int]
            If return_as is "dense": the numpy array produced by the generator
            (count variant when ``counted`` is set).
            If return_as is "explicit_bit_vect": the fingerprint object returned
            by the generator, i.e. the result of ``GetCountFingerprint`` when
            ``counted`` is set and of ``GetFingerprint`` otherwise.
            Otherwise (sparse): a dictionary mapping feature position to value.
            The value is the count when ``counted`` is set and 1 for every on
            bit otherwise.
        """
        generator = self._get_fp_generator()
        output_type = self._return_as

        if output_type == "dense":
            # Pick the numpy-producing method matching the counted flag.
            as_numpy = (
                generator.GetCountFingerprintAsNumPy
                if self.counted
                else generator.GetFingerprintAsNumPy
            )
            return as_numpy(value)

        # Both remaining output types start from the RDKit fingerprint object.
        build_fingerprint = (
            generator.GetCountFingerprint if self.counted else generator.GetFingerprint
        )
        rdkit_fp = build_fingerprint(value)

        if output_type == "explicit_bit_vect":
            return rdkit_fp

        # Any other value of return_as is handled as the sparse representation.
        if self.counted:
            return rdkit_fp.GetNonzeroElements()
        return dict.fromkeys(rdkit_fp.GetOnBits(), 1)

    def get_params(self, deep: bool = True) -> dict[str, Any]:
        """Get object parameters relevant for copying the class.

        Parameters
        ----------
        deep: bool
            If True get a deep copy of the parameters.

        Returns
        -------
        dict[str, Any]
            Parameters of the parent class extended by "counted".
        """
        params = super().get_params(deep)
        # For a deep copy a fresh bool is stored instead of the attribute itself.
        params["counted"] = bool(self.counted) if deep else self.counted
        return params

    def set_params(self, **parameters: dict[str, Any]) -> Self:
        """Set object parameters relevant for copying the class.

        Parameters
        ----------
        parameters: dict[str, Any]
            Dictionary of parameter names and values. "counted" is consumed
            here; everything else is forwarded to the parent class.

        Returns
        -------
        Self
            This object with updated parameters.
        """
        # Work on a copy so the "counted" entry can be removed before forwarding.
        remaining = dict(parameters)
        new_counted = remaining.pop("counted", None)
        if new_counted is not None:
            self.counted = bool(new_counted)
        super().set_params(**remaining)
        return self


class ABCMorganFingerprintPipelineElement(MolToRDKitGenFPElement, abc.ABC):
"""Abstract Class for Morgan fingerprints."""

# pylint: disable=R0913
def __init__(
self,
radius: int = 2,
use_features: bool = False,
counted: bool = False,
return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse",
name: str = "AbstractMorgan",
n_jobs: int = 1,
Expand All @@ -217,6 +343,8 @@ def __init__(
Radius of fingerprint.
use_features: bool
Whether to represent atoms by element or category (donor, acceptor, etc.)
counted: bool
Whether to count the bits or not.
return_as: Literal["sparse", "dense", "explicit_bit_vect"]
Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix
holding a sparse representation of the bit vectors. With "dense" a numpy matrix will be returned.
Expand All @@ -232,6 +360,7 @@ def __init__(
# pylint: disable=R0801
super().__init__(
return_as=return_as,
counted=counted,
name=name,
n_jobs=n_jobs,
uuid=uuid,
Expand Down
4 changes: 4 additions & 0 deletions molpipeline/mol2any/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,21 @@
from molpipeline.mol2any.mol2bin import MolToBinary
from molpipeline.mol2any.mol2concatinated_vector import MolToConcatenatedVector
from molpipeline.mol2any.mol2inchi import MolToInchi, MolToInchiKey
from molpipeline.mol2any.mol2maccs_key_fingerprint import MolToMACCSFP
from molpipeline.mol2any.mol2morgan_fingerprint import MolToMorganFP
from molpipeline.mol2any.mol2net_charge import MolToNetCharge
from molpipeline.mol2any.mol2path_fingerprint import Mol2PathFP
from molpipeline.mol2any.mol2rdkit_phys_chem import MolToRDKitPhysChem
from molpipeline.mol2any.mol2smiles import MolToSmiles

__all__ = [
"MolToBinary",
"MolToConcatenatedVector",
"MolToSmiles",
"MolToMACCSFP",
"MolToMorganFP",
"MolToNetCharge",
"Mol2PathFP",
"MolToInchi",
"MolToInchiKey",
"MolToRDKitPhysChem",
Expand Down
77 changes: 77 additions & 0 deletions molpipeline/mol2any/mol2maccs_key_fingerprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Implementation of MACCS key fingerprint."""

from typing import Literal

import numpy as np
from numpy import typing as npt
from rdkit.Chem import MACCSkeys
from rdkit.DataStructs import ExplicitBitVect

from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import (
MolToFingerprintPipelineElement,
)
from molpipeline.utils.molpipeline_types import RDKitMol


class MolToMACCSFP(MolToFingerprintPipelineElement):
    """Pipeline element computing the MACCS key fingerprint of a molecule.

    The MACCS keys are a set of 166 keys that encode the presence or absence of
    particular substructures in a molecule.
    """

    _n_bits = 167  # MACCS keys have 166 bits + 1 bit for an all-zero vector (bit 0)

    def __init__(
        self,
        return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse",
        name: str = "MolToMACCS",
        n_jobs: int = 1,
        uuid: str | None = None,
    ) -> None:
        """Initialize MolToMACCS.

        Parameters
        ----------
        return_as: Literal["sparse", "dense", "explicit_bit_vect"], optional (default="sparse")
            Type of output. When "sparse" the fingerprints will be returned as a
            scipy.sparse.csr_matrix holding a sparse representation of the bit vectors.
            With "dense" a numpy matrix will be returned.
            With "explicit_bit_vect" the fingerprints will be returned as a list of RDKit's
            rdkit.DataStructs.cDataStructs.ExplicitBitVect.
        name: str, optional (default="MolToMACCS")
            Name of PipelineElement
        n_jobs: int, optional (default=1)
            Number of cores to use.
        uuid: str | None, optional (default=None)
            UUID of the PipelineElement.
        """
        super().__init__(return_as=return_as, name=name, n_jobs=n_jobs, uuid=uuid)

    def pretransform_single(
        self, value: RDKitMol
    ) -> dict[int, int] | npt.NDArray[np.int_] | ExplicitBitVect:
        """Transform a single molecule to MACCS key fingerprint.

        Parameters
        ----------
        value : RDKitMol
            RDKit molecule.

        Returns
        -------
        dict[int, int] | npt.NDArray[np.int_] | ExplicitBitVect
            For "sparse" a dictionary mapping each on-bit index to 1,
            for "dense" a numpy array built from the bit vector,
            for "explicit_bit_vect" the bit vector returned by RDKit.

        Raises
        ------
        ValueError
            If return_as is none of the three supported values.
        """
        # The keys are generated before the output type is inspected.
        maccs_keys = MACCSkeys.GenMACCSKeys(value)
        output_type = self._return_as

        if output_type == "sparse":
            return dict.fromkeys(maccs_keys.GetOnBits(), 1)
        if output_type == "dense":
            return np.array(maccs_keys)
        if output_type == "explicit_bit_vect":
            return maccs_keys
        raise ValueError(f"Unknown return_as value: {self._return_as}")
44 changes: 13 additions & 31 deletions molpipeline/mol2any/mol2morgan_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@

import copy

import numpy as np
import numpy.typing as npt
from rdkit.Chem import AllChem, rdFingerprintGenerator
from rdkit.DataStructs import ExplicitBitVect

from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import (
ABCMorganFingerprintPipelineElement,
Expand All @@ -35,6 +32,7 @@ def __init__(
radius: int = 2,
use_features: bool = False,
n_bits: int = 2048,
counted: bool = False,
return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse",
name: str = "MolToMorganFP",
n_jobs: int = 1,
Expand All @@ -50,6 +48,9 @@ def __init__(
Instead of atoms, features are encoded in the fingerprint. [2]
n_bits: int, optional (default=2048)
Size of fingerprint.
counted: bool, optional (default=False)
If True, the fingerprint will be counted.
If False, the fingerprint will be binary.
return_as: Literal["sparse", "dense", "explicit_bit_vect"]
Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix
holding a sparse representation of the bit vectors. With "dense" a numpy matrix will be returned.
Expand All @@ -71,17 +72,17 @@ def __init__(
super().__init__(
radius=radius,
use_features=use_features,
counted=counted,
return_as=return_as,
name=name,
n_jobs=n_jobs,
uuid=uuid,
)
if isinstance(n_bits, int) and n_bits >= 0:
self._n_bits = n_bits
else:
if not isinstance(n_bits, int) or n_bits < 1:
raise ValueError(
f"Number of bits has to be a positive integer! (Received: {n_bits})"
f"Number of bits has to be a positve integer, which is > 0! (Received: {n_bits})"
)
self._n_bits = n_bits

def get_params(self, deep: bool = True) -> dict[str, Any]:
"""Return all parameters defining the object.
Expand Down Expand Up @@ -124,38 +125,19 @@ def set_params(self, **parameters: dict[str, Any]) -> Self:

return self

def pretransform_single(
self, value: RDKitMol
) -> ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int]:
"""Transform a single compound to a dictionary.
Keys denote the feature position, values the count. Here always 1.
Parameters
----------
value: RDKitMol
Molecule for which the fingerprint is generated.
def _get_fp_generator(self) -> rdFingerprintGenerator.FingerprintGenerator:
"""Get the fingerprint generator.
Returns
-------
dict[int, int]
Dictionary with feature-position as key and count as value.
rdFingerprintGenerator.FingerprintGenerator
RDKit fingerprint generator.
"""
fingerprint_generator = rdFingerprintGenerator.GetMorganGenerator(
return rdFingerprintGenerator.GetMorganGenerator(
radius=self.radius,
fpSize=self._n_bits,
)

if self._return_as == "explicit_bit_vect":
return fingerprint_generator.GetFingerprint(value)
if self._return_as == "dense":
return fingerprint_generator.GetFingerprintAsNumPy(value)
# sparse return type
return {
bit_idx: 1
for bit_idx in fingerprint_generator.GetFingerprint(value).GetOnBits()
}

def _explain_rdmol(self, mol_obj: RDKitMol) -> dict[int, list[tuple[int, int]]]:
"""Get central atom and radius of all features in molecule.
Expand Down
Loading

0 comments on commit f7bee38

Please sign in to comment.