Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

27 extend to more fingerprints #29

Merged
merged 24 commits into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/linting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install "numpy<2.0.0"
pip install mypy
mypy . || exit_code=$?
mypy --install-types --non-interactive
Expand Down
126 changes: 125 additions & 1 deletion molpipeline/abstract_pipeline_elements/mol2any/mol2bitvector.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import numpy as np
import numpy.typing as npt
from rdkit.Chem import rdFingerprintGenerator
from rdkit.DataStructs import ExplicitBitVect
from scipy import sparse

Expand Down Expand Up @@ -196,14 +197,134 @@ def pretransform_single(
"""


class ABCMorganFingerprintPipelineElement(MolToFingerprintPipelineElement, abc.ABC):
class MolToRDKitGenFPElement(MolToFingerprintPipelineElement, abc.ABC):
    """Abstract class for PipelineElements using the FingeprintGenerator64.

    Subclasses supply the configured generator through ``_get_fp_generator``.
    This class turns the generator output into the representation selected by
    ``return_as`` and handles the ``counted`` parameter.
    """

    def __init__(
        self,
        counted: bool = False,
        return_as: OutputDatatype = "sparse",
        name: str = "MolToRDKitGenFin",
        n_jobs: int = 1,
        uuid: Optional[str] = None,
    ) -> None:
        """Initialize abstract class.

        Parameters
        ----------
        counted: bool
            If True, feature counts are returned instead of presence bits.
        return_as: Literal["sparse", "dense", "explicit_bit_vect"]
            Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix
        name: str
            Name of PipelineElement.
        n_jobs: int
            Number of jobs.
        uuid: Optional[str]
            Unique identifier.
        """
        super().__init__(
            return_as=return_as,
            name=name,
            n_jobs=n_jobs,
            uuid=uuid,
        )
        self.counted = counted

    @abc.abstractmethod
    def _get_fp_generator(self) -> rdFingerprintGenerator.FingeprintGenerator64:
        """Get fingerprint generator.

        Returns
        -------
        rdFingerprintGenerator.FingeprintGenerator64
            Fingerprint generator.
        """

    def pretransform_single(
        self, value: RDKitMol
    ) -> ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int]:
        """Transform a single molecule to its fingerprint.

        The representation depends on ``return_as``:

        * "explicit_bit_vect": the RDKit vector object produced by the generator
          (``GetCountFingerprint`` if ``counted`` else ``GetFingerprint``).
        * "dense": the numpy array produced by the generator.
        * otherwise (sparse): a dictionary holding only the non-zero positions.

        Parameters
        ----------
        value: RDKitMol
            Molecule for which the fingerprint is generated.

        Returns
        -------
        ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int]
            Fingerprint of the molecule. For the dictionary form, keys are the
            feature positions and values are the counts (always 1 unless
            ``counted`` is True).
        """
        fingerprint_generator = self._get_fp_generator()
        if self._return_as == "explicit_bit_vect":
            if self.counted:
                # NOTE(review): RDKit returns a sparse count vector here, not an
                # ExplicitBitVect - confirm downstream consumers accept it.
                return fingerprint_generator.GetCountFingerprint(value)
            return fingerprint_generator.GetFingerprint(value)

        if self.counted:
            fingerprint = fingerprint_generator.GetCountFingerprintAsNumPy(value)
        else:
            fingerprint = fingerprint_generator.GetFingerprintAsNumPy(value)

        if self._return_as == "dense":
            return fingerprint

        # Select the populated positions in numpy instead of looping over every
        # bit in Python; tolist() yields plain ints for both keys and values.
        on_positions = np.flatnonzero(fingerprint)
        return dict(zip(on_positions.tolist(), fingerprint[on_positions].tolist()))

    def get_params(self, deep: bool = True) -> dict[str, Any]:
        """Get object parameters relevant for copying the class.

        Parameters
        ----------
        deep: bool
            If True get a deep copy of the parameters.

        Returns
        -------
        dict[str, Any]
            Dictionary of parameter names and values.
        """
        parameters = super().get_params(deep)
        if deep:
            # bool() creates an independent value for the copied parameter set.
            parameters["counted"] = bool(self.counted)
        else:
            parameters["counted"] = self.counted

        return parameters

    def set_params(self, **parameters: dict[str, Any]) -> Self:
        """Set object parameters relevant for copying the class.

        Parameters
        ----------
        parameters: dict[str, Any]
            Dictionary of parameter names and values.

        Returns
        -------
        Self
            This object with updated parameters.
        """
        # Work on a copy so the "counted" entry can be removed before the
        # remaining parameters are forwarded to the parent class.
        parameter_dict_copy = dict(parameters)
        counted = parameter_dict_copy.pop("counted", None)
        if counted is not None:
            self.counted = bool(counted)
        super().set_params(**parameter_dict_copy)
        return self


class ABCMorganFingerprintPipelineElement(MolToRDKitGenFPElement, abc.ABC):
"""Abstract Class for Morgan fingerprints."""

# pylint: disable=R0913
def __init__(
self,
radius: int = 2,
use_features: bool = False,
counted: bool = False,
return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse",
name: str = "AbstractMorgan",
n_jobs: int = 1,
Expand All @@ -217,6 +338,8 @@ def __init__(
Radius of fingerprint.
use_features: bool
Whether to represent atoms by element or category (donor, acceptor. etc.)
counted: bool
Whether to count the bits or not.
return_as: Literal["sparse", "dense", "explicit_bit_vect"]
Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix
holding a sparse representation of the bit vectors. With "dense" a numpy matrix will be returned.
Expand All @@ -232,6 +355,7 @@ def __init__(
# pylint: disable=R0801
super().__init__(
return_as=return_as,
counted=counted,
name=name,
n_jobs=n_jobs,
uuid=uuid,
Expand Down
4 changes: 4 additions & 0 deletions molpipeline/mol2any/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,21 @@
from molpipeline.mol2any.mol2bin import MolToBinary
from molpipeline.mol2any.mol2concatinated_vector import MolToConcatenatedVector
from molpipeline.mol2any.mol2inchi import MolToInchi, MolToInchiKey
from molpipeline.mol2any.mol2maccs_key_fingerprint import MolToMACCSFP
from molpipeline.mol2any.mol2morgan_fingerprint import MolToMorganFP
from molpipeline.mol2any.mol2net_charge import MolToNetCharge
from molpipeline.mol2any.mol2path_fingerprint import Mol2PathFP
from molpipeline.mol2any.mol2rdkit_phys_chem import MolToRDKitPhysChem
from molpipeline.mol2any.mol2smiles import MolToSmiles

__all__ = [
"MolToBinary",
"MolToConcatenatedVector",
"MolToSmiles",
"MolToMACCSFP",
"MolToMorganFP",
"MolToNetCharge",
"Mol2PathFP",
"MolToInchi",
"MolToInchiKey",
"MolToRDKitPhysChem",
Expand Down
77 changes: 77 additions & 0 deletions molpipeline/mol2any/mol2maccs_key_fingerprint.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
"""Implementation of MACCS key fingerprint."""

from typing import Literal

import numpy as np
from numpy import typing as npt
from rdkit.Chem import MACCSkeys
from rdkit.DataStructs import ExplicitBitVect

from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import (
MolToFingerprintPipelineElement,
)
from molpipeline.utils.molpipeline_types import RDKitMol


class MolToMACCSFP(MolToFingerprintPipelineElement):
    """MACCS key fingerprint.

    The MACCS keys are a set of 166 keys that encode the presence or absence of
    particular substructures in a molecule.

    """

    _n_bits = 167  # MACCS keys have 166 bits + 1 bit for an all-zero vector (bit 0)

    def __init__(
        self,
        return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse",
        name: str = "MolToMACCS",
        n_jobs: int = 1,
        uuid: str | None = None,
    ) -> None:
        """Initialize MolToMACCS.

        Parameters
        ----------
        return_as: Literal["sparse", "dense", "explicit_bit_vect"], optional (default="sparse")
            Type of output. When "sparse" the fingerprints will be returned as a
            scipy.sparse.csr_matrix holding a sparse representation of the bit vectors.
            With "dense" a numpy matrix will be returned.
            With "explicit_bit_vect" the fingerprints will be returned as a list of RDKit's
            rdkit.DataStructs.cDataStructs.ExplicitBitVect.
        name: str, optional (default="MolToMACCS")
            Name of PipelineElement
        n_jobs: int, optional (default=1)
            Number of cores to use.
        uuid: str | None, optional (default=None)
            UUID of the PipelineElement.

        """
        super().__init__(return_as=return_as, name=name, n_jobs=n_jobs, uuid=uuid)

    def pretransform_single(
        self, value: RDKitMol
    ) -> dict[int, int] | npt.NDArray[np.int_] | ExplicitBitVect:
        """Transform a single molecule to MACCS key fingerprint.

        Parameters
        ----------
        value : RDKitMol
            RDKit molecule.

        Returns
        -------
        dict[int, int] | npt.NDArray[np.int_] | ExplicitBitVect
            MACCS key fingerprint in the representation selected by ``return_as``:
            the RDKit bit vector itself, a numpy array built from it, or a
            dictionary mapping each on-bit index to 1.

        Raises
        ------
        ValueError
            If ``return_as`` is none of the supported values.

        """
        fingerprint = MACCSkeys.GenMACCSKeys(value)
        if self._return_as == "explicit_bit_vect":
            return fingerprint
        if self._return_as == "dense":
            return np.array(fingerprint)
        if self._return_as == "sparse":
            # Only set bits are stored; MACCS keys are binary, so the count is 1.
            return {idx: 1 for idx in fingerprint.GetOnBits()}
        raise ValueError(f"Unknown return_as value: {self._return_as}")
37 changes: 10 additions & 27 deletions molpipeline/mol2any/mol2morgan_fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,7 @@

import copy

import numpy as np
import numpy.typing as npt
from rdkit.Chem import AllChem, rdFingerprintGenerator
from rdkit.DataStructs import ExplicitBitVect

from molpipeline.abstract_pipeline_elements.mol2any.mol2bitvector import (
ABCMorganFingerprintPipelineElement,
Expand All @@ -35,6 +32,7 @@ def __init__(
radius: int = 2,
use_features: bool = False,
n_bits: int = 2048,
counted: bool = False,
return_as: Literal["sparse", "dense", "explicit_bit_vect"] = "sparse",
name: str = "MolToMorganFP",
n_jobs: int = 1,
Expand All @@ -50,6 +48,9 @@ def __init__(
Instead of atoms, features are encoded in the fingerprint. [2]
n_bits: int, optional (default=2048)
Size of fingerprint.
counted: bool, optional (default=False)
If True, the fingerprint will be counted.
If False, the fingerprint will be binary.
return_as: Literal["sparse", "dense", "explicit_bit_vect"]
Type of output. When "sparse" the fingerprints will be returned as a scipy.sparse.csr_matrix
holding a sparse representation of the bit vectors. With "dense" a numpy matrix will be returned.
Expand All @@ -71,6 +72,7 @@ def __init__(
super().__init__(
radius=radius,
use_features=use_features,
counted=counted,
return_as=return_as,
name=name,
n_jobs=n_jobs,
Expand Down Expand Up @@ -124,38 +126,19 @@ def set_params(self, **parameters: dict[str, Any]) -> Self:

return self

def pretransform_single(
self, value: RDKitMol
) -> ExplicitBitVect | npt.NDArray[np.int_] | dict[int, int]:
"""Transform a single compound to a dictionary.

Keys denote the feature position, values the count. Here always 1.

Parameters
----------
value: RDKitMol
Molecule for which the fingerprint is generated.
def _get_fp_generator(self) -> rdFingerprintGenerator.FingerprintGenerator:
"""Get the fingerprint generator.

Returns
-------
dict[int, int]
Dictionary with feature-position as key and count as value.
rdFingerprintGenerator.FingerprintGenerator
RDKit fingerprint generator.
"""
fingerprint_generator = rdFingerprintGenerator.GetMorganGenerator(
return rdFingerprintGenerator.GetMorganGenerator(
radius=self.radius,
fpSize=self._n_bits,
)

if self._return_as == "explicit_bit_vect":
return fingerprint_generator.GetFingerprint(value)
if self._return_as == "dense":
return fingerprint_generator.GetFingerprintAsNumPy(value)
# sparse return type
return {
bit_idx: 1
for bit_idx in fingerprint_generator.GetFingerprint(value).GetOnBits()
}

def _explain_rdmol(self, mol_obj: RDKitMol) -> dict[int, list[tuple[int, int]]]:
"""Get central atom and radius of all features in molecule.

Expand Down
Loading
Loading