Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

68 makescaffoldgeneric set all carbons to wild card symbols to allow a substructure search #69

Merged
109 changes: 108 additions & 1 deletion molpipeline/mol2mol/scaffolds.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@

from __future__ import annotations

from typing import Any, Optional

try:
from typing import Self # pylint: disable=no-name-in-module
except ImportError:
from typing_extensions import Self

from rdkit import Chem
from rdkit.Chem.Scaffolds import MurckoScaffold as RDKIT_MurckoScaffold

from molpipeline.abstract_pipeline_elements.core import (
Expand Down Expand Up @@ -38,6 +46,42 @@ class MakeScaffoldGeneric(_MolToMolPipelineElement):
Done to make scaffolds less specific.
"""

def __init__(
    self,
    generic_atoms: bool = False,
    generic_bonds: bool = False,
    name: str = "MakeScaffoldGeneric",
    n_jobs: int = 1,
    uuid: Optional[str] = None,
) -> None:
    """Initialize MakeScaffoldGeneric.

    Note
    ----
    Making atoms and/or bonds generic allows checking for their presence via
    substructure searches. Otherwise, all atoms are set to carbon and all
    bonds to single bonds, which may not match the original molecule.

    Parameters
    ----------
    generic_atoms: bool
        If True, all atoms in the molecule are set to generic atoms (*).
    generic_bonds: bool
        If True, all bonds in the molecule are set to any bonds.
    name: str
        Name of pipeline element.
    n_jobs: int
        Number of jobs to use for parallelization.
    uuid: Optional[str]
        UUID of pipeline element.

    Returns
    -------
    None
    """
    # Store the element-specific switches first, then hand the common
    # arguments over to the parent initializer.
    self.generic_atoms, self.generic_bonds = generic_atoms, generic_bonds
    super().__init__(name=name, n_jobs=n_jobs, uuid=uuid)

def pretransform_single(self, value: RDKitMol) -> OptionalMol:
    """Make a scaffold generic and return the resulting mol object.

    The molecule is first passed through RDKit's ``MakeScaffoldGeneric``.
    Afterwards, depending on the configuration of this element, atoms
    and/or bonds are replaced by wildcards.

    Parameters
    ----------
    value: RDKitMol
        Molecule to make generic.

    Returns
    -------
    OptionalMol
        The generic scaffold. If ``generic_atoms`` is set, every atom has
        atomic number 0 (written as ``*``); if ``generic_bonds`` is set,
        every bond has type UNSPECIFIED (written as ``~``).
    """
    # NOTE: there must be no early return before the flag handling below,
    # otherwise generic_atoms / generic_bonds would have no effect.
    scaffold = RDKIT_MurckoScaffold.MakeScaffoldGeneric(value)
    if self.generic_atoms:
        # Atomic number 0 is the wildcard ("any atom") element.
        for atom in scaffold.GetAtoms():
            atom.SetAtomicNum(0)
    if self.generic_bonds:
        # UNSPECIFIED acts as an "any bond" placeholder.
        for bond in scaffold.GetBonds():
            bond.SetBondType(Chem.rdchem.BondType.UNSPECIFIED)
    return scaffold

def get_params(self, deep: bool = True) -> dict[str, Any]:
    """Get parameters of pipeline element.

    Parameters
    ----------
    deep: bool
        Forwarded to the parent implementation. If True, the values of
        ``generic_atoms`` and ``generic_bonds`` are returned as fresh
        ``bool`` objects; otherwise the stored attributes are returned
        as they are.

    Returns
    -------
    dict[str, Any]
        Parameters of the parent element extended by ``generic_atoms``
        and ``generic_bonds``.
    """
    # Forward ``deep`` so the parent honours the caller's choice instead of
    # always falling back to its own default.
    params = super().get_params(deep)
    if deep:
        params["generic_atoms"] = bool(self.generic_atoms)
        params["generic_bonds"] = bool(self.generic_bonds)
    else:
        params["generic_atoms"] = self.generic_atoms
        params["generic_bonds"] = self.generic_bonds
    return params

def set_params(self, **parameters: Any) -> Self:
    """Set parameters of pipeline element.

    Parameters
    ----------
    parameters: Any
        Parameters to set. ``generic_atoms`` and ``generic_bonds`` are
        handled here; every other parameter is passed on to the parent
        implementation.

    Raises
    ------
    ValueError
        If ``generic_atoms`` or ``generic_bonds`` is given and is not a
        boolean.

    Returns
    -------
    Self
        Pipeline element with set parameters.
    """
    remaining = dict(parameters)
    generic_atoms = remaining.pop("generic_atoms", None)
    generic_bonds = remaining.pop("generic_bonds", None)

    # Validate everything before mutating anything, so that a rejected call
    # leaves the element unchanged.
    if generic_atoms is not None and not isinstance(generic_atoms, bool):
        raise ValueError("generic_atoms must be a boolean.")
    if generic_bonds is not None and not isinstance(generic_bonds, bool):
        raise ValueError("generic_bonds must be a boolean.")

    if generic_atoms is not None:
        self.generic_atoms = generic_atoms
    if generic_bonds is not None:
        self.generic_bonds = generic_bonds

    # Hand the remaining parameters to the parent; without this call they
    # would be silently discarded.
    super().set_params(**remaining)
    return self
100 changes: 100 additions & 0 deletions tests/test_elements/test_mol2mol/test_mol2scaffold.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Test the mol2scaffold module."""

from typing import Any
from unittest import TestCase

from molpipeline import Pipeline
from molpipeline.any2mol import AutoToMol
from molpipeline.mol2any import MolToSmiles
from molpipeline.mol2mol.scaffolds import MakeScaffoldGeneric, MurckoScaffold


class TestMurckoScaffold(TestCase):
    """Tests for the MurckoScaffold pipeline element."""

    def test_murcko_scaffold_generation_pipeline(self) -> None:
        """Check that SMILES are reduced to the expected Murcko scaffolds."""
        input_smiles = ["Cc1ccc(=O)[nH]c1", "O=CC1CCC(c2ccccc2)CC1", "CCC"]
        # The acyclic molecule has no ring system, hence the empty scaffold.
        expected = ["O=c1cccc[nH]1", "c1ccc(C2CCCCC2)cc1", ""]

        steps = [
            ("smiles_to_mol", AutoToMol()),
            ("murcko_scaffold", MurckoScaffold()),
            ("scaffold_to_smiles", MolToSmiles()),
        ]
        pipeline = Pipeline(steps=steps)

        observed = pipeline.transform(input_smiles)
        self.assertListEqual(expected, observed)


class TestMakeScaffoldGeneric(TestCase):
    """Tests for the MakeScaffoldGeneric pipeline element."""

    def setUp(self) -> None:
        """Create the shared pipeline and the input SMILES."""
        steps = [
            ("smiles_to_mol", AutoToMol()),
            ("murcko_scaffold", MurckoScaffold()),
            ("make_scaffold_generic", MakeScaffoldGeneric()),
            ("scaffold_to_smiles", MolToSmiles()),
        ]
        self.generic_scaffold_pipeline = Pipeline(steps=steps)
        self.smiles_list = ["Cc1ccc(=O)[nH]c1", "O=CC1CCC(c2ccccc2)CC1", "CCC"]

    def check_generic_scaffold(
        self, params: dict[str, Any], expected_scaffold_list: list[str]
    ) -> None:
        """Apply parameters to the pipeline and compare its output.

        Parameters
        ----------
        params: dict[str, Any]
            Parameters to set for the pipeline.
        expected_scaffold_list: list[str]
            Expected output of the pipeline.
        """
        pipeline = self.generic_scaffold_pipeline
        pipeline.set_params(**params)
        observed = pipeline.transform(self.smiles_list)
        self.assertListEqual(expected_scaffold_list, observed)

    def test_generic_scaffold_generation_pipeline(self) -> None:
        """Test the generic scaffold generation for all flag combinations."""
        # The cases run in order on the same pipeline, so each entry sets
        # exactly the parameters that differ from the previous state.
        cases: list[tuple[dict[str, Any], list[str]]] = [
            # Default: carbon atoms and single bonds.
            ({}, ["CC1CCCCC1", "C1CCC(C2CCCCC2)CC1", ""]),
            # Generic atoms only.
            (
                {"make_scaffold_generic__generic_atoms": True},
                ["**1*****1", "*1***(*2*****2)**1", ""],
            ),
            # Generic bonds only.
            (
                {
                    "make_scaffold_generic__generic_atoms": False,
                    "make_scaffold_generic__generic_bonds": True,
                },
                [
                    "C~C1~C~C~C~C~C~1",
                    "C1~C~C~C(~C2~C~C~C~C~C~2)~C~C~1",
                    "",
                ],
            ),
            # Generic atoms and generic bonds.
            (
                {
                    "make_scaffold_generic__generic_atoms": True,
                    "make_scaffold_generic__generic_bonds": True,
                },
                [
                    "*~*1~*~*~*~*~*~1",
                    "*1~*~*~*(~*2~*~*~*~*~*~2)~*~*~1",
                    "",
                ],
            ),
        ]
        for case_params, case_expected in cases:
            self.check_generic_scaffold(
                params=case_params, expected_scaffold_list=case_expected
            )
Loading