Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fuzzy_scaffold can now return a pandas dataframe that is more readable. #188

Merged
merged 9 commits into from
May 5, 2023
39 changes: 31 additions & 8 deletions datamol/scaffold/_fuzzy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

import collections
import itertools
import pandas as pd

from rdkit import Chem
from rdkit.Chem import rdFMCS
Expand Down Expand Up @@ -80,7 +81,7 @@ def fuzzy_scaffolding(
additional_templates: Optional[List[Mol]] = None,
ignore_non_ring: bool = False,
mcs_params: Optional[Dict[Any, Any]] = None,
) -> Tuple[set, Dict[str, dict], Dict[str, list]]:
) -> Tuple[set, pd.DataFrame, pd.DataFrame]:
"""Generate fuzzy scaffold with enforceable group that needs to appear
in the core, forcing to keep the full side chain if required

Expand All @@ -94,11 +95,13 @@ def fuzzy_scaffolding(

Returns:
- `set` - `scaffolds` - All found scaffolds in the molecules as valid smiles.
- `Dict[Dict]` - `scaffold_infos` - Infos on the scaffold mapping, ignoring any side chain that had
to be enforced. Key corresponds to generic scaffold smiles
Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS)
Values at ['mols'] corresponds to list of molecules matching the scaffold
- `Dict[List]` - `scaffold_to_group` - Map between each generic scaffold and the R-groups decomposition row.
- `pd.DataFrame` - `df_scaffold_infos_transposed` - A pandas dataframe with Infos on the scaffold mapping, ignoring
any side chain that had to be enforced. Key corresponds to generic scaffold smiles.
Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS)
Values at ['mols'] corresponds to list of molecules matching the scaffold
Values at ['scf'] corresponds to the list of scaffolds from MurckoScaffold.GetScaffoldForMol
- `pd.DataFrame` - `df_scaffold_groups` - A pandas dataframe with Map between each generic scaffold
and the R-groups decomposition row.
"""

# NOTE(hadim): consider parallelize this (if possible).
Expand Down Expand Up @@ -228,5 +231,25 @@ def fuzzy_scaffolding(
except:
continue
all_scaffolds.add(to_smiles(scaff))

return all_scaffolds, scf2infos, scf2groups
# if user wants a dataframe turned on...
# there are processing routines to make the df more readable.
df_infos = pd.DataFrame(scf2infos)
df_infos_t = df_infos.transpose()
df_infos_t.insert(0, "scf", list(scf2infos.keys()), True)
df_infos_t.reset_index(inplace=True, drop=True)

# relabel index and column labels
# to be more readable
df_infos_t.index.name = "index"

df_groups = pd.DataFrame.from_dict(scf2groups, orient="index")
df_groups.reset_index(inplace=True, drop=True)

# relabel index and column labels
# to be more readable
df_groups.index.name = "index"
df_groups.columns = [f"{str(h)}_core_group" for h in df_groups.columns]

# enter the scf columns at the first column for df_groups
df_groups.insert(0, "scf", list(scf2groups.keys()), True)
return all_scaffolds, df_infos_t, df_groups
23 changes: 23 additions & 0 deletions news/my-feature-or-branch.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
**Added:**

* ``fuzzy_scaffolding`` now returns pandas dataframes that are more readable.

**Changed:**

* <news item>

**Deprecated:**

* <news item>

**Removed:**

* <news item>

**Fixed:**

* <news item>

**Security:**

* <news item>
21 changes: 17 additions & 4 deletions tests/test_scaffold.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datamol as dm
import pandas as pd


def test_fuzzy_scaffolding():
Expand All @@ -8,16 +9,28 @@ def test_fuzzy_scaffolding():
"CC(NC(=O)CSCc1cccs1)C1CCCO1",
"CC1CCCCN1C(=O)CN1CCC[C@@H](N)C1",
"CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1", # no way this one (Remdesivir) is in the db
"COc1ccc(OC(C)C(=O)N=c2sccn2C)cc1",
]

mols = [dm.to_mol(s) for s in smiles]
all_scaffolds, scf2infos, scf2groups = dm.scaffold.fuzzy_scaffolding(mols)

assert scf2infos.keys() == scf2groups.keys()
assert len(all_scaffolds) == 5

# NOTE(hadim): different version of rdkit (2020.09 vs 2021.03) returns
# different SMILES here.
# assert "O=C(CN1CCC[C@@H]([*:1])C1)N1CCCCC1[*:2]" in all_scaffolds
# assert "O=C(CSCc1cccs1)NC(C1CCCO1)[*:1]" in all_scaffolds
# assert "O=C(N=c1sccn1[*:1])C(Oc1ccc([*:3])cc1)[*:2]" in all_scaffolds

all_scaffolds, df_scf2infos, df_scf2groups = dm.scaffold.fuzzy_scaffolding(mols)

assert len(all_scaffolds) == 5
assert len(df_scf2infos.columns) == 3

# because we are returning the output for each scf
# these should be the same
assert len(df_scf2infos.index) == len(df_scf2groups.index)
assert list(df_scf2infos["scf"]) == list(df_scf2groups["scf"])

# mere coincidence that the columns of scf2infos and scf2groups have the
# same length. the reason there are 3 and not 2 is that there can be
# extra columns where a cell may hold None values.
assert len(df_scf2groups.columns) == 3