datamol-io · hadim · May 5, 2023 · May 4, 2023 · May 4, 2023 · May 4, 2023
diff --git a/datamol/scaffold/_fuzzy.py b/datamol/scaffold/_fuzzy.py
@@ -6,6 +6,7 @@
 
 import collections
 import itertools
+import pandas as pd
 
 from rdkit import Chem
 from rdkit.Chem import rdFMCS
@@ -80,7 +81,7 @@ def fuzzy_scaffolding(
     additional_templates: Optional[List[Mol]] = None,
     ignore_non_ring: bool = False,
     mcs_params: Optional[Dict[Any, Any]] = None,
-) -> Tuple[set, Dict[str, dict], Dict[str, list]]:
+) -> Tuple[set, pd.DataFrame, pd.DataFrame]:
     """Generate fuzzy scaffold with enforceable group that needs to appear
     in the core, forcing to keep the full side chain if required
 
@@ -94,11 +95,13 @@ def fuzzy_scaffolding(
 
     Returns:
         - `set` - `scaffolds` - All found scaffolds in the molecules as valid smiles.
-        - `Dict[Dict]` - `scaffold_infos` - Infos on the scaffold mapping, ignoring any side chain that had
-                to be enforced. Key corresponds to generic scaffold smiles
-                Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS)
-                Values at ['mols'] corresponds to list of molecules matching the scaffold
-        - `Dict[List]` - `scaffold_to_group` - Map between each generic scaffold and the R-groups decomposition row.
+        - `pd.DataFrame` - `df_scaffold_infos_transposed` - A pandas dataframe with information on the scaffold mapping,
+            ignoring any side chain that had to be enforced. Each row corresponds to one generic scaffold smiles.
+            Values at ['smarts'] correspond to the smarts representation of the true scaffold (from MCS).
+            Values at ['mols'] correspond to the list of molecules matching the scaffold.
+            Values at ['scf'] correspond to the scaffolds from MurckoScaffold.GetScaffoldForMol.
+        - `pd.DataFrame` - `df_scaffold_groups` - A pandas dataframe mapping each generic scaffold
+            to its R-groups decomposition row.
     """
 
     # NOTE(hadim): consider parallelize this (if possible).
@@ -228,5 +231,25 @@ def fuzzy_scaffolding(
             except:
                 continue
             all_scaffolds.add(to_smiles(scaff))
-
-    return all_scaffolds, scf2infos, scf2groups
+    # Convert the scaffold mappings to pandas dataframes.
+    # The processing routines below make the dataframes more readable.
+    df_infos = pd.DataFrame(scf2infos)
+    df_infos_t = df_infos.transpose()
+    df_infos_t.insert(0, "scf", list(scf2infos.keys()), True)
+    df_infos_t.reset_index(inplace=True, drop=True)
+
+    # relabel the index name
+    # to be more readable
+    df_infos_t.index.name = "index"
+
+    df_groups = pd.DataFrame.from_dict(scf2groups, orient="index")
+    df_groups.reset_index(inplace=True, drop=True)
+
+    # relabel the index name and the column labels
+    # to be more readable
+    df_groups.index.name = "index"
+    df_groups.columns = [f"{str(h)}_core_group" for h in df_groups.columns]
+
+    # enter the scf columns at the first column for df_groups
+    df_groups.insert(0, "scf", list(scf2groups.keys()), True)
+    return all_scaffolds, df_infos_t, df_groups
diff --git a/news/my-feature-or-branch.rst b/news/my-feature-or-branch.rst
@@ -0,0 +1,23 @@
+**Added:**
+
+* fuzzy_scaffolding now returns pandas dataframes that are more readable.
+
+**Changed:**
+
+* <news item>
+
+**Deprecated:**
+
+* <news item>
+
+**Removed:**
+
+* <news item>
+
+**Fixed:**
+
+* <news item>
+
+**Security:**
+
+* <news item>
diff --git a/tests/test_scaffold.py b/tests/test_scaffold.py
@@ -1,4 +1,5 @@
 import datamol as dm
+import pandas as pd
 
 
 def test_fuzzy_scaffolding():
@@ -8,16 +9,28 @@ def test_fuzzy_scaffolding():
         "CC(NC(=O)CSCc1cccs1)C1CCCO1",
         "CC1CCCCN1C(=O)CN1CCC[C@@H](N)C1",
         "CCC(CC)COC(=O)[C@H](C)N[P@](=O)(OC[C@H]1O[C@](C#N)([C@H](O)[C@@H]1O)C1=CC=C2N1N=CN=C2N)OC1=CC=CC=C1",  # no way this one (Remdesivir) is in the db
+        "COc1ccc(OC(C)C(=O)N=c2sccn2C)cc1",
     ]
 
     mols = [dm.to_mol(s) for s in smiles]
-    all_scaffolds, scf2infos, scf2groups = dm.scaffold.fuzzy_scaffolding(mols)
-
-    assert scf2infos.keys() == scf2groups.keys()
-    assert len(all_scaffolds) == 5
 
     # NOTE(hadim): different version of rdkit (2020.09 vs 2021.03) returns
     # different SMILES here.
     # assert "O=C(CN1CCC[C@@H]([*:1])C1)N1CCCCC1[*:2]" in all_scaffolds
     # assert "O=C(CSCc1cccs1)NC(C1CCCO1)[*:1]" in all_scaffolds
     # assert "O=C(N=c1sccn1[*:1])C(Oc1ccc([*:3])cc1)[*:2]" in all_scaffolds
+
+    all_scaffolds, df_scf2infos, df_scf2groups = dm.scaffold.fuzzy_scaffolding(mols)
+
+    assert len(all_scaffolds) == 5
+    assert len(df_scf2infos.columns) == 3
+
+    # because we are returning the output for each scf
+    # these should be the same
+    assert len(df_scf2infos.index) == len(df_scf2groups.index)
+    assert list(df_scf2infos["scf"]) == list(df_scf2groups["scf"])
+
+    # It is a mere coincidence that df_scf2infos and df_scf2groups have the
+    # same number of columns. There are 3 columns rather than 2 because the
+    # frame could have extra columns in which some cells hold None values.
+    assert len(df_scf2groups.columns) == 3