Support custom motifs in annotate_dataset, including regular-expression motifs (entries prefixed with 'r')

BojarLab · Dec 12, 2023 · 69743ba
1 parent 59610ad
commit 69743ba
Show file tree

Hide file tree

Showing 3 changed files with 40 additions and 17 deletions.
diff --git a/build/lib/glycowork/motif/annotate.py b/build/lib/glycowork/motif/annotate.py
@@ -6,6 +6,7 @@
 from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets
 from glycowork.motif.graph import subgraph_isomorphism, generate_graph_features, glycan_to_nxGraph, graph_to_string, ensure_graph
 from glycowork.motif.processing import IUPAC_to_SMILES, get_lib, find_isomorphs, expand_lib, rescue_glycans
+from glycowork.motif.re import get_match
 
 
 def link_find(glycan):
@@ -48,7 +49,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
   | Arguments:
   | :-
   | glycan (string or networkx): glycan in IUPAC-condensed format (or as networkx graph) that has to contain a floating substituent
-  | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
+  | motifs (dataframe): dataframe of glycan motifs (name + sequence), can be used with a list of glycans too; default:motif_list
   | libr (dict): dictionary of form glycoletter:index
   | termini_list (list): list of monosaccharide positions (from 'terminal', 'internal', and 'flexible')
   | gmotifs (networkx): precalculated motif graphs for speed-up; default:None\n
@@ -59,7 +60,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
   if motifs is None:
     motifs = motif_list
   # Check whether termini are specified
-  if not termini_list:
+  if not termini_list and isinstance(motifs, pd.DataFrame):
     termini_list = [eval(k) for k in motifs.termini_spec]
   if libr is None:
     libr = lib
@@ -78,7 +79,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
                                 termini_list = termini_list,
                                 count = True) for k in range(len(motifs))]*1
 
-  out = pd.DataFrame(columns = motifs.motif_name)
+  out = pd.DataFrame(columns = motifs.motif_name if isinstance(motifs, pd.DataFrame) else motifs)
   out.loc[0] = res
   out.loc[0] = out.loc[0].astype('int')
   if isinstance(glycan, str):
@@ -137,16 +138,19 @@ def get_molecular_properties(glycan_list, verbose = False, placeholder = False):
 
 
 @rescue_glycans
-def annotate_dataset(glycans, motifs = None,
-                     feature_set = ['known'], termini_list = [], condense = False):
+def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
+                     termini_list = [], condense = False, custom_motifs = []):
   """wrapper function to annotate motifs in list of glycans\n
   | Arguments:
   | :-
   | glycans (list): list of IUPAC-condensed glycan sequences as strings
   | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
-  | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
+  | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
+  |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
+  |   'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
   | termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
-  | condense (bool): if True, throws away columns with only zeroes; default:False\n
+  | condense (bool): if True, throws away columns with only zeroes; default:False
+  | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
   | Returns:
   | :-                      
   | Returns dataframe of glycans (rows) and presence/absence of known motifs (columns)
@@ -167,6 +171,13 @@ def annotate_dataset(glycans, motifs = None,
     # Counts literature-annotated motifs in each glycan
     shopping_cart.append(pd.concat([annotate_glycan(k, motifs = motifs, libr = libr,
                                                     gmotifs = gmotifs, termini_list = termini_list) for k in glycans], axis = 0))
+  if 'custom' in feature_set:
+    normal_motifs = [m for m in custom_motifs if not m.startswith('r')]
+    gmotifs = [glycan_to_nxGraph(g, libr = libr) for g in normal_motifs]
+    shopping_cart.append(pd.concat([annotate_glycan(k, motifs = normal_motifs, libr = libr,
+                                                    gmotifs = gmotifs) for k in glycans], axis = 0))
+    regex_motifs = [m[1:] for m in custom_motifs if m.startswith('r')]
+    shopping_cart.append(pd.concat([pd.DataFrame([len(get_match(p, k)) for p in regex_motifs], columns = regex_motifs, index = [k]) for k in glycans], axis = 0))
   if 'graph' in feature_set:
     # Calculates graph features of each glycan
     shopping_cart.append(pd.concat([generate_graph_features(k, libr = libr) for k in glycans], axis = 0))

diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py
@@ -6,6 +6,7 @@
 from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets
 from glycowork.motif.graph import subgraph_isomorphism, generate_graph_features, glycan_to_nxGraph, graph_to_string, ensure_graph
 from glycowork.motif.processing import IUPAC_to_SMILES, get_lib, find_isomorphs, expand_lib, rescue_glycans
+from glycowork.motif.re import get_match
 
 
 def link_find(glycan):
@@ -48,7 +49,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
   | Arguments:
   | :-
   | glycan (string or networkx): glycan in IUPAC-condensed format (or as networkx graph) that has to contain a floating substituent
-  | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
+  | motifs (dataframe): dataframe of glycan motifs (name + sequence), can be used with a list of glycans too; default:motif_list
   | libr (dict): dictionary of form glycoletter:index
   | termini_list (list): list of monosaccharide positions (from 'terminal', 'internal', and 'flexible')
   | gmotifs (networkx): precalculated motif graphs for speed-up; default:None\n
@@ -59,7 +60,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
   if motifs is None:
     motifs = motif_list
   # Check whether termini are specified
-  if not termini_list:
+  if not termini_list and isinstance(motifs, pd.DataFrame):
     termini_list = [eval(k) for k in motifs.termini_spec]
   if libr is None:
     libr = lib
@@ -78,7 +79,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
                                 termini_list = termini_list,
                                 count = True) for k in range(len(motifs))]*1
 
-  out = pd.DataFrame(columns = motifs.motif_name)
+  out = pd.DataFrame(columns = motifs.motif_name if isinstance(motifs, pd.DataFrame) else motifs)
   out.loc[0] = res
   out.loc[0] = out.loc[0].astype('int')
   if isinstance(glycan, str):
@@ -137,16 +138,19 @@ def get_molecular_properties(glycan_list, verbose = False, placeholder = False):
 
 
 @rescue_glycans
-def annotate_dataset(glycans, motifs = None,
-                     feature_set = ['known'], termini_list = [], condense = False):
+def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
+                     termini_list = [], condense = False, custom_motifs = []):
   """wrapper function to annotate motifs in list of glycans\n
   | Arguments:
   | :-
   | glycans (list): list of IUPAC-condensed glycan sequences as strings
   | motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
-  | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
+  | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
+  |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
+  |   'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
   | termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
-  | condense (bool): if True, throws away columns with only zeroes; default:False\n
+  | condense (bool): if True, throws away columns with only zeroes; default:False
+  | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty\n
   | Returns:
   | :-                      
   | Returns dataframe of glycans (rows) and presence/absence of known motifs (columns)
@@ -167,6 +171,13 @@ def annotate_dataset(glycans, motifs = None,
     # Counts literature-annotated motifs in each glycan
     shopping_cart.append(pd.concat([annotate_glycan(k, motifs = motifs, libr = libr,
                                                     gmotifs = gmotifs, termini_list = termini_list) for k in glycans], axis = 0))
+  if 'custom' in feature_set:
+    normal_motifs = [m for m in custom_motifs if not m.startswith('r')]
+    gmotifs = [glycan_to_nxGraph(g, libr = libr) for g in normal_motifs]
+    shopping_cart.append(pd.concat([annotate_glycan(k, motifs = normal_motifs, libr = libr,
+                                                    gmotifs = gmotifs) for k in glycans], axis = 0))
+    regex_motifs = [m[1:] for m in custom_motifs if m.startswith('r')]
+    shopping_cart.append(pd.concat([pd.DataFrame([len(get_match(p, k)) for p in regex_motifs], columns = regex_motifs, index = [k]) for k in glycans], axis = 0))
   if 'graph' in feature_set:
     # Calculates graph features of each glycan
     shopping_cart.append(pd.concat([generate_graph_features(k, libr = libr) for k in glycans], axis = 0))

diff --git a/glycowork/motif/re.py b/glycowork/motif/re.py
@@ -38,7 +38,7 @@ def replacer(match):
       number = match.group(2)
       return f'({letter}1-{number})'
     pattern_component = re.sub(pattern, replacer, pattern_component)
-  return pattern_component
+  return pattern_component.replace('5Ac(a1', '5Ac(a2').replace('5Gc(a1', '5Gc(a2').replace('Kdn(a1', 'Kdn(a2').replace('Sia(a1', 'Sia(a2')
 
 
 def replace_patterns(s):
@@ -443,7 +443,8 @@ def trace_path(pattern_matches, ggraph):
     for component, component_matches in pattern_matches[idx:]:
       extended = False
       min_occur, max_occur = optional_components.get(component, (1, 1))
-      to_extend = try_matching(trace, component_matches, edges, min_occur, max_occur, branch = '[' in component)
+      branch = '(' in component and '(?' not in component
+      to_extend = try_matching(trace, component_matches, edges, min_occur, max_occur, branch = branch)
       if to_extend:
         extend = to_extend[-1] if not isinstance(to_extend, bool) else []
         extend = list(extend) if isinstance(extend, tuple) else extend
@@ -481,7 +482,7 @@ def fill_missing_in_list(lists):
       # Check whether the gap between current and previous element is exactly 2
       if gap == 2:
         filled_sublist.append(sublist[i] - 1)
-      elif gap > 2:
+      elif gap > 2 and gap % 2 == 0:
         filled_sublist.append(sublist[i-1] + 1)
       filled_sublist.append(sublist[i])
     filled_lists.append(filled_sublist)