Skip to content

Commit

Permalink
support custom motifs in annotate_dataset, including regex
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Dec 12, 2023
1 parent 59610ad commit 69743ba
Show file tree
Hide file tree
Showing 3 changed files with 40 additions and 17 deletions.
25 changes: 18 additions & 7 deletions build/lib/glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets
from glycowork.motif.graph import subgraph_isomorphism, generate_graph_features, glycan_to_nxGraph, graph_to_string, ensure_graph
from glycowork.motif.processing import IUPAC_to_SMILES, get_lib, find_isomorphs, expand_lib, rescue_glycans
from glycowork.motif.re import get_match


def link_find(glycan):
Expand Down Expand Up @@ -48,7 +49,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
| Arguments:
| :-
| glycan (string or networkx): glycan in IUPAC-condensed format (or as networkx graph) that has to contain a floating substituent
| motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
| motifs (dataframe or list): dataframe of glycan motifs (name + sequence) or a plain list of motif sequences (which are then used as column names); default:motif_list
| libr (dict): dictionary of form glycoletter:index
| termini_list (list): list of monosaccharide positions (from 'terminal', 'internal', and 'flexible')
| gmotifs (networkx): precalculated motif graphs for speed-up; default:None\n
Expand All @@ -59,7 +60,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
if motifs is None:
motifs = motif_list
# Check whether termini are specified
if not termini_list:
if not termini_list and isinstance(motifs, pd.DataFrame):
termini_list = [eval(k) for k in motifs.termini_spec]
if libr is None:
libr = lib
Expand All @@ -78,7 +79,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
termini_list = termini_list,
count = True) for k in range(len(motifs))]*1

out = pd.DataFrame(columns = motifs.motif_name)
out = pd.DataFrame(columns = motifs.motif_name if isinstance(motifs, pd.DataFrame) else motifs)
out.loc[0] = res
out.loc[0] = out.loc[0].astype('int')
if isinstance(glycan, str):
Expand Down Expand Up @@ -137,16 +138,19 @@ def get_molecular_properties(glycan_list, verbose = False, placeholder = False):


@rescue_glycans
def annotate_dataset(glycans, motifs = None,
feature_set = ['known'], termini_list = [], condense = False):
def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
termini_list = [], condense = False, custom_motifs = []):
"""wrapper function to annotate motifs in list of glycans\n
| Arguments:
| :-
| glycans (list): list of IUPAC-condensed glycan sequences as strings
| motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
| 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
| termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
| condense (bool): if True, throws away columns with only zeroes; default:False\n
| condense (bool): if True, throws away columns with only zeroes; default:False
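| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; entries starting with 'r' are matched as regular expressions via get_match (the leading 'r' is stripped first), all other entries are matched as regular glycan motifs; default:empty\n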
| Returns:
| :-
| Returns dataframe of glycans (rows) and presence/absence of known motifs (columns)
Expand All @@ -167,6 +171,13 @@ def annotate_dataset(glycans, motifs = None,
# Counts literature-annotated motifs in each glycan
shopping_cart.append(pd.concat([annotate_glycan(k, motifs = motifs, libr = libr,
gmotifs = gmotifs, termini_list = termini_list) for k in glycans], axis = 0))
if 'custom' in feature_set:
normal_motifs = [m for m in custom_motifs if not m.startswith('r')]
gmotifs = [glycan_to_nxGraph(g, libr = libr) for g in normal_motifs]
shopping_cart.append(pd.concat([annotate_glycan(k, motifs = normal_motifs, libr = libr,
gmotifs = gmotifs) for k in glycans], axis = 0))
regex_motifs = [m[1:] for m in custom_motifs if m.startswith('r')]
shopping_cart.append(pd.concat([pd.DataFrame([len(get_match(p, k)) for p in regex_motifs], columns = regex_motifs, index = [k]) for k in glycans], axis = 0))
if 'graph' in feature_set:
# Calculates graph features of each glycan
shopping_cart.append(pd.concat([generate_graph_features(k, libr = libr) for k in glycans], axis = 0))
Expand Down
25 changes: 18 additions & 7 deletions glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets
from glycowork.motif.graph import subgraph_isomorphism, generate_graph_features, glycan_to_nxGraph, graph_to_string, ensure_graph
from glycowork.motif.processing import IUPAC_to_SMILES, get_lib, find_isomorphs, expand_lib, rescue_glycans
from glycowork.motif.re import get_match


def link_find(glycan):
Expand Down Expand Up @@ -48,7 +49,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
| Arguments:
| :-
| glycan (string or networkx): glycan in IUPAC-condensed format (or as networkx graph) that has to contain a floating substituent
| motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
| motifs (dataframe or list): dataframe of glycan motifs (name + sequence) or a plain list of motif sequences (which are then used as column names); default:motif_list
| libr (dict): dictionary of form glycoletter:index
| termini_list (list): list of monosaccharide positions (from 'terminal', 'internal', and 'flexible')
| gmotifs (networkx): precalculated motif graphs for speed-up; default:None\n
Expand All @@ -59,7 +60,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
if motifs is None:
motifs = motif_list
# Check whether termini are specified
if not termini_list:
if not termini_list and isinstance(motifs, pd.DataFrame):
termini_list = [eval(k) for k in motifs.termini_spec]
if libr is None:
libr = lib
Expand All @@ -78,7 +79,7 @@ def annotate_glycan(glycan, motifs = None, libr = None,
termini_list = termini_list,
count = True) for k in range(len(motifs))]*1

out = pd.DataFrame(columns = motifs.motif_name)
out = pd.DataFrame(columns = motifs.motif_name if isinstance(motifs, pd.DataFrame) else motifs)
out.loc[0] = res
out.loc[0] = out.loc[0].astype('int')
if isinstance(glycan, str):
Expand Down Expand Up @@ -137,16 +138,19 @@ def get_molecular_properties(glycan_list, verbose = False, placeholder = False):


@rescue_glycans
def annotate_dataset(glycans, motifs = None,
feature_set = ['known'], termini_list = [], condense = False):
def annotate_dataset(glycans, motifs = None, feature_set = ['known'],
termini_list = [], condense = False, custom_motifs = []):
"""wrapper function to annotate motifs in list of glycans\n
| Arguments:
| :-
| glycans (list): list of IUPAC-condensed glycan sequences as strings
| motifs (dataframe): dataframe of glycan motifs (name + sequence); default:motif_list
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), and 'chemical' (molecular properties of glycan)
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
| 'custom' (specify your own motifs in custom_motifs), and 'chemical' (molecular properties of glycan)
| termini_list (list): list of monosaccharide/linkage positions (from 'terminal', 'internal', and 'flexible')
| condense (bool): if True, throws away columns with only zeroes; default:False\n
| condense (bool): if True, throws away columns with only zeroes; default:False
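| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; entries starting with 'r' are matched as regular expressions via get_match (the leading 'r' is stripped first), all other entries are matched as regular glycan motifs; default:empty\n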
| Returns:
| :-
| Returns dataframe of glycans (rows) and presence/absence of known motifs (columns)
Expand All @@ -167,6 +171,13 @@ def annotate_dataset(glycans, motifs = None,
# Counts literature-annotated motifs in each glycan
shopping_cart.append(pd.concat([annotate_glycan(k, motifs = motifs, libr = libr,
gmotifs = gmotifs, termini_list = termini_list) for k in glycans], axis = 0))
if 'custom' in feature_set:
normal_motifs = [m for m in custom_motifs if not m.startswith('r')]
gmotifs = [glycan_to_nxGraph(g, libr = libr) for g in normal_motifs]
shopping_cart.append(pd.concat([annotate_glycan(k, motifs = normal_motifs, libr = libr,
gmotifs = gmotifs) for k in glycans], axis = 0))
regex_motifs = [m[1:] for m in custom_motifs if m.startswith('r')]
shopping_cart.append(pd.concat([pd.DataFrame([len(get_match(p, k)) for p in regex_motifs], columns = regex_motifs, index = [k]) for k in glycans], axis = 0))
if 'graph' in feature_set:
# Calculates graph features of each glycan
shopping_cart.append(pd.concat([generate_graph_features(k, libr = libr) for k in glycans], axis = 0))
Expand Down
7 changes: 4 additions & 3 deletions glycowork/motif/re.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def replacer(match):
number = match.group(2)
return f'({letter}1-{number})'
pattern_component = re.sub(pattern, replacer, pattern_component)
return pattern_component
return pattern_component.replace('5Ac(a1', '5Ac(a2').replace('5Gc(a1', '5Gc(a2').replace('Kdn(a1', 'Kdn(a2').replace('Sia(a1', 'Sia(a2')


def replace_patterns(s):
Expand Down Expand Up @@ -443,7 +443,8 @@ def trace_path(pattern_matches, ggraph):
for component, component_matches in pattern_matches[idx:]:
extended = False
min_occur, max_occur = optional_components.get(component, (1, 1))
to_extend = try_matching(trace, component_matches, edges, min_occur, max_occur, branch = '[' in component)
branch = '(' in component and '(?' not in component
to_extend = try_matching(trace, component_matches, edges, min_occur, max_occur, branch = branch)
if to_extend:
extend = to_extend[-1] if not isinstance(to_extend, bool) else []
extend = list(extend) if isinstance(extend, tuple) else extend
Expand Down Expand Up @@ -481,7 +482,7 @@ def fill_missing_in_list(lists):
# Check whether the gap between current and previous element is exactly 2
if gap == 2:
filled_sublist.append(sublist[i] - 1)
elif gap > 2:
elif gap > 2 and gap % 2 == 0:
filled_sublist.append(sublist[i-1] + 1)
filled_sublist.append(sublist[i])
filled_lists.append(filled_sublist)
Expand Down

0 comments on commit 69743ba

Please sign in to comment.