Skip to content

Commit

Permalink
add helper function for stripping out unmatched brackets
Browse files Browse the repository at this point in the history
  • Loading branch information
Bribak committed Nov 23, 2023
1 parent 74f7a07 commit fa3db89
Show file tree
Hide file tree
Showing 4 changed files with 68 additions and 2 deletions.
31 changes: 31 additions & 0 deletions build/lib/glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,37 @@ def find_nth_reverse(string, substring, n, ignore_branches = False):
return original_start_index


def remove_unmatched_brackets(s):
    """Removes all unmatched brackets from the string s.\n
    | Arguments:
    | :-
    | s (string): glycan string in IUPAC-condensed\n
    | Returns:
    | :-
    | Returns glycan without unmatched brackets
    """
    # Indices of '[' characters that have not yet met their closing ']'
    pending_open = []
    # Indices of every bracket that has no partner and must be dropped
    unmatched = set()
    for i, char in enumerate(s):
        if char == '[':
            pending_open.append(i)
        elif char == ']':
            if pending_open:
                # Closes the most recent still-open bracket
                pending_open.pop()
            else:
                # Nothing is open, so this ']' has no partner
                unmatched.add(i)
    # Whatever is still open at the end of the string was never closed
    unmatched.update(pending_open)
    # A single scan is sufficient: once the unmatched brackets are removed the
    # remaining ones are balanced by construction, so no re-scan is needed
    if not unmatched:
        return s
    return ''.join(char for i, char in enumerate(s) if i not in unmatched)


def reindex(df_new, df_old, out_col, ind_col, inp_col):
"""Returns columns values in order of new dataframe rows\n
| Arguments:
Expand Down
4 changes: 3 additions & 1 deletion build/lib/glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from collections import defaultdict

from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second
from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets
from glycowork.motif.graph import subgraph_isomorphism, generate_graph_features, glycan_to_nxGraph, graph_to_string, ensure_graph
from glycowork.motif.processing import IUPAC_to_SMILES, get_lib, find_isomorphs, expand_lib, rescue_glycans

Expand Down Expand Up @@ -304,6 +304,8 @@ def get_k_saccharides(glycans, size = 2, libr = None, up_to = False, just_motifs
if col_sums[col1] == col_sums[col2] and col_subs[col1] == col2:
drop_columns.append(col2)
out_matrix = out_matrix.drop(drop_columns, axis = 1)
if size > 3:
out_matrix.columns = [remove_unmatched_brackets(g) for g in out_matrix.columns]
if up_to:
combined_df= pd.concat([wga_letter, out_matrix], axis = 1).fillna(0).astype(int)
if just_motifs:
Expand Down
31 changes: 31 additions & 0 deletions glycowork/glycan_data/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,37 @@ def find_nth_reverse(string, substring, n, ignore_branches = False):
return original_start_index


def remove_unmatched_brackets(s):
    """Removes all unmatched brackets from the string s.\n
    | Arguments:
    | :-
    | s (string): glycan string in IUPAC-condensed\n
    | Returns:
    | :-
    | Returns glycan without unmatched brackets
    """
    # Indices of '[' characters that have not yet met their closing ']'
    pending_open = []
    # Indices of every bracket that has no partner and must be dropped
    unmatched = set()
    for i, char in enumerate(s):
        if char == '[':
            pending_open.append(i)
        elif char == ']':
            if pending_open:
                # Closes the most recent still-open bracket
                pending_open.pop()
            else:
                # Nothing is open, so this ']' has no partner
                unmatched.add(i)
    # Whatever is still open at the end of the string was never closed
    unmatched.update(pending_open)
    # A single scan is sufficient: once the unmatched brackets are removed the
    # remaining ones are balanced by construction, so no re-scan is needed
    if not unmatched:
        return s
    return ''.join(char for i, char in enumerate(s) if i not in unmatched)


def reindex(df_new, df_old, out_col, ind_col, inp_col):
"""Returns columns values in order of new dataframe rows\n
| Arguments:
Expand Down
4 changes: 3 additions & 1 deletion glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import re
from collections import defaultdict

from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second
from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets
from glycowork.motif.graph import subgraph_isomorphism, generate_graph_features, glycan_to_nxGraph, graph_to_string, ensure_graph
from glycowork.motif.processing import IUPAC_to_SMILES, get_lib, find_isomorphs, expand_lib, rescue_glycans

Expand Down Expand Up @@ -304,6 +304,8 @@ def get_k_saccharides(glycans, size = 2, libr = None, up_to = False, just_motifs
if col_sums[col1] == col_sums[col2] and col_subs[col1] == col2:
drop_columns.append(col2)
out_matrix = out_matrix.drop(drop_columns, axis = 1)
if size > 3:
out_matrix.columns = [remove_unmatched_brackets(g) for g in out_matrix.columns]
if up_to:
combined_df= pd.concat([wga_letter, out_matrix], axis = 1).fillna(0).astype(int)
if just_motifs:
Expand Down

0 comments on commit fa3db89

Please sign in to comment.