def remove_unmatched_brackets(s):
    """Removes all unmatched brackets from the string s.\n
    | Arguments:
    | :-
    | s (string): glycan string in IUPAC-condensed\n
    | Returns:
    | :-
    | Returns glycan without unmatched brackets; properly paired '[' / ']' and all other characters are kept in their original order
    """
    # Indices of '[' that have not yet found a closing partner
    open_stack = []
    # Indices of every bracket that has to be dropped
    unmatched = set()
    for i, char in enumerate(s):
        if char == '[':
            open_stack.append(i)
        elif char == ']':
            if open_stack:
                # Pairs with the most recent still-open '['
                open_stack.pop()
            else:
                # Closing bracket with nothing open before it
                unmatched.add(i)
    # Whatever is still on the stack never got closed
    unmatched.update(open_stack)
    if not unmatched:
        return s
    # A single pass is sufficient: the pairs found above are non-crossing, so the
    # string that remains after dropping the unmatched brackets is already balanced
    # and re-scanning it could never find anything further to remove
    return ''.join(char for i, char in enumerate(s) if i not in unmatched)
def remove_unmatched_brackets(s):
    """Removes all unmatched brackets from the string s.\n
    | Arguments:
    | :-
    | s (string): glycan string in IUPAC-condensed\n
    | Returns:
    | :-
    | Returns glycan without unmatched brackets; properly paired '[' / ']' and all other characters are kept in their original order
    """
    # Indices of '[' that have not yet found a closing partner
    open_stack = []
    # Indices of every bracket that has to be dropped
    unmatched = set()
    for i, char in enumerate(s):
        if char == '[':
            open_stack.append(i)
        elif char == ']':
            if open_stack:
                # Pairs with the most recent still-open '['
                open_stack.pop()
            else:
                # Closing bracket with nothing open before it
                unmatched.add(i)
    # Whatever is still on the stack never got closed
    unmatched.update(open_stack)
    if not unmatched:
        return s
    # A single pass is sufficient: the pairs found above are non-crossing, so the
    # string that remains after dropping the unmatched brackets is already balanced
    # and re-scanning it could never find anything further to remove
    return ''.join(char for i, char in enumerate(s) if i not in unmatched)
glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second +from glycowork.glycan_data.loader import lib, linkages, motif_list, find_nth, unwrap, replace_every_second, remove_unmatched_brackets from glycowork.motif.graph import subgraph_isomorphism, generate_graph_features, glycan_to_nxGraph, graph_to_string, ensure_graph from glycowork.motif.processing import IUPAC_to_SMILES, get_lib, find_isomorphs, expand_lib, rescue_glycans @@ -304,6 +304,8 @@ def get_k_saccharides(glycans, size = 2, libr = None, up_to = False, just_motifs if col_sums[col1] == col_sums[col2] and col_subs[col1] == col2: drop_columns.append(col2) out_matrix = out_matrix.drop(drop_columns, axis = 1) + if size > 3: + out_matrix.columns = [remove_unmatched_brackets(g) for g in out_matrix.columns] if up_to: combined_df= pd.concat([wga_letter, out_matrix], axis = 1).fillna(0).astype(int) if just_motifs: