nomenclature and homogenization

- change "mode" (string) in get_heatmap to "motifs" (bool) - raise a ValueError in get_heatmap if only the 'custom' feature set is used with fewer than two custom motifs - add motif cleaning (clean_up_heatmap) to get_jtk - add handling of 'b1'-type linkage nomenclature to canonicalize_iupac - add rescue_glycans decorator to construct_network
BojarLab · Mar 8, 2024 · 7cb51f8 · 7cb51f8
1 parent 7da8ceb
commit 7cb51f8
Show file tree

Hide file tree

Showing 8 changed files with 11,992 additions and 11,989 deletions.
diff --git a/00_core.ipynb b/00_core.ipynb
diff --git a/03_motif.ipynb b/03_motif.ipynb
diff --git a/05_examples.ipynb b/05_examples.ipynb
diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
@@ -151,14 +151,14 @@ def clean_up_heatmap(df):
   return result
 
 
-def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
+def get_heatmap(df, motifs = False, feature_set = ['known'],
                  datatype = 'response', rarity_filter = 0.05, filepath = '', index_col = 'glycan',
                 custom_motifs = [], **kwargs):
   """clusters samples based on glycan data (for instance glycan binding etc.)\n
   | Arguments:
   | :-
   | df (dataframe): dataframe with glycan data, rows are samples and columns are glycans [alternative: filepath to .csv or .xlsx]
-  | mode (string): whether glycan 'sequence' or 'motif' should be used for clustering; default:sequence
+  | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
   |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
@@ -178,7 +178,9 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
   if index_col in df.columns:
       df.set_index(index_col, inplace = True)
   df.fillna(0, inplace = True)
-  if mode == 'motif':
+  if motifs:
+      if 'custom' in feature_set and len(feature_set) == 1 and len(custom_motifs) < 2:
+          raise ValueError("A heatmap needs to have at least two motifs.")
       # Count glycan motifs and remove rare motifs from the result
       df_motif = annotate_dataset(df.columns.tolist(), feature_set = feature_set, condense = True, custom_motifs = custom_motifs)
       df_motif = df_motif.replace(0, np.nan).dropna(thresh = np.max([np.round(rarity_filter * df_motif.shape[0]), 1]), axis = 1)
@@ -202,7 +204,7 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
   # Cluster the motif abundances
   sns.clustermap(df, **kwargs)
   plt.xlabel('Samples')
-  plt.ylabel('Glycans' if mode == 'sequence' else 'Motifs')
+  plt.ylabel('Glycans' if not motifs else 'Motifs')
   plt.tight_layout()
   if filepath:
       plt.savefig(filepath, format = filepath.split('.')[-1], dpi = 300,
@@ -960,7 +962,8 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
     df = mf.fit_transform(df)
     df.insert(0, 'Molecule_Name', annot)
     if motifs:
-        df = quantify_motifs(df.iloc[:, 1:], df.iloc[:, 0].values.tolist(), feature_set, custom_motifs = custom_motifs).T.reset_index()
+        df = quantify_motifs(df.iloc[:, 1:], df.iloc[:, 0].values.tolist(), feature_set, custom_motifs = custom_motifs).T
+        df = clean_up_heatmap(df).reset_index()
     res = df.iloc[:, 1:].apply(jtkx, param_dic = param_dic, axis = 1)
     JTK_BHQ = pd.DataFrame(multipletests(res[0], method = 'fdr_bh')[1])
     Results = pd.concat([df.iloc[:, 0], JTK_BHQ, res], axis = 1)

diff --git a/build/lib/glycowork/motif/processing.py b/build/lib/glycowork/motif/processing.py
@@ -862,6 +862,10 @@ def canonicalize_iupac(glycan):
   glycan = re.sub(r'(a|b|\?)-(\d)', r'\g<1>1-\2', glycan)
   # If still no '-' in glycan, assume 'a3' type of linkage denomination
   if '-' not in glycan:
+    # Check whether linkages are recorded as b1 or as a3
+    if bool(re.search(r"^[^0-9]*1?[^0-9]*$", glycan)):
+      glycan = re.sub(r'(a|b)(\d)', r'\g<1>\g<2>-?', glycan)
+    else:
       glycan = re.sub(r'(a|b)(\d)', r'\g<1>1-\g<2>', glycan)
   # Smudge uncertainty
   while '/' in glycan:

diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -151,14 +151,14 @@ def clean_up_heatmap(df):
   return result
 
 
-def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
+def get_heatmap(df, motifs = False, feature_set = ['known'],
                  datatype = 'response', rarity_filter = 0.05, filepath = '', index_col = 'glycan',
                 custom_motifs = [], **kwargs):
   """clusters samples based on glycan data (for instance glycan binding etc.)\n
   | Arguments:
   | :-
   | df (dataframe): dataframe with glycan data, rows are samples and columns are glycans [alternative: filepath to .csv or .xlsx]
-  | mode (string): whether glycan 'sequence' or 'motif' should be used for clustering; default:sequence
+  | motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
   | feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
   |   'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
   |   'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
@@ -178,7 +178,9 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
   if index_col in df.columns:
       df.set_index(index_col, inplace = True)
   df.fillna(0, inplace = True)
-  if mode == 'motif':
+  if motifs:
+      if 'custom' in feature_set and len(feature_set) == 1 and len(custom_motifs) < 2:
+          raise ValueError("A heatmap needs to have at least two motifs.")
       # Count glycan motifs and remove rare motifs from the result
       df_motif = annotate_dataset(df.columns.tolist(), feature_set = feature_set, condense = True, custom_motifs = custom_motifs)
       df_motif = df_motif.replace(0, np.nan).dropna(thresh = np.max([np.round(rarity_filter * df_motif.shape[0]), 1]), axis = 1)
@@ -202,7 +204,7 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
   # Cluster the motif abundances
   sns.clustermap(df, **kwargs)
   plt.xlabel('Samples')
-  plt.ylabel('Glycans' if mode == 'sequence' else 'Motifs')
+  plt.ylabel('Glycans' if not motifs else 'Motifs')
   plt.tight_layout()
   if filepath:
       plt.savefig(filepath, format = filepath.split('.')[-1], dpi = 300,
@@ -960,7 +962,8 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
     df = mf.fit_transform(df)
     df.insert(0, 'Molecule_Name', annot)
     if motifs:
-        df = quantify_motifs(df.iloc[:, 1:], df.iloc[:, 0].values.tolist(), feature_set, custom_motifs = custom_motifs).T.reset_index()
+        df = quantify_motifs(df.iloc[:, 1:], df.iloc[:, 0].values.tolist(), feature_set, custom_motifs = custom_motifs).T
+        df = clean_up_heatmap(df).reset_index()
     res = df.iloc[:, 1:].apply(jtkx, param_dic = param_dic, axis = 1)
     JTK_BHQ = pd.DataFrame(multipletests(res[0], method = 'fdr_bh')[1])
     Results = pd.concat([df.iloc[:, 0], JTK_BHQ, res], axis = 1)

diff --git a/glycowork/motif/processing.py b/glycowork/motif/processing.py
@@ -862,6 +862,10 @@ def canonicalize_iupac(glycan):
   glycan = re.sub(r'(a|b|\?)-(\d)', r'\g<1>1-\2', glycan)
   # If still no '-' in glycan, assume 'a3' type of linkage denomination
   if '-' not in glycan:
+    # Check whether linkages are recorded as b1 or as a3
+    if bool(re.search(r"^[^0-9]*1?[^0-9]*$", glycan)):
+      glycan = re.sub(r'(a|b)(\d)', r'\g<1>\g<2>-?', glycan)
+    else:
       glycan = re.sub(r'(a|b)(\d)', r'\g<1>1-\g<2>', glycan)
   # Smudge uncertainty
   while '/' in glycan:

diff --git a/glycowork/network/biosynthesis.py b/glycowork/network/biosynthesis.py
@@ -14,7 +14,7 @@
 from glycowork.glycan_data.loader import unwrap, linkages
 from glycowork.glycan_data.stats import cohen_d
 from glycowork.motif.graph import compare_glycans, glycan_to_nxGraph, graph_to_string, subgraph_isomorphism
-from glycowork.motif.processing import choose_correct_isoform, get_lib
+from glycowork.motif.processing import choose_correct_isoform, get_lib, rescue_glycans
 from glycowork.motif.tokenization import get_stem_lib
 from glycowork.motif.regex import get_match
 
@@ -605,6 +605,7 @@ def infer_roots(glycans):
     print("Glycan class not detected; depending on the class, glycans should end in -ol, GalNAc, GlcNAc, or Glc")
 
 
+@rescue_glycans
 def construct_network(glycans, allowed_ptms = allowed_ptms,
                       edge_type = 'monolink', permitted_roots = None, abundances = []):
   """construct a glycan biosynthetic network\n