plot styling and efficiency tweaks

- plot styling in get_volcano
- plot saving in get_roc
- fix internal_LacNAc_type1 in motif_list
- minor efficiency tweaks
BojarLab · May 14, 2024 · 3319db9 · 3319db9
1 parent 2955570
commit 3319db9
Show file tree

Hide file tree

Showing 9 changed files with 6,287 additions and 6,293 deletions.
diff --git a/05_examples.ipynb b/05_examples.ipynb
diff --git a/build/lib/glycowork/glycan_data/glycan_motifs.csv b/build/lib/glycowork/glycan_data/glycan_motifs.csv
@@ -41,7 +41,7 @@ i_antigen,Gal(b1-4)GlcNAc(b1-3)Gal,"['flexible', 'flexible', 'flexible']"
 PI_antigen,Gal(a1-4)Gal(a1-4)GlcNAc,"['flexible', 'flexible', 'flexible']"
 Chitobiose,GlcNAc(b1-4)GlcNAc,"['flexible', 'flexible']"
 Trimannosylcore,Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc,"['flexible', 'flexible', 'flexible', 'flexible', 'flexible']"
-Internal_LacNAc_type1,Gal(b1-3)GlcNAc(b1-3)Gal,"['internal', 'flexible', 'flexible']"
+Internal_LacNAc_type1,Gal(b1-3)GlcNAc,"['internal', 'flexible']"
 Terminal_LacNAc_type1,Gal(b1-3)GlcNAc,"['terminal', 'flexible']"
 Internal_LacNAc_type2,Gal(b1-4)GlcNAc,"['internal', 'flexible']"
 Terminal_LacNAc_type2,Gal(b1-4)GlcNAc,"['terminal', 'flexible']"

diff --git a/build/lib/glycowork/motif/analysis.py b/build/lib/glycowork/motif/analysis.py
@@ -601,7 +601,7 @@ def get_differential_expression(df, group1, group2,
         gp1, gp2 = df_a.loc[cluster, :], df_b.loc[cluster, :]
         mean_abundance_c.append(mean_abundance.loc[cluster].mean())
         log2fc.append(((gp2.values - gp1.values).mean(axis = 1)).mean() if paired else (gp2.mean(axis = 1) - gp1.mean(axis = 1)).mean())
-      gp1, gp2 = df2.loc[cluster, group1], df2.loc[cluster, group2]
+      gp1, gp2 = df.loc[cluster, group1], df.loc[cluster, group2]
       # Hotelling's T^2 test for multivariate comparisons
       pvals.append(hotellings_t2(gp1.T.values, gp2.T.values, paired = paired)[1])
       levene_pvals.append(np.mean([levene(gp1.loc[variable, :], gp2.loc[variable, :])[1] for variable in cluster]))
@@ -694,7 +694,7 @@ def get_ma(df_res, log2fc_thresh = 1, sig_thresh = 0.05, filepath = ''):
 
 
 def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed = True,
-                x_metric = 'Log2FC', annotate_volcano = False, filepath = ''):
+                x_metric = 'Log2FC', annotate_volcano = False, filepath = '', **kwargs):
   """Plots glycan differential expression results in a volcano plot\n
   | Arguments:
   | :-
@@ -705,7 +705,8 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
   | label_changed (bool): if True, add text labels to significantly up- and downregulated datapoints; default:True
   | x_metric (string): x-axis metric; default:'Log2FC'; options are 'Log2FC', 'Effect size'
   | annotate_volcano (bool): whether to annotate the dots in the plot with SNFG images; default: False
-  | filepath (string): absolute path including full filename allows for saving the plot\n
+  | filepath (string): absolute path including full filename allows for saving the plot
+  | **kwargs: keyword arguments that are directly passed on to seaborn scatterplot\n
   | Returns:
   | :-
   | Prints volcano plot
@@ -722,7 +723,8 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
   else:
     print(f"You're working with a default alpha of 0.05. Set sample size (n = ...) for Bayesian-Adaptive Alpha Adjustment")
   # Make plot
-  ax = sns.scatterplot(x = x_metric, y = 'log_p', data = df_res, color = '#3E3E3E', alpha = 0.8)
+  color = kwargs.pop('color', '#3E3E3E')
+  ax = sns.scatterplot(x = x_metric, y = 'log_p', data = df_res, color = color, alpha = 0.8, **kwargs)
   ax.set(xlabel = x_metric, ylabel = '-log10(corr p-val)', title = '')
   plt.axhline(y = -np.log10(y_thresh), c = 'k', ls = ':', lw = 0.5, alpha = 0.3)
   plt.axvline(x = x_thresh, c = 'k', ls = ':', lw = 0.5, alpha = 0.3)
@@ -1245,7 +1247,7 @@ def get_SparCC(df1, df2, motifs = False, feature_set = ["known", "exhaustive"],
 
 
 def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["known", "exhaustive"], paired = False, impute = True,
-            min_samples = 0.1, custom_motifs = [], transform = None, gamma = 0.1, custom_scale = 0):
+            min_samples = 0.1, custom_motifs = [], transform = None, gamma = 0.1, custom_scale = 0, filepath = ''):
   """Calculates ROC AUC for every feature and, optionally, plots the best\n
   | Arguments:
   | :-
@@ -1264,7 +1266,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
   | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
   | transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:will be inferred
   | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
-  | custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
+  | custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)
+  | filepath (string): absolute path including full filename allows for saving the plot, if plot=True\n
   | Returns:
   | :-
   | Returns a sorted list of tuples of type (glycan, AUC score) and, optionally, ROC curve for best feature
@@ -1324,6 +1327,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
       plt.ylabel('True Positive Rate')
       plt.title(f'ROC Curve for {best}')
       plt.legend(loc = 'lower right')
+      if filepath:
+        plt.savefig(filepath, format = filepath.split('.')[-1], dpi = 300, bbox_inches = 'tight')
       plt.show()
   else: # multi-group comparison
     df = df.groupby(df.index).mean()
@@ -1360,6 +1365,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
         plt.ylabel('True Positive Rate')
         plt.title(f'Best Feature ROC for {classy}: {best_feature}')
         plt.legend(loc = "lower right")
+        if filepath:
+          plt.savefig(filepath.split('.')[0] + "_" + str(classy) + filepath.split('.')[-1], format = filepath.split('.')[-1], dpi = 300, bbox_inches = 'tight')
   return sorted_auc_scores
 
 

diff --git a/build/lib/glycowork/motif/annotate.py b/build/lib/glycowork/motif/annotate.py
@@ -64,15 +64,9 @@ def annotate_glycan(glycan, motifs = None, termini_list = [], gmotifs = None):
     termini = 'provided' if termini_list else 'ignore'
     gmotifs = [glycan_to_nxGraph(g, termini = termini, termini_list = termini_list[i]) for i, g in enumerate(motifs.motif)]
   # Count the number of times each motif occurs in a glycan
-  if termini_list:
-    ggraph = ensure_graph(glycan, termini = 'calc')
-    res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list[k],
+  ggraph = ensure_graph(glycan, termini = 'calc' if termini_list else 'ignore')
+  res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list[k] if termini_list else termini_list,
                                 count = True) for k in range(len(motifs))]*1
-  else:
-    ggraph = ensure_graph(glycan, termini = 'ignore')
-    res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list,
-                                count = True) for k in range(len(motifs))]*1
-
   out = pd.DataFrame(columns = motifs.motif_name if isinstance(motifs, pd.DataFrame) else motifs)
   out.loc[0] = res
   out.loc[0] = out.loc[0].astype('int')

diff --git a/build/lib/glycowork/motif/graph.py b/build/lib/glycowork/motif/graph.py
@@ -89,7 +89,7 @@ def glycan_to_nxGraph_int(glycan, libr = None,
   g1 = nx.from_numpy_array(adj_matrix) if len(node_dict) > 1 else nx.Graph()
   if len(node_dict) > 1:
     # Needed for compatibility with monosaccharide-only graphs (size = 1)
-    for n1, n2, d in g1.edges(data = True):
+    for _, _, d in g1.edges(data = True):
       del d['weight']
   else:
     g1.add_node(0)
@@ -539,9 +539,5 @@ def possible_topology_check(glycan, glycans, exhaustive = False, **kwargs):
   | Returns list of glycans that could match input glycan
   """
   topologies = get_possible_topologies(glycan, exhaustive = exhaustive)
-  out_glycs = []
-  for g in glycans:
-    ggraph = ensure_graph(g)
-    if any([compare_glycans(t, ggraph, **kwargs) for t in topologies]):
-      out_glycs.append(g)
-  return out_glycs
+  ggraphs = map(ensure_graph, glycans)
+  return [g for g, ggraph in zip(glycans, ggraphs) if any(compare_glycans(t, ggraph, **kwargs) for t in topologies)]
diff --git a/glycowork/glycan_data/glycan_motifs.csv b/glycowork/glycan_data/glycan_motifs.csv
@@ -41,7 +41,7 @@ i_antigen,Gal(b1-4)GlcNAc(b1-3)Gal,"['flexible', 'flexible', 'flexible']"
 PI_antigen,Gal(a1-4)Gal(a1-4)GlcNAc,"['flexible', 'flexible', 'flexible']"
 Chitobiose,GlcNAc(b1-4)GlcNAc,"['flexible', 'flexible']"
 Trimannosylcore,Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc,"['flexible', 'flexible', 'flexible', 'flexible', 'flexible']"
-Internal_LacNAc_type1,Gal(b1-3)GlcNAc(b1-3)Gal,"['internal', 'flexible', 'flexible']"
+Internal_LacNAc_type1,Gal(b1-3)GlcNAc,"['internal', 'flexible']"
 Terminal_LacNAc_type1,Gal(b1-3)GlcNAc,"['terminal', 'flexible']"
 Internal_LacNAc_type2,Gal(b1-4)GlcNAc,"['internal', 'flexible']"
 Terminal_LacNAc_type2,Gal(b1-4)GlcNAc,"['terminal', 'flexible']"

diff --git a/glycowork/motif/analysis.py b/glycowork/motif/analysis.py
@@ -601,7 +601,7 @@ def get_differential_expression(df, group1, group2,
         gp1, gp2 = df_a.loc[cluster, :], df_b.loc[cluster, :]
         mean_abundance_c.append(mean_abundance.loc[cluster].mean())
         log2fc.append(((gp2.values - gp1.values).mean(axis = 1)).mean() if paired else (gp2.mean(axis = 1) - gp1.mean(axis = 1)).mean())
-      gp1, gp2 = df2.loc[cluster, group1], df2.loc[cluster, group2]
+      gp1, gp2 = df.loc[cluster, group1], df.loc[cluster, group2]
       # Hotelling's T^2 test for multivariate comparisons
       pvals.append(hotellings_t2(gp1.T.values, gp2.T.values, paired = paired)[1])
       levene_pvals.append(np.mean([levene(gp1.loc[variable, :], gp2.loc[variable, :])[1] for variable in cluster]))
@@ -694,7 +694,7 @@ def get_ma(df_res, log2fc_thresh = 1, sig_thresh = 0.05, filepath = ''):
 
 
 def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed = True,
-                x_metric = 'Log2FC', annotate_volcano = False, filepath = ''):
+                x_metric = 'Log2FC', annotate_volcano = False, filepath = '', **kwargs):
   """Plots glycan differential expression results in a volcano plot\n
   | Arguments:
   | :-
@@ -705,7 +705,8 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
   | label_changed (bool): if True, add text labels to significantly up- and downregulated datapoints; default:True
   | x_metric (string): x-axis metric; default:'Log2FC'; options are 'Log2FC', 'Effect size'
   | annotate_volcano (bool): whether to annotate the dots in the plot with SNFG images; default: False
-  | filepath (string): absolute path including full filename allows for saving the plot\n
+  | filepath (string): absolute path including full filename allows for saving the plot
+  | **kwargs: keyword arguments that are directly passed on to seaborn scatterplot\n
   | Returns:
   | :-
   | Prints volcano plot
@@ -722,7 +723,8 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
   else:
     print(f"You're working with a default alpha of 0.05. Set sample size (n = ...) for Bayesian-Adaptive Alpha Adjustment")
   # Make plot
-  ax = sns.scatterplot(x = x_metric, y = 'log_p', data = df_res, color = '#3E3E3E', alpha = 0.8)
+  color = kwargs.pop('color', '#3E3E3E')
+  ax = sns.scatterplot(x = x_metric, y = 'log_p', data = df_res, color = color, alpha = 0.8, **kwargs)
   ax.set(xlabel = x_metric, ylabel = '-log10(corr p-val)', title = '')
   plt.axhline(y = -np.log10(y_thresh), c = 'k', ls = ':', lw = 0.5, alpha = 0.3)
   plt.axvline(x = x_thresh, c = 'k', ls = ':', lw = 0.5, alpha = 0.3)
@@ -1245,7 +1247,7 @@ def get_SparCC(df1, df2, motifs = False, feature_set = ["known", "exhaustive"],
 
 
 def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["known", "exhaustive"], paired = False, impute = True,
-            min_samples = 0.1, custom_motifs = [], transform = None, gamma = 0.1, custom_scale = 0):
+            min_samples = 0.1, custom_motifs = [], transform = None, gamma = 0.1, custom_scale = 0, filepath = ''):
   """Calculates ROC AUC for every feature and, optionally, plots the best\n
   | Arguments:
   | :-
@@ -1264,7 +1266,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
   | custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
   | transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:will be inferred
   | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
-  | custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
+  | custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)
+  | filepath (string): absolute path including full filename allows for saving the plot, if plot=True\n
   | Returns:
   | :-
   | Returns a sorted list of tuples of type (glycan, AUC score) and, optionally, ROC curve for best feature
@@ -1324,6 +1327,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
       plt.ylabel('True Positive Rate')
       plt.title(f'ROC Curve for {best}')
       plt.legend(loc = 'lower right')
+      if filepath:
+        plt.savefig(filepath, format = filepath.split('.')[-1], dpi = 300, bbox_inches = 'tight')
       plt.show()
   else: # multi-group comparison
     df = df.groupby(df.index).mean()
@@ -1360,6 +1365,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
         plt.ylabel('True Positive Rate')
         plt.title(f'Best Feature ROC for {classy}: {best_feature}')
         plt.legend(loc = "lower right")
+        if filepath:
+          plt.savefig(filepath.split('.')[0] + "_" + str(classy) + filepath.split('.')[-1], format = filepath.split('.')[-1], dpi = 300, bbox_inches = 'tight')
   return sorted_auc_scores
 
 

diff --git a/glycowork/motif/annotate.py b/glycowork/motif/annotate.py
@@ -64,15 +64,9 @@ def annotate_glycan(glycan, motifs = None, termini_list = [], gmotifs = None):
     termini = 'provided' if termini_list else 'ignore'
     gmotifs = [glycan_to_nxGraph(g, termini = termini, termini_list = termini_list[i]) for i, g in enumerate(motifs.motif)]
   # Count the number of times each motif occurs in a glycan
-  if termini_list:
-    ggraph = ensure_graph(glycan, termini = 'calc')
-    res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list[k],
+  ggraph = ensure_graph(glycan, termini = 'calc' if termini_list else 'ignore')
+  res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list[k] if termini_list else termini_list,
                                 count = True) for k in range(len(motifs))]*1
-  else:
-    ggraph = ensure_graph(glycan, termini = 'ignore')
-    res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list,
-                                count = True) for k in range(len(motifs))]*1
-
   out = pd.DataFrame(columns = motifs.motif_name if isinstance(motifs, pd.DataFrame) else motifs)
   out.loc[0] = res
   out.loc[0] = out.loc[0].astype('int')

diff --git a/glycowork/motif/graph.py b/glycowork/motif/graph.py
@@ -89,7 +89,7 @@ def glycan_to_nxGraph_int(glycan, libr = None,
   g1 = nx.from_numpy_array(adj_matrix) if len(node_dict) > 1 else nx.Graph()
   if len(node_dict) > 1:
     # Needed for compatibility with monosaccharide-only graphs (size = 1)
-    for n1, n2, d in g1.edges(data = True):
+    for _, _, d in g1.edges(data = True):
       del d['weight']
   else:
     g1.add_node(0)
@@ -539,9 +539,5 @@ def possible_topology_check(glycan, glycans, exhaustive = False, **kwargs):
   | Returns list of glycans that could match input glycan
   """
   topologies = get_possible_topologies(glycan, exhaustive = exhaustive)
-  out_glycs = []
-  for g in glycans:
-    ggraph = ensure_graph(g)
-    if any([compare_glycans(t, ggraph, **kwargs) for t in topologies]):
-      out_glycs.append(g)
-  return out_glycs
+  ggraphs = map(ensure_graph, glycans)
+  return [g for g, ggraph in zip(glycans, ggraphs) if any(compare_glycans(t, ggraph, **kwargs) for t in topologies)]