Skip to content

Commit

Permalink
plot styling and efficiency tweaks
Browse files: browse the repository at this point in the history
- plot styling in get_volcano
- plot saving in get_roc
- fix internal_LacNAc_type1 in motif_list
- minor efficiency tweaks
  • Loading branch information
Bribak committed May 14, 2024
1 parent 2955570 commit 3319db9
Show file tree
Hide file tree
Showing 9 changed files with 6,287 additions and 6,293 deletions.
12,498 changes: 6,249 additions & 6,249 deletions 05_examples.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion build/lib/glycowork/glycan_data/glycan_motifs.csv
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ i_antigen,Gal(b1-4)GlcNAc(b1-3)Gal,"['flexible', 'flexible', 'flexible']"
PI_antigen,Gal(a1-4)Gal(a1-4)GlcNAc,"['flexible', 'flexible', 'flexible']"
Chitobiose,GlcNAc(b1-4)GlcNAc,"['flexible', 'flexible']"
Trimannosylcore,Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc,"['flexible', 'flexible', 'flexible', 'flexible', 'flexible']"
Internal_LacNAc_type1,Gal(b1-3)GlcNAc(b1-3)Gal,"['internal', 'flexible', 'flexible']"
Internal_LacNAc_type1,Gal(b1-3)GlcNAc,"['internal', 'flexible']"
Terminal_LacNAc_type1,Gal(b1-3)GlcNAc,"['terminal', 'flexible']"
Internal_LacNAc_type2,Gal(b1-4)GlcNAc,"['internal', 'flexible']"
Terminal_LacNAc_type2,Gal(b1-4)GlcNAc,"['terminal', 'flexible']"
Expand Down
19 changes: 13 additions & 6 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ def get_differential_expression(df, group1, group2,
gp1, gp2 = df_a.loc[cluster, :], df_b.loc[cluster, :]
mean_abundance_c.append(mean_abundance.loc[cluster].mean())
log2fc.append(((gp2.values - gp1.values).mean(axis = 1)).mean() if paired else (gp2.mean(axis = 1) - gp1.mean(axis = 1)).mean())
gp1, gp2 = df2.loc[cluster, group1], df2.loc[cluster, group2]
gp1, gp2 = df.loc[cluster, group1], df.loc[cluster, group2]
# Hotelling's T^2 test for multivariate comparisons
pvals.append(hotellings_t2(gp1.T.values, gp2.T.values, paired = paired)[1])
levene_pvals.append(np.mean([levene(gp1.loc[variable, :], gp2.loc[variable, :])[1] for variable in cluster]))
Expand Down Expand Up @@ -694,7 +694,7 @@ def get_ma(df_res, log2fc_thresh = 1, sig_thresh = 0.05, filepath = ''):


def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed = True,
x_metric = 'Log2FC', annotate_volcano = False, filepath = ''):
x_metric = 'Log2FC', annotate_volcano = False, filepath = '', **kwargs):
"""Plots glycan differential expression results in a volcano plot\n
| Arguments:
| :-
Expand All @@ -705,7 +705,8 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
| label_changed (bool): if True, add text labels to significantly up- and downregulated datapoints; default:True
| x_metric (string): x-axis metric; default:'Log2FC'; options are 'Log2FC', 'Effect size'
| annotate_volcano (bool): whether to annotate the dots in the plot with SNFG images; default: False
| filepath (string): absolute path including full filename allows for saving the plot\n
| filepath (string): absolute path including full filename allows for saving the plot
| **kwargs: keyword arguments that are directly passed on to seaborn scatterplot\n
| Returns:
| :-
| Prints volcano plot
Expand All @@ -722,7 +723,8 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
else:
print(f"You're working with a default alpha of 0.05. Set sample size (n = ...) for Bayesian-Adaptive Alpha Adjustment")
# Make plot
ax = sns.scatterplot(x = x_metric, y = 'log_p', data = df_res, color = '#3E3E3E', alpha = 0.8)
color = kwargs.pop('color', '#3E3E3E')
ax = sns.scatterplot(x = x_metric, y = 'log_p', data = df_res, color = color, alpha = 0.8, **kwargs)
ax.set(xlabel = x_metric, ylabel = '-log10(corr p-val)', title = '')
plt.axhline(y = -np.log10(y_thresh), c = 'k', ls = ':', lw = 0.5, alpha = 0.3)
plt.axvline(x = x_thresh, c = 'k', ls = ':', lw = 0.5, alpha = 0.3)
Expand Down Expand Up @@ -1245,7 +1247,7 @@ def get_SparCC(df1, df2, motifs = False, feature_set = ["known", "exhaustive"],


def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["known", "exhaustive"], paired = False, impute = True,
min_samples = 0.1, custom_motifs = [], transform = None, gamma = 0.1, custom_scale = 0):
min_samples = 0.1, custom_motifs = [], transform = None, gamma = 0.1, custom_scale = 0, filepath = ''):
"""Calculates ROC AUC for every feature and, optionally, plots the best\n
| Arguments:
| :-
Expand All @@ -1264,7 +1266,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
| transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:will be inferred
| gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)
| filepath (string): absolute path including full filename allows for saving the plot, if plot=True\n
| Returns:
| :-
| Returns a sorted list of tuples of type (glycan, AUC score) and, optionally, ROC curve for best feature
Expand Down Expand Up @@ -1324,6 +1327,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve for {best}')
plt.legend(loc = 'lower right')
if filepath:
plt.savefig(filepath, format = filepath.split('.')[-1], dpi = 300, bbox_inches = 'tight')
plt.show()
else: # multi-group comparison
df = df.groupby(df.index).mean()
Expand Down Expand Up @@ -1360,6 +1365,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
plt.ylabel('True Positive Rate')
plt.title(f'Best Feature ROC for {classy}: {best_feature}')
plt.legend(loc = "lower right")
if filepath:
plt.savefig(filepath.split('.')[0] + "_" + str(classy) + filepath.split('.')[-1], format = filepath.split('.')[-1], dpi = 300, bbox_inches = 'tight')
return sorted_auc_scores


Expand Down
10 changes: 2 additions & 8 deletions build/lib/glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,9 @@ def annotate_glycan(glycan, motifs = None, termini_list = [], gmotifs = None):
termini = 'provided' if termini_list else 'ignore'
gmotifs = [glycan_to_nxGraph(g, termini = termini, termini_list = termini_list[i]) for i, g in enumerate(motifs.motif)]
# Count the number of times each motif occurs in a glycan
if termini_list:
ggraph = ensure_graph(glycan, termini = 'calc')
res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list[k],
ggraph = ensure_graph(glycan, termini = 'calc' if termini_list else 'ignore')
res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list[k] if termini_list else termini_list,
count = True) for k in range(len(motifs))]*1
else:
ggraph = ensure_graph(glycan, termini = 'ignore')
res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list,
count = True) for k in range(len(motifs))]*1

out = pd.DataFrame(columns = motifs.motif_name if isinstance(motifs, pd.DataFrame) else motifs)
out.loc[0] = res
out.loc[0] = out.loc[0].astype('int')
Expand Down
10 changes: 3 additions & 7 deletions build/lib/glycowork/motif/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def glycan_to_nxGraph_int(glycan, libr = None,
g1 = nx.from_numpy_array(adj_matrix) if len(node_dict) > 1 else nx.Graph()
if len(node_dict) > 1:
# Needed for compatibility with monosaccharide-only graphs (size = 1)
for n1, n2, d in g1.edges(data = True):
for _, _, d in g1.edges(data = True):
del d['weight']
else:
g1.add_node(0)
Expand Down Expand Up @@ -539,9 +539,5 @@ def possible_topology_check(glycan, glycans, exhaustive = False, **kwargs):
| Returns list of glycans that could match input glycan
"""
topologies = get_possible_topologies(glycan, exhaustive = exhaustive)
out_glycs = []
for g in glycans:
ggraph = ensure_graph(g)
if any([compare_glycans(t, ggraph, **kwargs) for t in topologies]):
out_glycs.append(g)
return out_glycs
ggraphs = map(ensure_graph, glycans)
return [g for g, ggraph in zip(glycans, ggraphs) if any(compare_glycans(t, ggraph, **kwargs) for t in topologies)]
2 changes: 1 addition & 1 deletion glycowork/glycan_data/glycan_motifs.csv
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ i_antigen,Gal(b1-4)GlcNAc(b1-3)Gal,"['flexible', 'flexible', 'flexible']"
PI_antigen,Gal(a1-4)Gal(a1-4)GlcNAc,"['flexible', 'flexible', 'flexible']"
Chitobiose,GlcNAc(b1-4)GlcNAc,"['flexible', 'flexible']"
Trimannosylcore,Man(a1-3)[Man(a1-6)]Man(b1-4)GlcNAc(b1-4)GlcNAc,"['flexible', 'flexible', 'flexible', 'flexible', 'flexible']"
Internal_LacNAc_type1,Gal(b1-3)GlcNAc(b1-3)Gal,"['internal', 'flexible', 'flexible']"
Internal_LacNAc_type1,Gal(b1-3)GlcNAc,"['internal', 'flexible']"
Terminal_LacNAc_type1,Gal(b1-3)GlcNAc,"['terminal', 'flexible']"
Internal_LacNAc_type2,Gal(b1-4)GlcNAc,"['internal', 'flexible']"
Terminal_LacNAc_type2,Gal(b1-4)GlcNAc,"['terminal', 'flexible']"
Expand Down
19 changes: 13 additions & 6 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,7 @@ def get_differential_expression(df, group1, group2,
gp1, gp2 = df_a.loc[cluster, :], df_b.loc[cluster, :]
mean_abundance_c.append(mean_abundance.loc[cluster].mean())
log2fc.append(((gp2.values - gp1.values).mean(axis = 1)).mean() if paired else (gp2.mean(axis = 1) - gp1.mean(axis = 1)).mean())
gp1, gp2 = df2.loc[cluster, group1], df2.loc[cluster, group2]
gp1, gp2 = df.loc[cluster, group1], df.loc[cluster, group2]
# Hotelling's T^2 test for multivariate comparisons
pvals.append(hotellings_t2(gp1.T.values, gp2.T.values, paired = paired)[1])
levene_pvals.append(np.mean([levene(gp1.loc[variable, :], gp2.loc[variable, :])[1] for variable in cluster]))
Expand Down Expand Up @@ -694,7 +694,7 @@ def get_ma(df_res, log2fc_thresh = 1, sig_thresh = 0.05, filepath = ''):


def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed = True,
x_metric = 'Log2FC', annotate_volcano = False, filepath = ''):
x_metric = 'Log2FC', annotate_volcano = False, filepath = '', **kwargs):
"""Plots glycan differential expression results in a volcano plot\n
| Arguments:
| :-
Expand All @@ -705,7 +705,8 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
| label_changed (bool): if True, add text labels to significantly up- and downregulated datapoints; default:True
| x_metric (string): x-axis metric; default:'Log2FC'; options are 'Log2FC', 'Effect size'
| annotate_volcano (bool): whether to annotate the dots in the plot with SNFG images; default: False
| filepath (string): absolute path including full filename allows for saving the plot\n
| filepath (string): absolute path including full filename allows for saving the plot
| **kwargs: keyword arguments that are directly passed on to seaborn scatterplot\n
| Returns:
| :-
| Prints volcano plot
Expand All @@ -722,7 +723,8 @@ def get_volcano(df_res, y_thresh = 0.05, x_thresh = 0, n = None, label_changed =
else:
print(f"You're working with a default alpha of 0.05. Set sample size (n = ...) for Bayesian-Adaptive Alpha Adjustment")
# Make plot
ax = sns.scatterplot(x = x_metric, y = 'log_p', data = df_res, color = '#3E3E3E', alpha = 0.8)
color = kwargs.pop('color', '#3E3E3E')
ax = sns.scatterplot(x = x_metric, y = 'log_p', data = df_res, color = color, alpha = 0.8, **kwargs)
ax.set(xlabel = x_metric, ylabel = '-log10(corr p-val)', title = '')
plt.axhline(y = -np.log10(y_thresh), c = 'k', ls = ':', lw = 0.5, alpha = 0.3)
plt.axvline(x = x_thresh, c = 'k', ls = ':', lw = 0.5, alpha = 0.3)
Expand Down Expand Up @@ -1245,7 +1247,7 @@ def get_SparCC(df1, df2, motifs = False, feature_set = ["known", "exhaustive"],


def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["known", "exhaustive"], paired = False, impute = True,
min_samples = 0.1, custom_motifs = [], transform = None, gamma = 0.1, custom_scale = 0):
min_samples = 0.1, custom_motifs = [], transform = None, gamma = 0.1, custom_scale = 0, filepath = ''):
"""Calculates ROC AUC for every feature and, optionally, plots the best\n
| Arguments:
| :-
Expand All @@ -1264,7 +1266,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
| custom_motifs (list): list of glycan motifs, used if feature_set includes 'custom'; default:empty
| transform (str): transformation to escape Aitchison space; options are CLR and ALR (use ALR if you have many glycans (>100) with low values); default:will be inferred
| gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
| custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)
| filepath (string): absolute path including full filename allows for saving the plot, if plot=True\n
| Returns:
| :-
| Returns a sorted list of tuples of type (glycan, AUC score) and, optionally, ROC curve for best feature
Expand Down Expand Up @@ -1324,6 +1327,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
plt.ylabel('True Positive Rate')
plt.title(f'ROC Curve for {best}')
plt.legend(loc = 'lower right')
if filepath:
plt.savefig(filepath, format = filepath.split('.')[-1], dpi = 300, bbox_inches = 'tight')
plt.show()
else: # multi-group comparison
df = df.groupby(df.index).mean()
Expand Down Expand Up @@ -1360,6 +1365,8 @@ def get_roc(df, group1, group2, plot = False, motifs = False, feature_set = ["kn
plt.ylabel('True Positive Rate')
plt.title(f'Best Feature ROC for {classy}: {best_feature}')
plt.legend(loc = "lower right")
if filepath:
plt.savefig(filepath.split('.')[0] + "_" + str(classy) + filepath.split('.')[-1], format = filepath.split('.')[-1], dpi = 300, bbox_inches = 'tight')
return sorted_auc_scores


Expand Down
10 changes: 2 additions & 8 deletions glycowork/motif/annotate.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,9 @@ def annotate_glycan(glycan, motifs = None, termini_list = [], gmotifs = None):
termini = 'provided' if termini_list else 'ignore'
gmotifs = [glycan_to_nxGraph(g, termini = termini, termini_list = termini_list[i]) for i, g in enumerate(motifs.motif)]
# Count the number of times each motif occurs in a glycan
if termini_list:
ggraph = ensure_graph(glycan, termini = 'calc')
res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list[k],
ggraph = ensure_graph(glycan, termini = 'calc' if termini_list else 'ignore')
res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list[k] if termini_list else termini_list,
count = True) for k in range(len(motifs))]*1
else:
ggraph = ensure_graph(glycan, termini = 'ignore')
res = [subgraph_isomorphism(ggraph, gmotifs[k], termini_list = termini_list,
count = True) for k in range(len(motifs))]*1

out = pd.DataFrame(columns = motifs.motif_name if isinstance(motifs, pd.DataFrame) else motifs)
out.loc[0] = res
out.loc[0] = out.loc[0].astype('int')
Expand Down
10 changes: 3 additions & 7 deletions glycowork/motif/graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@ def glycan_to_nxGraph_int(glycan, libr = None,
g1 = nx.from_numpy_array(adj_matrix) if len(node_dict) > 1 else nx.Graph()
if len(node_dict) > 1:
# Needed for compatibility with monosaccharide-only graphs (size = 1)
for n1, n2, d in g1.edges(data = True):
for _, _, d in g1.edges(data = True):
del d['weight']
else:
g1.add_node(0)
Expand Down Expand Up @@ -539,9 +539,5 @@ def possible_topology_check(glycan, glycans, exhaustive = False, **kwargs):
| Returns list of glycans that could match input glycan
"""
topologies = get_possible_topologies(glycan, exhaustive = exhaustive)
out_glycs = []
for g in glycans:
ggraph = ensure_graph(g)
if any([compare_glycans(t, ggraph, **kwargs) for t in topologies]):
out_glycs.append(g)
return out_glycs
ggraphs = map(ensure_graph, glycans)
return [g for g, ggraph in zip(glycans, ggraphs) if any(compare_glycans(t, ggraph, **kwargs) for t in topologies)]

0 comments on commit 3319db9

Please sign in to comment.