Skip to content

Commit

Permalink
nomenclature and homogenization
Browse files Browse the repository at this point in the history
- rename the "mode" (string) argument of get_heatmap to the boolean "motifs"
- raise a ValueError in get_heatmap if 'custom' is the only feature set and fewer than two custom motifs are given
- add motif cleaning to get_jtk
- add handling of the 'b1' type of linkage nomenclature to canonicalize_iupac
- add rescue_glycans decorator to construct_network
  • Loading branch information
Bribak committed Mar 8, 2024
1 parent 7da8ceb commit 7cb51f8
Show file tree
Hide file tree
Showing 8 changed files with 11,992 additions and 11,989 deletions.
4,566 changes: 2,283 additions & 2,283 deletions 00_core.ipynb

Large diffs are not rendered by default.

7,006 changes: 3,497 additions & 3,509 deletions 03_motif.ipynb

Large diffs are not rendered by default.

12,372 changes: 6,186 additions & 6,186 deletions 05_examples.ipynb

Large diffs are not rendered by default.

13 changes: 8 additions & 5 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,14 +151,14 @@ def clean_up_heatmap(df):
return result


def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
def get_heatmap(df, motifs = False, feature_set = ['known'],
datatype = 'response', rarity_filter = 0.05, filepath = '', index_col = 'glycan',
custom_motifs = [], **kwargs):
"""clusters samples based on glycan data (for instance glycan binding etc.)\n
| Arguments:
| :-
| df (dataframe): dataframe with glycan data, rows are samples and columns are glycans [alternative: filepath to .csv or .xlsx]
| mode (string): whether glycan 'sequence' or 'motif' should be used for clustering; default:sequence
| motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
| 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
Expand All @@ -178,7 +178,9 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
if index_col in df.columns:
df.set_index(index_col, inplace = True)
df.fillna(0, inplace = True)
if mode == 'motif':
if motifs:
if 'custom' in feature_set and len(feature_set) == 1 and len(custom_motifs) < 2:
raise ValueError("A heatmap needs to have at least two motifs.")
# Count glycan motifs and remove rare motifs from the result
df_motif = annotate_dataset(df.columns.tolist(), feature_set = feature_set, condense = True, custom_motifs = custom_motifs)
df_motif = df_motif.replace(0, np.nan).dropna(thresh = np.max([np.round(rarity_filter * df_motif.shape[0]), 1]), axis = 1)
Expand All @@ -202,7 +204,7 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
# Cluster the motif abundances
sns.clustermap(df, **kwargs)
plt.xlabel('Samples')
plt.ylabel('Glycans' if mode == 'sequence' else 'Motifs')
plt.ylabel('Glycans' if not motifs else 'Motifs')
plt.tight_layout()
if filepath:
plt.savefig(filepath, format = filepath.split('.')[-1], dpi = 300,
Expand Down Expand Up @@ -960,7 +962,8 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
df = mf.fit_transform(df)
df.insert(0, 'Molecule_Name', annot)
if motifs:
df = quantify_motifs(df.iloc[:, 1:], df.iloc[:, 0].values.tolist(), feature_set, custom_motifs = custom_motifs).T.reset_index()
df = quantify_motifs(df.iloc[:, 1:], df.iloc[:, 0].values.tolist(), feature_set, custom_motifs = custom_motifs).T
df = clean_up_heatmap(df).reset_index()
res = df.iloc[:, 1:].apply(jtkx, param_dic = param_dic, axis = 1)
JTK_BHQ = pd.DataFrame(multipletests(res[0], method = 'fdr_bh')[1])
Results = pd.concat([df.iloc[:, 0], JTK_BHQ, res], axis = 1)
Expand Down
4 changes: 4 additions & 0 deletions build/lib/glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,10 @@ def canonicalize_iupac(glycan):
glycan = re.sub(r'(a|b|\?)-(\d)', r'\g<1>1-\2', glycan)
# If still no '-' in glycan, assume 'a3' type of linkage denomination
if '-' not in glycan:
# Check whether linkages are recorded as b1 or as a3
if bool(re.search(r"^[^0-9]*1?[^0-9]*$", glycan)):
glycan = re.sub(r'(a|b)(\d)', r'\g<1>\g<2>-?', glycan)
else:
glycan = re.sub(r'(a|b)(\d)', r'\g<1>1-\g<2>', glycan)
# Smudge uncertainty
while '/' in glycan:
Expand Down
13 changes: 8 additions & 5 deletions glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,14 +151,14 @@ def clean_up_heatmap(df):
return result


def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
def get_heatmap(df, motifs = False, feature_set = ['known'],
datatype = 'response', rarity_filter = 0.05, filepath = '', index_col = 'glycan',
custom_motifs = [], **kwargs):
"""clusters samples based on glycan data (for instance glycan binding etc.)\n
| Arguments:
| :-
| df (dataframe): dataframe with glycan data, rows are samples and columns are glycans [alternative: filepath to .csv or .xlsx]
| mode (string): whether glycan 'sequence' or 'motif' should be used for clustering; default:sequence
| motifs (bool): whether to analyze full sequences (False) or motifs (True); default:False
| feature_set (list): which feature set to use for annotations, add more to list to expand; default is 'known'; options are: 'known' (hand-crafted glycan features), \
| 'graph' (structural graph features of glycans), 'exhaustive' (all mono- and disaccharide features), 'terminal' (non-reducing end motifs), \
| 'terminal2' (non-reducing end motifs of size 2), 'terminal3' (non-reducing end motifs of size 3), 'custom' (specify your own motifs in custom_motifs), \
Expand All @@ -178,7 +178,9 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
if index_col in df.columns:
df.set_index(index_col, inplace = True)
df.fillna(0, inplace = True)
if mode == 'motif':
if motifs:
if 'custom' in feature_set and len(feature_set) == 1 and len(custom_motifs) < 2:
raise ValueError("A heatmap needs to have at least two motifs.")
# Count glycan motifs and remove rare motifs from the result
df_motif = annotate_dataset(df.columns.tolist(), feature_set = feature_set, condense = True, custom_motifs = custom_motifs)
df_motif = df_motif.replace(0, np.nan).dropna(thresh = np.max([np.round(rarity_filter * df_motif.shape[0]), 1]), axis = 1)
Expand All @@ -202,7 +204,7 @@ def get_heatmap(df, mode = 'sequence', feature_set = ['known'],
# Cluster the motif abundances
sns.clustermap(df, **kwargs)
plt.xlabel('Samples')
plt.ylabel('Glycans' if mode == 'sequence' else 'Motifs')
plt.ylabel('Glycans' if not motifs else 'Motifs')
plt.tight_layout()
if filepath:
plt.savefig(filepath, format = filepath.split('.')[-1], dpi = 300,
Expand Down Expand Up @@ -960,7 +962,8 @@ def get_jtk(df_in, timepoints, periods, interval, motifs = False, feature_set =
df = mf.fit_transform(df)
df.insert(0, 'Molecule_Name', annot)
if motifs:
df = quantify_motifs(df.iloc[:, 1:], df.iloc[:, 0].values.tolist(), feature_set, custom_motifs = custom_motifs).T.reset_index()
df = quantify_motifs(df.iloc[:, 1:], df.iloc[:, 0].values.tolist(), feature_set, custom_motifs = custom_motifs).T
df = clean_up_heatmap(df).reset_index()
res = df.iloc[:, 1:].apply(jtkx, param_dic = param_dic, axis = 1)
JTK_BHQ = pd.DataFrame(multipletests(res[0], method = 'fdr_bh')[1])
Results = pd.concat([df.iloc[:, 0], JTK_BHQ, res], axis = 1)
Expand Down
4 changes: 4 additions & 0 deletions glycowork/motif/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -862,6 +862,10 @@ def canonicalize_iupac(glycan):
glycan = re.sub(r'(a|b|\?)-(\d)', r'\g<1>1-\2', glycan)
# If still no '-' in glycan, assume 'a3' type of linkage denomination
if '-' not in glycan:
# Check whether linkages are recorded as b1 or as a3
if bool(re.search(r"^[^0-9]*1?[^0-9]*$", glycan)):
glycan = re.sub(r'(a|b)(\d)', r'\g<1>\g<2>-?', glycan)
else:
glycan = re.sub(r'(a|b)(\d)', r'\g<1>1-\g<2>', glycan)
# Smudge uncertainty
while '/' in glycan:
Expand Down
3 changes: 2 additions & 1 deletion glycowork/network/biosynthesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from glycowork.glycan_data.loader import unwrap, linkages
from glycowork.glycan_data.stats import cohen_d
from glycowork.motif.graph import compare_glycans, glycan_to_nxGraph, graph_to_string, subgraph_isomorphism
from glycowork.motif.processing import choose_correct_isoform, get_lib
from glycowork.motif.processing import choose_correct_isoform, get_lib, rescue_glycans
from glycowork.motif.tokenization import get_stem_lib
from glycowork.motif.regex import get_match

Expand Down Expand Up @@ -605,6 +605,7 @@ def infer_roots(glycans):
print("Glycan class not detected; depending on the class, glycans should end in -ol, GalNAc, GlcNAc, or Glc")


@rescue_glycans
def construct_network(glycans, allowed_ptms = allowed_ptms,
edge_type = 'monolink', permitted_roots = None, abundances = []):
"""construct a glycan biosynthetic network\n
Expand Down

0 comments on commit 7cb51f8

Please sign in to comment.