Skip to content

Commit

Permalink
add get_glycoshift_per_site to analyze differential glycosylation in …
Browse files Browse the repository at this point in the history
…glycoproteomics
  • Loading branch information
Bribak committed Jun 17, 2024
1 parent f60c6d4 commit 6919fa5
Show file tree
Hide file tree
Showing 8 changed files with 6,275 additions and 5,451 deletions.
10,501 changes: 5,537 additions & 4,964 deletions 03_motif.ipynb

Large diffs are not rendered by default.

48 changes: 46 additions & 2 deletions build/lib/glycowork/motif/analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@
sequence_richness, shannon_diversity_index, simpson_diversity_index,
get_equivalence_test, clr_transformation, anosim, permanova_with_permutation,
alpha_biodiversity_stats, get_additive_logratio_transformation, correct_multiple_testing,
omega_squared, get_glycoform_diff)
from glycowork.motif.processing import enforce_class
omega_squared, get_glycoform_diff, process_glm_results)
from glycowork.motif.processing import enforce_class, process_for_glycoshift
from glycowork.motif.annotate import (annotate_dataset, quantify_motifs, link_find, create_correlation_network,
group_glycans_core, group_glycans_sia_fuc, group_glycans_N_glycan_type, load_lectin_lib,
create_lectin_and_motif_mappings, lectin_motif_scoring, clean_up_heatmap)
Expand Down Expand Up @@ -1451,3 +1451,47 @@ def get_lectin_array(df, group1, group2, paired = False, transform = ''):
if not group2:
df_out["change"] = ["different"] * len(df_out)
return df_out


def get_glycoshift_per_site(df, group1, group2, paired = False, impute = True,
                            min_samples = 0.2, gamma = 0.1, custom_scale = 0):
  """Calculates differential glycosylation per glycosite from glycoproteomics data\n
  | Arguments:
  | :-
  | df (dataframe): glycoproteomics dataset with identifiers formatted as protein_site_composition and relative abundances in the sample columns [alternative: filepath to .csv or .xlsx]
  | group1 (list): list of column indices or names for the first group of samples, usually the control
  | group2 (list): list of column indices or names for the second group of samples
  | paired (bool): whether samples are paired or not (e.g., tumor & tumor-adjacent tissue from same patient); default:False
  | impute (bool): replaces zeroes with a Random Forest based model; default:True
  | min_samples (float): Percent of the samples that need to have non-zero values for glycan to be kept; default: 20%
  | gamma (float): uncertainty parameter to estimate scale uncertainty for CLR transformation; default: 0.1
  | custom_scale (float or dict): Ratio of total signal in group2/group1 for an informed scale model (or group_idx: mean(group)/min(mean(groups)) signal dict for multivariate)\n
  | Returns:
  | :-
  | Returns a dataframe with:
  | (for each condition/interaction feature)
  | (i) Regression coefficient from the GLM (indicating direction of change in the treatment condition)
  | (ii) Corrected p-values (two-tailed t-test with two-stage Benjamini-Hochberg correction) for testing the coefficient against zero
  | (iii) Significance: True/False of whether the corrected p-value lies below the sample size-appropriate significance threshold
  """
  # The second return value (the untransformed dataframe) is not needed here
  df, _, group1, group2 = preprocess_data(df, group1, group2, experiment = "diff", motifs = False, impute = impute,
                                          min_samples = min_samples, transform = "Nothing", paired = paired)
  alpha = get_alphaN(len(group1 + group2))
  df, glycan_features = process_for_glycoshift(df) # potentially expand this further to infer and label high-Man/Hybrid/complex
  # Set the annotation columns aside; only abundances go through normalisation and CLR
  necessary_columns = ['Glycoform'] + glycan_features
  preserved_data = df[necessary_columns]
  df = df.drop(necessary_columns, axis = 1)
  # Re-normalise every sample column to percent of its total signal
  df = df.set_index('Glycosite')
  df = df.div(df.sum(axis = 0), axis = 1) * 100
  df = df.reset_index()
  # groupby emits groups in sorted key order, so the concatenated results below are ordered
  # by glycosite rather than by original row position. Stable-sort the rows up front and apply
  # the same permutation to the annotations so the column-wise concat stays row-aligned.
  df = df.sort_values('Glycosite', kind = 'mergesort')
  preserved_data = preserved_data.iloc[list(df.index)].reset_index(drop = True)
  # CLR-transform the abundances separately within each glycosite
  # (assumes clr_transformation returns one row per input row, in input order — TODO confirm)
  results = [
      clr_transformation(group_df[group1 + group2], group1, group2, gamma = gamma, custom_scale = custom_scale)
      .assign(Glycosite = glycosite)
      for glycosite, group_df in df.groupby('Glycosite')
  ]
  df = pd.concat(results, ignore_index = True)
  df = pd.concat([df, preserved_data], axis = 1)
  # Long format: one row per (glycoform, sample) pair for the regression
  df_long = pd.melt(df, id_vars = ['Glycosite', 'Glycoform'] + glycan_features,
                    var_name = 'Sample', value_name = 'Abundance')
  # Condition is 0 for group1 samples and 1 for everything else
  df_long['Condition'] = df_long['Sample'].apply(lambda x: 0 if x in group1 else 1)
  return process_glm_results(df_long, alpha, glycan_features)
53 changes: 52 additions & 1 deletion build/lib/glycowork/motif/processing.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import numpy as np
import pandas as pd
import copy
import re
from functools import wraps
Expand Down Expand Up @@ -309,7 +310,7 @@ def canonicalize_composition(comp):
"""converts a composition from any common format into the dictionary that is optimized for glycowork\n
| Arguments:
| :-
| comp (string): composition formatted either in the style of HexNAc2Hex1Fuc3Neu5Ac1 or N2H1F3A1\n
| comp (string): composition formatted either in the style of Hex5HexNAc4Fuc1Neu5Ac2 or H5N4F1A2\n
| Returns:
| :-
| Returns composition as a dictionary of style monosaccharide : count
Expand All @@ -321,6 +322,13 @@ def canonicalize_composition(comp):
elif comp.isdigit():
temp = {"Hex": int(comp[0]), "HexNAc": int(comp[1]), "Neu5Ac": int(comp[2]), "dHex": int(comp[3])}
return {k: v for k, v in temp.items() if v}
elif comp[0].isdigit():
comp = comp.replace(' ', '')
if len(comp) < 5:
temp = {"Hex": int(comp[0]), "HexNAc": int(comp[1]), "Neu5Ac": int(comp[2]), "dHex": int(comp[3])}
else:
temp = {"Hex": int(comp[0]), "HexNAc": int(comp[1]), "Neu5Ac": int(comp[2]), "Neu5Gc": int(comp[3]), "dHex": int(comp[4])}
return {k: v for k, v in temp.items() if v}
comp_dict = {}
i = 0
replace_dic = {"Neu5Ac": "NeuAc", "Neu5Gc": "NeuGc", '(': '', ')': '', ' ': '', '+': ''}
Expand Down Expand Up @@ -959,3 +967,46 @@ def equal_repeats(r1, r2):
"""
r1_long = r1[:r1.rindex(')')+1] * 2
return any(r1_long[i:i + len(r2)] == r2 for i in range(len(r1)))


@rescue_compositions
def parse_glycoform(glycoform, glycan_features = ('H', 'N', 'A', 'F', 'G')):
  """converts composition of style H5N4F1A2 into monosaccharide counts\n
  | Arguments:
  | :-
  | glycoform (string or dict): composition in the style of H5N4F1A2, or a dictionary of style monosaccharide : count
  | glycan_features (iterable): monosaccharide keys to report, each initialized to 0; default:('H', 'N', 'A', 'F', 'G')\n
  | Returns:
  | :-
  | Returns composition as a dictionary of style monosaccharide : count
  """
  # NOTE(review): long-form names such as Hex5HexNAc4Fuc1Neu5Ac2 are not matched by the regex
  # below; presumably the rescue_compositions decorator handles them — confirm.
  if isinstance(glycoform, dict):
    # Already parsed: restrict to the requested features, filling gaps with 0
    return {k: glycoform.get(k, 0) for k in glycan_features}
  components = dict.fromkeys(glycan_features, 0)
  # Only the single-letter codes H/N/A/F/G are recognised, irrespective of glycan_features
  for match in re.finditer(r'([HNAFG])(\d+)', glycoform):
    components[match.group(1)] = int(match.group(2))
  return components


def process_for_glycoshift(df):
  """extracts and formats compositions in glycoproteomics dataset\n
  | Arguments:
  | :-
  | df (dataframe): glycoproteomics dataset, expects index to be formatted as protein_site_composition\n
  | Returns:
  | :-
  | (i) glycoproteomics dataset with new columns for protein_site, composition, and composition counts
  | (ii) list of identified glycan features, such as different monosaccharides
  """
  # Work on a copy so the caller's dataframe does not silently gain columns
  df = df.copy()
  id_parts = [k.split('_') for k in df.index]
  # NOTE(review): the site is taken from the third '_'-separated field (index 2), which does not
  # match a plain protein_site_composition layout — confirm against the real identifier format.
  df['Glycosite'] = [parts[0] + '_' + parts[2] for parts in id_parts]
  if '[' in df.index[0]:
    # Bracketed identifiers: read the integers after the first '[' positionally
    comps = ['[' + k.split('[')[1] for k in df.index]
    comps = [list(map(int, re.findall(r'\d+', s))) for s in comps]
    # Third number is emitted as A and fourth as F, as in the f-string below
    df['Glycoform'] = [f'H{c[0]}N{c[1]}F{c[3]}A{c[2]}' for c in comps]
    glycan_features = ['H', 'N', 'A', 'F', 'G']
  else:
    # Otherwise the last '_'-separated field is the composition
    df['Glycoform'] = [canonicalize_composition(parts[-1]) for parts in id_parts]
    # Sorted so the order of the count columns does not depend on set iteration order
    glycan_features = sorted(set(unwrap([list(c.keys()) for c in df.Glycoform])))
  org_cols = df.columns.tolist()
  # Expand each glycoform into one count column per glycan feature
  df = df.join(df['Glycoform'].apply(parse_glycoform, glycan_features = glycan_features).apply(pd.Series))
  return df, [c for c in df.columns if c not in org_cols]
Loading

0 comments on commit 6919fa5

Please sign in to comment.